In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the whitespace-delimited data file; column names are supplied explicitly via `names`.
# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which recent
# pandas releases deprecate.

df0=pd.read_csv('auto-mpg.data-original', sep=r'\s+', names=('mpg','cylinders','displacement','horsepower','weight','acceleration','model_year','origin','car_name'))
df0.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df0.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's columns.
df0.describe()

In [None]:
# Column dtypes and non-null counts, as a first look at where values are missing.
df0.info()

In [None]:
# Number of missing values per column, largest first.

(
    df0.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Fill the NaN values of 'mpg' and 'horsepower' with each column's median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: the in-place form operates on an
# intermediate object under pandas Copy-on-Write and can leave df0 unchanged.
# NOTE(review): 'mpg' is the prediction target further down; imputing it with the
# median fabricates labels - consider dropping those rows instead.

df0['mpg']=df0['mpg'].fillna(df0['mpg'].median())
df0['horsepower']=df0['horsepower'].fillna(df0['horsepower'].median())

In [None]:
# Visual check for remaining nulls: heatmap of the boolean null mask of df0
# (a uniform plot means no missing values are left).
sns.heatmap(df0.isnull())

In [None]:
# Encode the 'car_name' column as integer codes so that every column is numeric.
# NOTE(review): the integer codes carry no natural ordering, yet the models below will
# treat them as a numeric feature - confirm this is intended.

from sklearn.preprocessing import LabelEncoder 
le= LabelEncoder()

# Only 'car_name' is encoded; the encoder is fitted on that single column.
df0['car_name'] = le.fit_transform(df0['car_name'])


In [None]:
# Re-inspect dtypes and non-null counts after the fill and the car_name encoding.
df0.info()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df0.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df0.corr(), annot=True, fmt=".2f", ax=ax)
fig.suptitle("Correlation Map", fontsize=18)
plt.show()

In [None]:
# Absolute z-score of every value in df0, computed per column (scipy's default axis=0).
# The following cells use these scores to detect outliers; the data itself is not rescaled here.

from scipy.stats import zscore
z= np.abs(zscore(df0))
z

In [None]:
# Locate every (row, column) position whose absolute z-score exceeds the outlier
# threshold, expressed in standard deviations.

threshold=3
print(np.where(z>threshold))

In [None]:
# Keep only the rows in which every column has an absolute z-score below `threshold`
# (the outlier cut-off defined in the previous cell).
data=df0[(z<threshold).all(axis=1)]
data

Rows with an absolute z-score of 3 or more in any column were treated as outliers: 6 of the original 406 records were removed, and the remaining rows form the new `data` set.

In [None]:
# Skewness of each column of the outlier-filtered data (values far from 0 indicate an asymmetric distribution).
data.skew()

In [None]:
# Build the feature matrix x by dropping the 'mpg' column, which is the target y to predict.
x=data.drop(['mpg'],axis=1)
x

In [None]:
# Standardise the features with StandardScaler; x is replaced by the scaled array that
# fit_transform returns. MinMaxScaler is imported only as an alternative and is not used.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split made
# further down, so test rows influence the scaling statistics - consider fitting on the
# training split only.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

scale = StandardScaler()
# Alternative scaler (disabled): scale = MinMaxScaler()
x=scale.fit_transform(x)

In [None]:
# Setting y: the target is the 'mpg' column of the outlier-filtered data.

y=data['mpg']
y

In [None]:

# Search a range of seeds for the train/test split on which a plain linear regression
# scores best. Each candidate is scored on its held-out split only; scoring on the
# full (x, y) would mostly measure how well the model fits its own training rows.
# NOTE(review): picking the seed by test-set score is itself optimistic - treat the
# winning score as an upper bound rather than an unbiased estimate.
max_score=-np.inf      # R^2 can be negative, so 0 is not a safe starting floor
final_rstate=None      # stays None only if no split could be scored
for r_state in range(42,101):
    train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.25,random_state=r_state)
    lm=LinearRegression()
    lm.fit(train_x,train_y)
    pred=lm.predict(test_x)
    # R^2 on the held-out rows (the same metric lm.score(test_x, test_y) would return).
    score=r2_score(test_y,pred)
    if score>max_score:
        max_score=score
        final_rstate=r_state

print("\n")
print("Max_accuracy_Score corresponding to final_r_state: ",final_rstate," is ",max_score)

In [None]:
# Re-create the train/test split using the seed selected by the search in the previous
# cell (final_rstate) rather than a hard-coded copy of it, so the split always matches
# what the search actually found.

train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=.25,random_state=final_rstate)

In [None]:
# Shape of the training feature matrix.
train_x.shape

In [None]:
# Shape of the training target; its length should match the rows of train_x.
train_y.shape

In [None]:
# Shape of the held-out feature matrix (test_size=.25 of the rows).
test_x.shape

In [None]:
# Shape of the held-out target; its length should match the rows of test_x.
test_y.shape

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# One instance of each candidate regressor. Only the arguments shown are overridden;
# everything else is left at the scikit-learn default.
# NOTE(review): AD and GB are created without random_state, so their results may vary
# between runs - consider seeding them like DT and RF.
KNN=KNeighborsRegressor(n_neighbors=10)
SV=SVR()
LR=LinearRegression()
DT=DecisionTreeRegressor(random_state=10)
LS = Lasso(alpha=0.001)
RD = Ridge(alpha=0.01)
EL = ElasticNet(alpha=0.001)
RF = RandomForestRegressor(n_estimators=200,random_state=92)
AD = AdaBoostRegressor()
GB = GradientBoostingRegressor()

In [None]:
# (label, estimator) pairs to evaluate, in the order they will be reported.
models=[
    ('KNeighborsRegressor',KNN),
    ('SVR',SV),
    ('LinearRegression',LR),
    ('DecisionTreeRegressor',DT),
    ('Lasso',LS),
    ('Ridge',RD),
    ('ElasticNet',EL),
    ('RandomForestRegressor',RF),
    ('AdaBoostRegressor',AD),
    ('GradientBoostingRegressor',GB),
]

In [None]:
# Fit every candidate on the training split and collect its hold-out metrics.
Model=[]   # model labels, in evaluation order
score=[]   # R^2 on the test split, as a percentage
mae=[]     # mean absolute error on the test split
mse=[]     # mean squared error on the test split
rmae=[]    # root mean squared error (list name kept as in the original notebook)

for name,model in models:
    print("--------------",name,"--------------")
    Model.append(name)
    model.fit(train_x,train_y)
    print(model)
    # Predictions of *this* model. The metrics below must be computed from them, not
    # from the `pred` variable left over from the earlier linear-regression seed search.
    pre=model.predict(test_x)

    # Metrics
    m1=mean_absolute_error(test_y,pre)
    print("Mean absolute error",m1)
    mae.append(m1)

    m2=mean_squared_error(test_y,pre)
    print("Mean squared error",m2)
    mse.append(m2)

    # The square root of the MSE is the root mean *squared* error.
    m3=np.sqrt(m2)
    print("Root mean squared error",m3)
    rmae.append(m3)

    #Model performance
    modelscore=model.score(test_x,test_y)
    print("Score: ",modelscore )
    score.append(modelscore*100)

    print("\n")

In [None]:
# Summary table: one row per model with its test-split score (model.score * 100, i.e. R^2 as a percentage).
result=pd.DataFrame({"Model": Model, "Score": score})
result

# From the table above, RandomForestRegressor and GradientBoostingRegressor appear to be the strongest models, with an R^2 score above 86%.

Tuning the two best models (RandomForestRegressor and GradientBoostingRegressor) with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings, shared by both searches.
params={'n_estimators':[100,500],'random_state':[10,100]}

# Grid search over `params` for the random forest, fitted on the training split.
rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params).fit(train_x,train_y)
print("Best parameters for RandomForest: ",rf.best_params_)

# Same search for gradient boosting.
gb=GridSearchCV(estimator=GradientBoostingRegressor(),param_grid=params).fit(train_x,train_y)
print("Best parameters for GradientBoostingRegressor: ",gb.best_params_)


In [None]:
# Rebuild the two finalists from the settings the grid searches selected
# (rf.best_params_ / gb.best_params_) instead of hand-copied values, so this cell
# cannot drift out of sync with the search results.
RF = RandomForestRegressor(**rf.best_params_)
GB = GradientBoostingRegressor(**gb.best_params_)

In [None]:
# Only the two tuned finalists are evaluated from here on.
models=[
    ('RandomForestRegressor',RF),
    ('GradientBoostingRegressor',GB),
]

In [None]:
# Fit the tuned finalists on the training split and collect their hold-out metrics.
Model=[]   # model labels, in evaluation order
score=[]   # R^2 on the test split, as a percentage
mae=[]     # mean absolute error on the test split
mse=[]     # mean squared error on the test split
rmae=[]    # root mean squared error (list name kept as in the original notebook)

for name,model in models:
    print("--------------",name,"--------------")
    Model.append(name)
    model.fit(train_x,train_y)
    print(model)
    # Predictions of *this* model. The metrics below must be computed from them, not
    # from the `pred` variable left over from the earlier linear-regression seed search.
    # After the loop, `pre` holds the predictions of the last model in `models`.
    pre=model.predict(test_x)

    # Metrics
    m1=mean_absolute_error(test_y,pre)
    print("Mean absolute error",m1)
    mae.append(m1)

    m2=mean_squared_error(test_y,pre)
    print("Mean squared error",m2)
    mse.append(m2)

    # The square root of the MSE is the root mean *squared* error.
    m3=np.sqrt(m2)
    print("Root mean squared error",m3)
    rmae.append(m3)

    #Model performance
    modelscore=model.score(test_x,test_y)
    print("Score: ",modelscore )
    score.append(modelscore*100)

    print("\n")

In [None]:
# Summary table for the tuned finalists: test-split score (model.score * 100, i.e. R^2 as a percentage).
result=pd.DataFrame({"Model": Model, "Score": score})
result

# GradientBoostingRegressor is the best-fitting model, with the higher R^2 score of 86.7%.

In [None]:
# Saving the prediction data in a file.
# `pre` holds the test-split predictions of the last model evaluated in the loop above
# (GradientBoostingRegressor, the final entry of `models`). The predictions frame is
# what gets written to disk - not `data`, which is the input dataset.

predictData=pd.DataFrame(pre)
predictData.to_csv('AutoMPG_Predict.csv')
predictData

In [None]:
# Saving the model
# `sklearn.externals.joblib` has been removed from current scikit-learn releases;
# joblib (a scikit-learn dependency) is imported directly, with a fallback for very old
# installations that still bundle it.

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(GB,"GB_AutoMPG.pkl")