In [1]:
# Core data-analysis stack used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): this silences every warning category for the whole session,
# so deprecation warnings from the libraries used below are hidden as well.
from warnings import filterwarnings
filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the car mileage dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='car-mpg.csv')

# Preview the first five rows to check the columns and their raw values.
df.head(n=5)

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
# Remove the car name and model-year columns; neither is used by the cells below.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore' makes
# the cell safe to re-run: a second execution no longer raises KeyError because
# the columns have already been removed.
df = df.drop(columns=['car_name', 'yr'], errors='ignore')

In [4]:
# List the rows whose `hp` entry is not a pure digit string.
# In this dataset those rows carry a '?' placeholder instead of a number.
hp_is_not_digits = df['hp'].str.isdigit().eq(False)
df.loc[hp_is_not_digits]

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,origin,car_type
32,25.0,4,98.0,?,2046,19.0,1,1
126,21.0,6,200.0,?,2875,17.0,1,0
330,40.9,4,85.0,?,1835,17.3,2,1
336,23.6,4,140.0,?,2905,14.3,1,1
354,34.5,4,100.0,?,2320,15.8,2,1
374,23.0,4,151.0,?,3035,20.5,1,1


In [5]:
# Turn the '?' placeholders into NaN and parse `hp` as numbers *before* taking
# any median. Assigning the result back to the column (rather than calling
# `replace(..., inplace=True)` on `df.hp`) guarantees that `df` itself is updated,
# and a numeric column is required for `median()` on current pandas releases.
# Any other non-numeric token still raises, as the final float cast did before.
df['hp'] = pd.to_numeric(df['hp'].replace('?', np.nan))

# Fill every missing value with the median of its own column.
df = df.fillna(df.median(numeric_only=True))

# Keep horsepower as float64 regardless of what the parser inferred.
df['hp'] = df['hp'].astype('float64')
df.tail()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,origin,car_type
393,27.0,4,140.0,86.0,2790,15.6,1,1
394,44.0,4,97.0,52.0,2130,24.6,2,1
395,32.0,4,135.0,84.0,2295,11.6,1,1
396,28.0,4,120.0,79.0,2625,18.6,1,1
397,31.0,4,119.0,82.0,2720,19.4,1,1


In [6]:
# Frequency of each `origin` code, most common first.
df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [7]:
# Target: fuel efficiency. Features: every remaining column of `df`.
y = df['mpg']
X = df.drop(columns=['mpg'])

In [8]:
# Encoding categorical data
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; `ColumnTransformer` is the supported way to encode only
# selected columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positional indices of the columns to encode (`origin` and `car_type`, given
# the column order of X shown in the earlier outputs).
categorical_positions = [5, 6]

# The name `onehotencoder` is kept so that `onehotencoder.transform(...)` still
# accepts the full feature table, as it did before.
onehotencoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), categorical_positions)],
    remainder='passthrough',  # untouched columns are appended after the dummy columns
    sparse_threshold=0,       # always return a dense ndarray (replaces `.toarray()`)
)
X = onehotencoder.fit_transform(X)



In [9]:
# Label the encoded array: dummy columns first, then the numeric predictors.
encoded_column_names = [
    'origin_1', 'origin_2', 'origin_3',
    'car_type_0', 'car_type_1',
    'cyl', 'disp', 'hp', 'wt', 'acc',
]

# `origin_1` is dropped, leaving `origin_2` / `origin_3` as the origin indicators.
# NOTE(review): both `car_type_0` and `car_type_1` are kept here -- confirm this
# is intended, since together they are collinear with a constant term.
X = pd.DataFrame(X, columns=encoded_column_names).drop(columns='origin_1')
X.tail()

Unnamed: 0,origin_2,origin_3,car_type_0,car_type_1,cyl,disp,hp,wt,acc
393,0.0,0.0,0.0,1.0,4.0,140.0,86.0,2790.0,15.6
394,1.0,0.0,0.0,1.0,4.0,97.0,52.0,2130.0,24.6
395,0.0,0.0,0.0,1.0,4.0,135.0,84.0,2295.0,11.6
396,0.0,0.0,0.0,1.0,4.0,120.0,79.0,2625.0,18.6
397,0.0,0.0,0.0,1.0,4.0,119.0,82.0,2720.0,19.4


In [10]:
# Sanity check: (rows, columns) of the feature table after encoding.
X.shape

(398, 9)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

In [12]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split only.
# `fit` returns the estimator itself, so the cell still ends by displaying it.
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
# Predict mpg for the held-out test rows with the fitted linear model.
y_pred = regressor.predict(X_test)

In [14]:
# `OLS` is imported from `statsmodels.api`: the array-based `OLS` class is not
# exposed by `statsmodels.formula.api` in current statsmodels releases. The
# alias `sm` is unchanged, so the `sm.OLS(...)` calls in later cells keep working.
import statsmodels.api as sm

# Add an explicit constant column so OLS estimates an intercept. Its length is
# taken from X itself instead of a hard-coded row count.
X['intercept'] = np.ones(len(X), dtype=int)

# Put the intercept first; the following cells select predictors by position.
X = X[['intercept', 'origin_2', 'origin_3', 'car_type_0', 'car_type_1', 'cyl', 'disp', 'hp',
       'wt', 'acc']]

# NOTE(review): `intercept`, `car_type_0` and `car_type_1` are linearly dependent
# if every row has exactly one car_type dummy set, which would explain the very
# large condition number in the summary -- consider dropping one car_type dummy.

# Backward elimination, starting point: fit the model on all ten columns.
X_opt = X.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.744
Model:,OLS,Adj. R-squared:,0.738
Method:,Least Squares,F-statistic:,141.0
Date:,"Tue, 10 Dec 2019",Prob (F-statistic):,5.52e-110
Time:,19:07:14,Log-Likelihood:,-1111.7
No. Observations:,398,AIC:,2241.0
Df Residuals:,389,BIC:,2277.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,23.2911,1.959,11.889,0.000,19.439,27.143
origin_2,0.5280,0.679,0.778,0.437,-0.806,1.862
origin_3,2.3449,0.667,3.515,0.000,1.033,3.657
car_type_0,8.3368,1.336,6.241,0.000,5.711,10.963
car_type_1,14.9544,0.837,17.858,0.000,13.308,16.601
cyl,1.4324,0.499,2.873,0.004,0.452,2.413
disp,0.0085,0.009,0.925,0.356,-0.010,0.027
hp,-0.0773,0.016,-4.780,0.000,-0.109,-0.046
wt,-0.0045,0.001,-5.847,0.000,-0.006,-0.003

0,1,2,3
Omnibus:,47.824,Durbin-Watson:,0.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77.08
Skew:,0.754,Prob(JB):,1.83e-17
Kurtosis:,4.541,Cond. No.,1.04e+19


In [15]:
# Backward elimination: refit using column positions 0-8 only,
# i.e. without the column at position 9.
kept_positions = list(range(9))
X_opt = X.iloc[:, kept_positions]

regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.744
Model:,OLS,Adj. R-squared:,0.739
Method:,Least Squares,F-statistic:,161.6
Date:,"Tue, 10 Dec 2019",Prob (F-statistic):,4.21e-111
Time:,19:07:15,Log-Likelihood:,-1111.7
No. Observations:,398,AIC:,2239.0
Df Residuals:,390,BIC:,2271.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,23.1296,1.417,16.327,0.000,20.344,25.915
origin_2,0.5267,0.678,0.777,0.438,-0.806,1.859
origin_3,2.3458,0.666,3.521,0.000,1.036,3.656
car_type_0,8.2562,1.152,7.168,0.000,5.992,10.521
car_type_1,14.8734,0.492,30.254,0.000,13.907,15.840
cyl,1.4343,0.498,2.882,0.004,0.456,2.413
disp,0.0086,0.009,0.942,0.347,-0.009,0.027
hp,-0.0762,0.013,-5.843,0.000,-0.102,-0.051
wt,-0.0045,0.001,-6.635,0.000,-0.006,-0.003

0,1,2,3
Omnibus:,47.356,Durbin-Watson:,0.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76.076
Skew:,0.749,Prob(JB):,3.02e-17
Kurtosis:,4.531,Cond. No.,1.05e+19


In [16]:
# Backward elimination: additionally leave out the column at position 1,
# so positions 1 and 9 are now excluded from the fit.
excluded_positions = (1, 9)
kept_positions = [pos for pos in range(10) if pos not in excluded_positions]
X_opt = X.iloc[:, kept_positions]

regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.739
Method:,Least Squares,F-statistic:,188.6
Date:,"Tue, 10 Dec 2019",Prob (F-statistic):,3.97e-112
Time:,19:07:15,Log-Likelihood:,-1112.0
No. Observations:,398,AIC:,2238.0
Df Residuals:,391,BIC:,2266.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,23.0835,1.415,16.317,0.000,20.302,25.865
origin_3,2.0827,0.574,3.631,0.000,0.955,3.211
car_type_0,8.1486,1.143,7.130,0.000,5.902,10.396
car_type_1,14.9349,0.485,30.796,0.000,13.981,15.888
cyl,1.5008,0.490,3.063,0.002,0.537,2.464
disp,0.0062,0.009,0.724,0.469,-0.011,0.023
hp,-0.0746,0.013,-5.797,0.000,-0.100,-0.049
wt,-0.0045,0.001,-6.596,0.000,-0.006,-0.003

0,1,2,3
Omnibus:,51.203,Durbin-Watson:,0.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,85.794
Skew:,0.783,Prob(JB):,2.34e-19
Kurtosis:,4.65,Cond. No.,1.54e+19


In [17]:
# Backward elimination: additionally leave out the column at position 6,
# so positions 1, 6 and 9 are now excluded from the fit.
excluded_positions = {1, 6, 9}
kept_positions = [pos for pos in range(10) if pos not in excluded_positions]
X_opt = X.iloc[:, kept_positions]

regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,226.5
Date:,"Tue, 10 Dec 2019",Prob (F-statistic):,3.25e-113
Time:,19:07:15,Log-Likelihood:,-1112.3
No. Observations:,398,AIC:,2237.0
Df Residuals:,392,BIC:,2261.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,22.5450,1.203,18.745,0.000,20.180,24.910
origin_3,2.0110,0.565,3.561,0.000,0.901,3.121
car_type_0,7.8889,1.084,7.274,0.000,5.757,10.021
car_type_1,14.6561,0.295,49.738,0.000,14.077,15.235
cyl,1.6865,0.417,4.041,0.000,0.866,2.507
hp,-0.0705,0.012,-6.083,0.000,-0.093,-0.048
wt,-0.0043,0.001,-6.900,0.000,-0.006,-0.003

0,1,2,3
Omnibus:,51.769,Durbin-Watson:,0.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,86.903
Skew:,0.789,Prob(JB):,1.35e-19
Kurtosis:,4.658,Cond. No.,9.23e+18
