In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the dataset into a DataFrame. The path is relative, so 'mpg_data.csv'
# must sit in the kernel's current working directory.
data = pd.read_csv('mpg_data.csv')

In [None]:
# Preview the first five rows to eyeball column names and value formats.
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [None]:
# List the column labels; these exact strings (including the space in
# 'model year') are used as keys further down.
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [None]:
# Per-column flag: True when the column contains at least one missing value.
data.isna().any(axis=0)

mpg             False
cylinders       False
displacement    False
horsepower       True
weight          False
acceleration    False
model year      False
origin          False
car name        False
dtype: bool

In [None]:
# Mean-impute the missing horsepower values (the only column flagged as
# containing NaNs by the isnull() check).
#
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object,
# raises a FutureWarning in pandas 2.x, and leaves `data` unchanged once
# Copy-on-Write is active. Assignment behaves the same on every version.
#
# NOTE(review): the mean is taken over the full dataset, i.e. before the
# train/test split, so held-out rows influence the imputed value.
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].mean())

In [None]:
# Summary statistics for the numeric columns, run after imputation so the
# horsepower count reflects the filled-in values.
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.199187,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,95.0,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Predictor columns, in the order the scaler and the model will see them.
factors = [
    'cylinders',
    'displacement',
    'horsepower',
    'acceleration',
    'weight',
    'origin',
    'model year',
]

# Work on independent copies so later steps never modify `data` itself.
X = data[factors].copy()        # feature matrix
y = data['mpg'].copy()          # regression target

In [None]:
# Create the (still unfitted) feature standardizer; it is fitted on the
# training split only, further down.
scaler = StandardScaler()

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

# Sanity check: training features and targets have the same number of rows.
len(X_train) == len(y_train)

True

In [None]:
# Learn the per-feature scaling statistics from the training split only, so
# nothing from the held-out rows enters the scaling.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Standardize the training features with the statistics just fitted.
X_train_scaled = scaler.transform(X_train)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the regression on the standardized training features against mpg.
model.fit(X_train_scaled,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
from pickle import dump

In [None]:
# Persist the fitted model and its scaler side by side: predictions are only
# valid when inputs go through the very same scaler the model was trained with.
# Context managers guarantee each file is flushed and closed (the previous
# dump(..., open(...)) form left the handles open until garbage collection).
with open('model.pkl', 'wb') as model_file:
    dump(model, model_file)
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [None]:
from pickle import load

In [None]:
# Reload the persisted artifacts, replacing the in-memory `model` and `scaler`,
# to check that the pickled objects round-trip. Files are opened via context
# managers so the handles are closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; this is acceptable here only
# because both files were written by this notebook. Never load untrusted pickles.
with open('model.pkl', 'rb') as model_file:
    model = load(model_file)
# load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

In [None]:
# Scale the held-out features with the reloaded scaler. Only transform() is
# called, so the statistics fitted on the training split are reused as-is.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Show the (unscaled) held-out features; the row labels are the original
# indices from `data`, shuffled by the split.
X_test

Unnamed: 0,cylinders,displacement,horsepower,acceleration,weight,origin,model year
299,4,141.0,71.0,24.8,3190,2,79
73,8,307.0,130.0,14.0,4098,1,72
155,6,250.0,72.0,19.5,3158,1,75
164,6,231.0,110.0,15.0,3039,1,75
85,8,350.0,175.0,13.0,4100,1,73
...,...,...,...,...,...,...,...
103,8,400.0,150.0,14.0,4997,1,73
283,6,232.0,90.0,18.2,3265,1,79
74,8,302.0,140.0,16.0,4294,1,72
59,4,97.0,54.0,23.5,2254,2,72


In [None]:
# Predicted mpg for every held-out row, in the same order as X_test.
yhat = model.predict(X_test_scaled)
yhat

array([25.80062558, 12.5951797 , 22.97728027, 21.89444048, 13.1669414 ,
       32.45238551, 13.50991598,  8.88690757, 10.2865107 , 18.98150144,
       28.44956837, 27.78851612, 20.81273017, 22.84844919, 11.6363639 ,
       27.07250556, 34.78722995, 29.96917443, 23.7155859 , 22.11071662,
       22.04116578, 21.24174918, 28.95464125, 31.86526355,  6.04799024,
        8.25085687, 31.44146104, 15.60212031,  9.67352462, 26.39410179,
       14.10632933, 25.21341076, 31.31580079, 16.50301389, 21.04591782,
       32.07042284, 22.36356233, 25.34462989, 18.73258154, 30.77170994,
       22.51998416, 21.08575631, 17.68921667, 30.50730495,  5.1199258 ,
       24.81060424, 15.24145069, 10.54357534, 34.50320733, 28.63604471,
       20.08119019,  7.03188344, 20.30346514, 29.66061095, 30.11557622,
       25.19658046, 26.55841371, 33.35738291,  8.27711951, 25.56535155,
       32.37047385, 33.83440046, 32.743785  , 29.50695203, 15.21268305,
       22.95975948, 24.94918489, 24.15275574, 33.204147  , 28.42

In [None]:
# Test-set mean squared error (in squared mpg units) of the reloaded model.
mean_squared_error(y_true=y_test, y_pred=yhat)

11.105454122887837

In [None]:
# Feature values for one hand-entered car, keyed by the training column names.
# They mirror the row labelled 299 in the X_test display, so the prediction for
# this record can be compared against the first entry of `yhat`.
val = {
    'cylinders': 4,
    'displacement': 141,
    'horsepower': 71,
    'acceleration': 24.8,
    'weight': 3190,
    'origin': 2,
    'model year': 79,
}

In [None]:
# Wrap the single record in a one-row DataFrame (row label 0).
# The column order comes from the shared `factors` list rather than a second
# hand-typed copy: the scaler and model consume features positionally, so the
# order here must be exactly the one they were fitted with.
d = pd.DataFrame(val, columns=factors, index=[0])
d

Unnamed: 0,cylinders,displacement,horsepower,acceleration,weight,origin,model year
0,4,141,71,24.8,3190,2,79


In [None]:
# Standardize the single record with the same scaler used for the model's
# training inputs.
d_scaled = scaler.transform(d)

In [None]:
# Predicted mpg for the hand-entered record (returned as a one-element array).
model.predict(d_scaled)

array([25.80062558])

In [None]:
# Inspect the standardized feature values that were fed to the model.
d_scaled

array([[-0.82049435, -0.46098959, -0.87881366,  3.5061541 ,  0.33221903,
         0.47616332,  0.76139117]])