In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the car-price CSV into a DataFrame and preview its first five rows.
automobile_data = pd.read_csv(filepath_or_buffer='datasets/CarPrice_Assignment.csv')
automobile_data.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
automobile_data.shape

(205, 26)

In [4]:
# List the column labels of the loaded frame.
automobile_data.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [5]:
# Remove the CarName, car_ID and symboling columns so they are not used as
# model features. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError for columns that were already removed.
automobile_data = automobile_data.drop(
    columns=['CarName', 'car_ID', 'symboling'],
    errors='ignore',
)

In [6]:
# One-hot encode the remaining categorical columns; numeric columns pass
# through unchanged.
# NOTE(review): every category level is kept (drop_first is left at its
# default), so the dummy columns of each feature sum to one — confirm this
# collinearity is acceptable if the coefficients are going to be interpreted.
automobile_data = pd.get_dummies(data=automobile_data)

In [7]:
automobile_data.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,...,0,0,0,0,0,0,0,1,0,0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,...,0,0,0,0,0,0,0,1,0,0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Separate the encoded frame into the target (price) and the feature matrix
# (every other column).
Y = automobile_data['price']
X = automobile_data.drop(columns='price')

In [9]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split —
# and therefore every score computed from it — reproducible across
# Restart & Run All. (Scores recorded from an earlier unseeded run will differ
# from the ones produced after this change.)
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [10]:
# Fit a linear regression model on the training split.
model = LinearRegression().fit(X=x_train, y=y_train)

In [11]:
# Score the fitted model on the same data it was trained on.
Training_score = model.score(X=x_train, y=y_train)

In [12]:
# Show the training-set score computed in the previous cell.
Training_score

0.9502380107101203

In [13]:
# Predict prices for the held-out test rows.
y_pred = model.predict(X=x_test)

In [14]:
# R^2 of the model's predictions on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8538245139756244

# Serializing the model parameters to JSON

In [15]:
import json

In [16]:
# Inspect the learned coefficient array of the fitted model.
model.coef_

array([ 5.11024986e+01, -7.19698260e+01,  7.60223154e+02, -4.26135157e+01,
        3.97321758e+00,  1.00167843e+02, -1.71801309e+03, -4.39796040e+03,
       -1.10949276e+03,  5.75279951e+00,  2.15956212e+00, -6.74796159e+01,
        8.25363369e+01,  5.44318947e+03, -5.44318947e+03, -5.00078644e+02,
        5.00078644e+02, -1.21802973e+02,  1.21802973e+02,  2.95340051e+03,
       -2.34931533e+03, -6.47362202e+02,  4.05178632e+02, -3.61901605e+02,
       -8.44286120e+02, -3.84862564e+02,  1.22914868e+03, -3.96020545e+03,
        3.96020545e+03, -2.43907382e+02, -4.56956531e+03,  5.78878359e+01,
        4.75027584e+03,  2.56088960e+03, -4.83973907e+03,  2.28415849e+03,
        4.86708011e+03, -4.25836429e+03, -5.27073100e+03, -3.88275874e+02,
        4.73307882e+03, -1.96694625e+03,  2.28415849e+03,  7.71520144e+01,
       -8.55844985e+01, -1.13840790e+03,  5.44318947e+03,  0.00000000e+00,
        2.99858082e+01, -2.99496032e+03, -1.33137457e+03])

In [17]:
# Inspect the learned intercept term of the fitted model.
model.intercept_

-22327.1743526524

To serialize a linear regression model, we only need to save its coefficients (`coef_`) and its intercept (`intercept_`).

In [18]:
# Collect the two fitted attributes that are saved to JSON below.
# ndarray.tolist() converts NumPy scalars into built-in Python floats, so the
# dict is JSON-serializable regardless of the array dtype; list(model.coef_)
# would keep NumPy scalar elements instead. Using .tolist() for both entries
# also keeps the two conversions consistent.
model_param = {
    'coef': model.coef_.tolist(),
    'intercept': model.intercept_.tolist(),
}

In [19]:
# Render the parameter dict as pretty-printed JSON text (4-space indent),
# then echo the resulting string.
json_txt = json.dumps(obj=model_param, indent=4)
json_txt

'{\n    "coef": [\n        51.10249855073477,\n        -71.96982599431358,\n        760.2231543308437,\n        -42.613515703957034,\n        3.9732175787368504,\n        100.16784306482549,\n        -1718.0130859012665,\n        -4397.9603991152435,\n        -1109.4927599597263,\n        5.752799510941259,\n        2.159562116070447,\n        -67.4796158500065,\n        82.53633693269285,\n        5443.1894694675175,\n        -5443.189469467412,\n        -500.07864414455366,\n        500.07864414454957,\n        -121.80297301009477,\n        121.80297300990009,\n        2953.4005069878294,\n        -2349.3153320980527,\n        -647.3622019599725,\n        405.17863230143575,\n        -361.90160523118476,\n        -844.2861198504702,\n        -384.8625639561998,\n        1229.1486838068122,\n        -3960.2054491250674,\n        3960.205449125066,\n        -243.90738156350204,\n        -4569.565311083428,\n        57.887835904785845,\n        4750.275835159109,\n        2560.889600202

In [20]:
# Persist the JSON text to disk. The target directory is created first so the
# cell also works on a fresh checkout where 'models/' does not exist yet;
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs('models', exist_ok=True)

# An explicit encoding keeps the written bytes independent of the platform's
# default locale.
with open('models/regressor_param.txt', 'w', encoding='utf-8') as file:
    file.write(json_txt)

In [21]:
# Read the saved file back and parse it, to verify the round trip.
with open('models/regressor_param.txt', mode='r') as file:
    json_loaded = json.load(fp=file)

In [22]:
# Inspect the parsed parameters (a dict with 'coef' and 'intercept' keys).
json_loaded

{'coef': [51.10249855073477,
  -71.96982599431358,
  760.2231543308437,
  -42.613515703957034,
  3.9732175787368504,
  100.16784306482549,
  -1718.0130859012665,
  -4397.9603991152435,
  -1109.4927599597263,
  5.752799510941259,
  2.159562116070447,
  -67.4796158500065,
  82.53633693269285,
  5443.1894694675175,
  -5443.189469467412,
  -500.07864414455366,
  500.07864414454957,
  -121.80297301009477,
  121.80297300990009,
  2953.4005069878294,
  -2349.3153320980527,
  -647.3622019599725,
  405.17863230143575,
  -361.90160523118476,
  -844.2861198504702,
  -384.8625639561998,
  1229.1486838068122,
  -3960.2054491250674,
  3960.205449125066,
  -243.90738156350204,
  -4569.565311083428,
  57.887835904785845,
  4750.275835159109,
  2560.8896002020965,
  -4839.739065319084,
  2284.158486699918,
  4867.080114063341,
  -4258.364294351427,
  -5270.730997613208,
  -388.2758739668004,
  4733.0788199128865,
  -1966.9462547446215,
  2284.158486699893,
  77.15201438811893,
  -85.58449848021286,
  -

In [23]:
# Create an unfitted estimator; its learned parameters are assigned manually
# from the loaded JSON rather than by calling fit().
new_model = LinearRegression()

In [24]:
# Restore the saved parameters onto the fresh estimator so it can predict
# without being fitted. The two assignments are independent of each other.
new_model.intercept_ = np.array(json_loaded['intercept'])
new_model.coef_ = np.array(json_loaded['coef'])

In [25]:
# Score the rebuilt model on the held-out split; the result should match the
# test R^2 of the originally fitted model.
y_new_pred = new_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_new_pred)

0.8538245139756244

# Using pickle and unpickle

Using pickle is easier because we do not have to know which parameters to write to JSON: we simply pickle the fitted model and unpickle it later.
Manual JSON serialization, as shown above, is practical only for simple models such as linear regression.
Pickling is the better approach for other models as well, such as decision trees.

Pickle works with any ML model that is a Python object, not just linear regression models.

In [26]:
import pickle

In [27]:
# Serialize the whole fitted estimator with pickle. The context manager
# guarantees the file handle is flushed and closed as soon as the dump
# finishes, instead of leaving an open handle behind.
with open('models/model.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

In [28]:
# Load the estimator back from disk, closing the file handle deterministically.
# SECURITY: pickle.load can execute arbitrary code while deserializing — only
# unpickle files that you created yourself or otherwise trust.
with open('models/model.pkl', 'rb') as pkl_file:
    pkl_model = pickle.load(pkl_file)

In [29]:
# Score the unpickled model on the held-out split to confirm it behaves like
# the original.
y_pkl_pred = pkl_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_pkl_pred)

0.8538245139756244

# Using Joblib

joblib is often the better choice, as it works more efficiently with objects that hold large NumPy arrays internally.

In [30]:
import joblib

In [31]:
# Path used by the following cells to save and reload the model with joblib.
filename = 'models/model.joblib'

In [32]:
# Persist the fitted estimator with joblib; the call returns the list of
# file names it wrote, which is shown as the cell output.
joblib.dump(value=model, filename=filename)

['models/model.joblib']

In [33]:
# Reload the estimator from the joblib file written above.
joblib_model = joblib.load(filename=filename)

In [34]:
# Score the joblib-restored model on the held-out split to confirm it behaves
# like the original.
y_joblib_pred = joblib_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_joblib_pred)

0.8538245139756244