In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore")

In [2]:

cols=['MPG','Cylinders','Displacement','Horsepower','Weight','Acceleration','model_year','Origin']
df=pd.read_csv('auto-mpg.data', names=cols, na_values="?", comment='\t', sep=" ",skipinitialspace=True)
data=df.copy()

split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(data,data['Cylinders']):
    strat_train_set= data.loc[train_index]
    strat_test_set= data.loc[test_index]

In [3]:
data.columns

Index(['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', 'model_year', 'Origin'],
      dtype='object')

In [4]:
data=strat_train_set.drop("MPG", axis=1)   
data_labels=strat_train_set["MPG"].copy()
#data

In [5]:
data.head(2)

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,model_year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2


In [6]:
data["Origin"].value_counts()

1    198
3     60
2     60
Name: Origin, dtype: int64

In [7]:

data["Origin"] = data["Origin"].replace({1:"india", 2:"usa", 3:"germany"})

data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,model_year,Origin
145,4,83.0,61.0,2003.0,19.0,74,germany
151,4,79.0,67.0,2000.0,16.0,74,usa
388,4,156.0,92.0,2585.0,14.5,82,india
48,6,250.0,88.0,3139.0,14.5,71,india
114,4,98.0,90.0,2265.0,15.5,73,usa


In [8]:
acc_ix,hpower_ix,cyl_ix=4,2,0
class CustomAttrAdder(BaseEstimator,TransformerMixin):
    def __init__(self,acc_on_power=True):
        self.acc_on_power=acc_on_power
    def fit(self,x,y=None):
        return self
    def transform(self,x):
        acc_on_cyl=x[:,acc_ix]/x[:,cyl_ix]
        if self.acc_on_power:
            acc_on_power=x[:,acc_ix]/x[:,hpower_ix]
            return np.c_[x,acc_on_cyl,acc_on_power]
        return np.c_[x,acc_on_cyl]

In [9]:
data.dtypes

Cylinders         int64
Displacement    float64
Horsepower      float64
Weight          float64
Acceleration    float64
model_year        int64
Origin           object
dtype: object

In [10]:
def num_pipeline_transformer(data):
    
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    
   
    cat_attrs=["Origin"]
    num_attrs,num_pipeline = num_pipeline_transformer(data)
    
    full_pipeline=ColumnTransformer([
        ("num",num_pipeline,list(num_attrs)),
        ("cat", OneHotEncoder() ,cat_attrs)
    ])
    prepared_data =full_pipeline.fit_transform(data)
    return prepared_data

In [11]:
data.isna().sum()

Cylinders       0
Displacement    0
Horsepower      4
Weight          0
Acceleration    0
model_year      0
Origin          0
dtype: int64

In [12]:
#data['Horsepower'].fillna()min()

In [13]:
#preprocessed_df=preprocess_origin_cols(data)
prepared_data=pipeline_transformer(data)

In [14]:
#preprocessed_df.isna().sum()

In [15]:
#preprocessed_df.Origin.value_counts()

In [16]:
#preprocessed_df.head()

In [17]:
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [18]:
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()
lin_reg.fit(prepared_data,data_labels)

LinearRegression()

In [19]:
sample_data=data.iloc[:5]
sample_label=data_labels[:5]
sample_data_prepared=pipeline_transformer(sample_data)

print("prediction of samples:",lin_reg.predict(sample_data_prepared))

prediction of samples: [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [20]:
print("actual sample:",list(sample_label))

actual sample: [32.0, 31.0, 26.0, 18.0, 26.0]


mean squared error

In [21]:
from sklearn.metrics import mean_squared_error

mpg_predictions =lin_reg.predict(prepared_data)
lin_mse=mean_squared_error(data_labels,mpg_predictions)
lin_rmse =np.sqrt(lin_mse)
lin_mse


8.755919038823139

In [22]:
from sklearn.tree import DecisionTreeRegressor
tree_reg=DecisionTreeRegressor()
tree_reg.fit(prepared_data,data_labels)

DecisionTreeRegressor()

In [23]:
from sklearn.metrics import mean_squared_error
mpg_predictions=tree_reg.predict(prepared_data)
tree_mse=mean_squared_error(data_labels,mpg_predictions)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

0.0

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
tree_reg_rmse_scores=np.sqrt(-scores)
tree_reg_rmse_scores

array([2.92398487, 2.83058077, 2.8951468 , 3.42614032, 2.26218921,
       2.88747294, 3.719627  , 4.1649955 , 4.16630106, 2.64958913])

In [25]:
tree_reg_rmse_scores.mean()

3.1926027597542523

In [26]:
scores = cross_val_score(lin_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
lin_reg_rmse_scores=np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [27]:
lin_reg_rmse_scores.mean()

3.075708179370932

In [28]:
from sklearn.ensemble import RandomForestRegressor
ran_reg=RandomForestRegressor()
ran_reg.fit(prepared_data,data_labels)

from sklearn.metrics import mean_squared_error
mpg_predictions=ran_reg.predict(prepared_data)
mse=mean_squared_error(data_labels,mpg_predictions)
rmse=np.sqrt(mse)
mse

0.9151182138364767

In [29]:
scores = cross_val_score(ran_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
ran_reg_rmse_scores=np.sqrt(-scores)
ran_reg_rmse_scores

array([2.19822903, 2.40834666, 2.8082186 , 2.41865354, 1.99991805,
       2.57054174, 2.65909899, 2.49316711, 4.15228599, 1.94667008])

In [30]:
ran_reg_rmse_scores.mean()

2.5655129794543963

svmr

In [31]:
from sklearn.svm import SVR
svm_reg=SVR(kernel='linear')
svm_reg.fit(prepared_data,data_labels)
scores = cross_val_score(svm_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
svm_reg_rmse_scores=np.sqrt(-scores)
svm_reg_rmse_scores.mean()


3.0865916208028232

hyperparameter tuning using gridsearchcv

In [32]:
from sklearn.model_selection import GridSearchCV
param_grid = [{'n_estimators':[30,10,30],'max_features':[2,4,6,8]},{'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]}]
forest_reg=RandomForestRegressor()
grid_search=GridSearchCV(forest_reg,param_grid,scoring="neg_mean_squared_error",return_train_score=True,cv=10)
grid_search.fit(prepared_data,data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [30, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [33]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [34]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)


2.855720961295302 {'max_features': 2, 'n_estimators': 30}
2.873371098861787 {'max_features': 2, 'n_estimators': 10}
2.8450545526766433 {'max_features': 2, 'n_estimators': 30}
2.776125471258181 {'max_features': 4, 'n_estimators': 30}
2.9128663465429683 {'max_features': 4, 'n_estimators': 10}
2.7161309599047407 {'max_features': 4, 'n_estimators': 30}
2.675719871817167 {'max_features': 6, 'n_estimators': 30}
2.877460721124252 {'max_features': 6, 'n_estimators': 10}
2.6621705256204153 {'max_features': 6, 'n_estimators': 30}
2.6932049074682585 {'max_features': 8, 'n_estimators': 30}
2.7204919447355413 {'max_features': 8, 'n_estimators': 10}
2.658936084841193 {'max_features': 8, 'n_estimators': 30}
3.3816190072530428 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.992520085428592 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.1164666106513415 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8496307202750826 {'bootstrap': False, 'max_features': 3, 'n_es

In [35]:
feature_importances=grid_search.best_estimator_.feature_importances_
feature_importances

array([0.19154312, 0.29383954, 0.09873704, 0.18958405, 0.01468611,
       0.12529611, 0.05541322, 0.02289115, 0.00398861, 0.00217522,
       0.00184582])

In [36]:
extra_attrs =["acc_on_power","acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))
attrs=num_attrs+extra_attrs
sorted(zip(attrs,feature_importances),reverse=True)

[('model_year', 0.12529611077576103),
 ('acc_on_power', 0.055413220731383064),
 ('acc_on_cyl', 0.022891149281757495),
 ('Weight', 0.18958404935849035),
 ('Horsepower', 0.0987370429430045),
 ('Displacement', 0.29383954481796964),
 ('Cylinders', 0.19154312241718713),
 ('Acceleration', 0.014686110039627808)]

In [37]:
final_model=grid_search.best_estimator_

x_test = strat_test_set.drop("MPG",axis=1)
y_test = strat_test_set["MPG"].copy()
x_test["Origin"] = x_test["Origin"].replace({1:"india", 2:"usa", 3:"germany"})
x_test_prepared = pipeline_transformer(x_test)

final_predictions=final_model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test,final_predictions)
final_rmse = np.sqrt(final_mse)


In [38]:
final_rmse

3.0752572159812024

In [62]:
def predict_mpg(config, model):
    list=['Cylinders','Displacement','Horsepower','Weight','Acceleration','model_year','Origin']
    con={}
    for key in list:
        for value in config:
            con[key] = value
            config.remove(value)
            break  
   
    
    
    if type(con) == dict:
        df = pd.DataFrame(con)
        print(df)
      
    else:
        df = con
        
    print(df)
    
    df["Origin"] = df["Origin"].replace({1:"india", 2:"usa", 3:"germany"})
    prepared_df = pipeline_transformer(df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [63]:
'''vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3,2,1]
}'''
vehicle_config = [ [4, 6, 8],[155.0, 160.0, 165.5],[93.0, 130.0, 98.0],[2500.0, 3150.0, 2600.0],[15.0, 14.0, 16.0],[81, 80, 78],[3, 2, 1]]
predict_mpg(vehicle_config, final_model)


   Cylinders  Displacement  Horsepower  Weight  Acceleration  model_year  \
0          4         155.0        93.0  2500.0          15.0          81   
1          6         160.0       130.0  3150.0          14.0          80   
2          8         165.5        98.0  2600.0          16.0          78   

   Origin  
0       3  
1       2  
2       1  
   Cylinders  Displacement  Horsepower  Weight  Acceleration  model_year  \
0          4         155.0        93.0  2500.0          15.0          81   
1          6         160.0       130.0  3150.0          14.0          80   
2          8         165.5        98.0  2600.0          16.0          78   

   Origin  
0       3  
1       2  
2       1  


array([33.53333333, 18.04333333, 19.10666667])

In [41]:
import pickle

In [42]:
x_test

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,model_year,Origin
128,6,250.0,100.0,3336.0,17.0,74,india
100,6,250.0,88.0,3021.0,16.5,73,india
330,4,85.0,,1835.0,17.3,80,usa
57,4,113.0,95.0,2278.0,15.5,72,germany
160,6,231.0,110.0,3907.0,21.0,75,india
...,...,...,...,...,...,...,...
266,4,98.0,68.0,2155.0,16.5,78,india
389,6,232.0,112.0,2835.0,14.7,82,india
217,4,111.0,80.0,2155.0,14.8,77,india
66,8,304.0,150.0,3672.0,11.5,72,india


In [43]:
file = open("model.pkl",'wb')

#dump the model to that file
pickle.dump(final_model,file)

In [44]:
reg = pickle.load(open('model.pkl','rb'))
ypred=reg.predict(x_test_prepared)
#predict_mpg(vehicle_config,model)
ypred

array([19.03333333, 19.7       , 32.25666667, 24.21666667, 17.14      ,
       19.62333333, 18.37666667, 14.63333333, 25.21333333, 15.69666667,
       24.04666667, 37.5       , 29.8       , 23.38      , 29.49333333,
       20.72666667, 14.75      , 28.63333333, 13.8       , 36.74      ,
       21.35333333, 15.25      , 17.87333333, 13.91666667, 17.63333333,
       34.01      , 24.56666667, 19.98333333, 26.58      , 13.92333333,
       20.26333333, 33.58      , 15.00666667, 18.11666667, 30.28333333,
       14.6       , 24.30333333, 14.63333333, 32.85      , 34.72      ,
       32.80333333, 31.72      , 39.14      , 33.13333333, 20.36333333,
       24.22666667, 30.29      , 24.04333333, 15.31666667, 21.12666667,
       23.24666667, 25.61666667, 13.88333333, 23.92666667, 25.56333333,
       23.35666667, 27.46666667, 13.28333333, 13.66666667, 32.72      ,
       26.82333333, 19.96      , 30.62      , 33.89666667, 25.61666667,
       26.86      , 25.89      , 34.56666667, 23.19      , 13.88

In [45]:
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}