In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df = pd.read_csv('vehicle_performance.csv')
df.head()

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,name,Kilometer_per_liter
0,1,8,307.0,130,3504,12.0,1970,chevrolet chevelle malibu,7.652587
1,1,8,350.0,165,3693,11.5,1970,buick skylark 320,6.377156
2,1,8,318.0,150,3436,11.0,1970,plymouth satellite,7.652587
3,1,8,304.0,150,3433,12.0,1970,amc rebel sst,6.802299
4,1,8,302.0,140,3449,10.5,1970,ford torino,7.227443


In [3]:
df.shape

(398, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   origin               398 non-null    int64  
 1   cylinders            398 non-null    int64  
 2   displacement         398 non-null    float64
 3   horsepower           398 non-null    object 
 4   weight               398 non-null    int64  
 5   acceleration         398 non-null    float64
 6   year                 398 non-null    int64  
 7   name                 398 non-null    object 
 8   Kilometer_per_liter  398 non-null    float64
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
df[df.duplicated()]

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,name,Kilometer_per_liter


In [6]:
df.isnull().sum()

origin                 0
cylinders              0
displacement           0
horsepower             0
weight                 0
acceleration           0
year                   0
name                   0
Kilometer_per_liter    0
dtype: int64

In [7]:
df.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [8]:
df[df.horsepower == '?'].shape

(6, 9)

In [9]:
df.horsepower.replace('?',np.nan, inplace=True)

In [10]:
df[df.horsepower == '?']

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year,name,Kilometer_per_liter


In [11]:
df['horsepower'] = df['horsepower'].astype('float')

In [12]:
df['horsepower']

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393     86.0
394     52.0
395     84.0
396     79.0
397     82.0
Name: horsepower, Length: 398, dtype: float64

In [13]:
df.isnull().sum()

origin                 0
cylinders              0
displacement           0
horsepower             6
weight                 0
acceleration           0
year                   0
name                   0
Kilometer_per_liter    0
dtype: int64

In [14]:
df.horsepower.fillna(df.horsepower.median(), inplace = True)

In [15]:
df.horsepower.isnull().sum()

np.int64(0)

In [16]:
df.dtypes

origin                   int64
cylinders                int64
displacement           float64
horsepower             float64
weight                   int64
acceleration           float64
year                     int64
name                    object
Kilometer_per_liter    float64
dtype: object

In [17]:
x= df.drop(['name', 'Kilometer_per_liter'], axis=1)
y= df['Kilometer_per_liter']
x

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year
0,1,8,307.0,130.0,3504,12.0,1970
1,1,8,350.0,165.0,3693,11.5,1970
2,1,8,318.0,150.0,3436,11.0,1970
3,1,8,304.0,150.0,3433,12.0,1970
4,1,8,302.0,140.0,3449,10.5,1970
...,...,...,...,...,...,...,...
393,1,4,140.0,86.0,2790,15.6,1982
394,2,4,97.0,52.0,2130,24.6,1982
395,1,4,135.0,84.0,2295,11.6,1982
396,1,4,120.0,79.0,2625,18.6,1982


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, random_state = 42)

In [65]:
x_train

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year
3,1,8,304.0,150.0,3433,12.0,1970
18,3,4,97.0,88.0,2130,14.5,1970
376,3,4,91.0,68.0,2025,18.2,1982
248,3,4,91.0,60.0,1800,16.4,1978
177,2,4,115.0,95.0,2694,15.0,1975
...,...,...,...,...,...,...,...
71,3,3,70.0,97.0,2330,13.5,1972
106,1,8,350.0,180.0,4499,12.5,1973
270,3,4,134.0,95.0,2515,14.8,1978
348,3,4,89.0,62.0,2050,17.3,1981


In [20]:
x_train.shape

(318, 7)

In [21]:
x_test.shape

(80, 7)

In [22]:
y_train.shape

(318,)

In [23]:
y_test.shape

(80,)

In [24]:
from sklearn.linear_model import SGDRegressor

In [25]:
model = SGDRegressor()

In [26]:
model.fit(x_train, y_train)

0,1,2
,"loss  loss: str, default='squared_error' The loss function to be used. The possible values are 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' The 'squared_error' refers to the ordinary least squares fit. 'huber' modifies 'squared_error' to focus less on getting outliers correct by switching from squared to linear loss past a distance of epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is linear past that; this is the loss function used in SVR. 'squared_epsilon_insensitive' is the same but becomes squared loss past a tolerance of epsilon. More details about the losses formulas can be found in the :ref:`User Guide `.",'squared_error'
,"penalty  penalty: {'l2', 'l1', 'elasticnet', None}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. No penalty is added when set to `None`. You can see a visualisation of the penalties in :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`.",'l2'
,"alpha  alpha: float, default=0.0001 Constant that multiplies the regularization term. The higher the value, the stronger the regularization. Also used to compute the learning rate when `learning_rate` is set to 'optimal'. Values must be in the range `[0.0, inf)`.",0.0001
,"l1_ratio  l1_ratio: float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Only used if `penalty` is 'elasticnet'. Values must be in the range `[0.0, 1.0]` or can be `None` if `penalty` is not `elasticnet`. .. versionchanged:: 1.7  `l1_ratio` can be `None` when `penalty` is not ""elasticnet"".",0.15
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If False, the data is assumed to be already centered.",True
,"max_iter  max_iter: int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. Values must be in the range `[1, inf)`. .. versionadded:: 0.19",1000
,"tol  tol: float or None, default=1e-3 The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. Convergence is checked against the training loss or the validation loss depending on the `early_stopping` parameter. Values must be in the range `[0.0, inf)`. .. versionadded:: 0.19",0.001
,"shuffle  shuffle: bool, default=True Whether or not the training data should be shuffled after each epoch.",True
,"verbose  verbose: int, default=0 The verbosity level. Values must be in the range `[0, inf)`.",0
,"epsilon  epsilon: float, default=0.1 Epsilon in the epsilon-insensitive loss functions; only if `loss` is 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. For 'huber', determines the threshold at which it becomes less important to get the prediction exactly right. For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. Values must be in the range `[0.0, inf)`.",0.1


In [27]:
train_results = pd.DataFrame()

In [28]:
train_results['Actual_values'] = y_train

In [29]:
train_results['Predicted_value']= model.predict(x_train)
train_results

Unnamed: 0,Actual_values,Predicted_value
3,6.802299,-2.735018e+15
18,11.478880,-2.716036e+15
376,15.730317,-2.678658e+15
248,15.347688,-2.565643e+15
177,9.778305,-2.923603e+15
...,...,...
71,8.077730,-2.897604e+15
106,5.101724,-3.103606e+15
270,8.970532,-2.790311e+15
348,16.027918,-2.687535e+15


In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
ss = StandardScaler()
sx_train = ss.fit_transform(x_train)

In [None]:
sx_train= pd.DataFrame(ss.fit_transform(x_train), columns = x_train.columns)
sx_train

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year
0,-0.729494,1.527188,1.090196,1.265821,0.552826,-1.319334,-1.696667
1,1.738368,-0.850515,-0.922996,-0.408635,-0.999667,-0.413182,-1.696667
2,1.738368,-0.850515,-0.981350,-0.948782,-1.124772,0.927922,1.638975
3,1.738368,-0.850515,-0.981350,-1.164840,-1.392854,0.275493,0.527094
4,0.504437,-0.850515,-0.747936,-0.219583,-0.327675,-0.231952,-0.306816
...,...,...,...,...,...,...,...
313,1.738368,-1.444941,-1.185587,-0.165569,-0.761372,-0.775643,-1.140727
314,-0.729494,1.527188,1.537573,2.076041,1.822940,-1.138103,-0.862757
315,1.738368,-0.850515,-0.563150,-0.219583,-0.540949,-0.304444,0.527094
316,1.738368,-0.850515,-1.000801,-1.110826,-1.094985,0.601707,1.361005


In [33]:
sx_test = pd.DataFrame(ss.transform(x_test), columns = x_test.columns)
sx_test

Unnamed: 0,origin,cylinders,displacement,horsepower,weight,acceleration,year
0,1.738368,-0.850515,-0.981350,-1.353892,-1.398812,0.637953,-0.028846
1,-0.729494,-0.850515,-0.699308,-0.651701,-0.409887,1.072906,1.638975
2,-0.729494,0.338337,0.389956,-0.084547,-0.399163,-0.956873,-1.418697
3,-0.729494,1.527188,1.226354,1.265821,1.156905,-0.884381,-0.028846
4,-0.729494,1.527188,1.226354,1.265821,1.510773,-0.413182,-0.862757
...,...,...,...,...,...,...,...
75,-0.729494,1.527188,0.662271,0.185527,0.471806,-0.050722,0.527094
76,-0.729494,0.338337,0.565016,0.185527,0.656485,0.275493,0.249124
77,-0.729494,-0.850515,-0.777113,-0.408635,-0.433716,1.435367,1.638975
78,0.504437,-0.850515,-0.991075,-0.894767,-1.229622,-0.594413,-0.306816


In [62]:
import pickle

In [77]:
with open('scaler.pkl', 'wb') as file:
    pickle.dump(ss, file)

In [34]:
model1= SGDRegressor()
model1.fit(sx_train, y_train)

0,1,2
,"loss  loss: str, default='squared_error' The loss function to be used. The possible values are 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' The 'squared_error' refers to the ordinary least squares fit. 'huber' modifies 'squared_error' to focus less on getting outliers correct by switching from squared to linear loss past a distance of epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is linear past that; this is the loss function used in SVR. 'squared_epsilon_insensitive' is the same but becomes squared loss past a tolerance of epsilon. More details about the losses formulas can be found in the :ref:`User Guide `.",'squared_error'
,"penalty  penalty: {'l2', 'l1', 'elasticnet', None}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'. No penalty is added when set to `None`. You can see a visualisation of the penalties in :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`.",'l2'
,"alpha  alpha: float, default=0.0001 Constant that multiplies the regularization term. The higher the value, the stronger the regularization. Also used to compute the learning rate when `learning_rate` is set to 'optimal'. Values must be in the range `[0.0, inf)`.",0.0001
,"l1_ratio  l1_ratio: float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Only used if `penalty` is 'elasticnet'. Values must be in the range `[0.0, 1.0]` or can be `None` if `penalty` is not `elasticnet`. .. versionchanged:: 1.7  `l1_ratio` can be `None` when `penalty` is not ""elasticnet"".",0.15
,"fit_intercept  fit_intercept: bool, default=True Whether the intercept should be estimated or not. If False, the data is assumed to be already centered.",True
,"max_iter  max_iter: int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the :meth:`partial_fit` method. Values must be in the range `[1, inf)`. .. versionadded:: 0.19",1000
,"tol  tol: float or None, default=1e-3 The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. Convergence is checked against the training loss or the validation loss depending on the `early_stopping` parameter. Values must be in the range `[0.0, inf)`. .. versionadded:: 0.19",0.001
,"shuffle  shuffle: bool, default=True Whether or not the training data should be shuffled after each epoch.",True
,"verbose  verbose: int, default=0 The verbosity level. Values must be in the range `[0, inf)`.",0
,"epsilon  epsilon: float, default=0.1 Epsilon in the epsilon-insensitive loss functions; only if `loss` is 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. For 'huber', determines the threshold at which it becomes less important to get the prediction exactly right. For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. Values must be in the range `[0.0, inf)`.",0.1


In [35]:
train_result_scaling = pd.DataFrame()

In [36]:
train_result_scaling['actual']= y_train

In [37]:
train_result_scaling['predicted'] = model1.predict(sx_train)
train_result_scaling['error'] = train_result_scaling['actual'] - train_result_scaling['predicted']
train_result_scaling

Unnamed: 0,actual,predicted,error
3,6.802299,6.334511,0.467788
18,11.478880,10.832864,0.646016
376,15.730317,15.136750,0.593568
248,15.347688,14.457792,0.889896
177,9.778305,10.562734,-0.784429
...,...,...,...
71,8.077730,10.932311,-2.854580
106,5.101724,4.592967,0.508757
270,8.970532,12.525498,-3.554966
348,16.027918,14.826406,1.201512


In [38]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score

In [39]:
mse = mean_squared_error(train_result_scaling['actual'], train_result_scaling['predicted'])
rmse = root_mean_squared_error(train_result_scaling['actual'], train_result_scaling['predicted'])
mae = mean_absolute_error(train_result_scaling['actual'], train_result_scaling['predicted'])
r2 = r2_score(train_result_scaling['actual'], train_result_scaling['predicted'])

In [40]:
print('Training: ')
print()
print(F'Mean Squared Error = {mse}')
print(F'Root Mean Squared Error = {rmse}')
print(F'Mean Absolute Error = {mae}')
print(F'R2 = {r2}')

Training: 

Mean Squared Error = 2.1344194935538696
Root Mean Squared Error = 1.4609652608990638
Mean Absolute Error = 1.1046965304422178
R2 = 0.81164944987152


In [41]:
test_result_scaling = pd.DataFrame()

In [42]:
test_result_scaling['actual'] = y_test
test_result_scaling['predicted'] = model1.predict(sx_test)
test_result_scaling

Unnamed: 0,actual,predicted
198,14.029742,13.867778
396,11.904024,12.571944
33,8.077730,8.932399
208,5.526868,7.124126
93,5.952012,5.434729
...,...,...
249,8.460360,9.412865
225,7.440015,8.712842
367,11.904024,12.501424
175,12.329168,12.553373


In [43]:
mse_test = mean_squared_error(test_result_scaling['actual'], test_result_scaling['predicted'])
rmse_test = root_mean_squared_error(test_result_scaling['actual'], test_result_scaling['predicted'])
mae_test = mean_absolute_error(test_result_scaling['actual'], test_result_scaling['predicted'])
r2_test = r2_score(test_result_scaling['actual'], test_result_scaling['predicted'])

In [44]:
print('Testing: ')
print()
print(F'Mean Squared Error = {mse_test}')
print(F'Root Mean Squared Error = {rmse_test}')
print(F'Mean Absolute Error = {mae_test}')
print(F'R2 = {r2_test}')

Testing: 

Mean Squared Error = 1.5495348784174157
Root Mean Squared Error = 1.2448031484605973
Mean Absolute Error = 0.9716174833836095
R2 = 0.8405521993271514


In [45]:
from sklearn.model_selection import cross_val_score

In [46]:
scores_train = cross_val_score(model1, sx_train, y_train, cv=5, scoring = "r2")
scores_train

array([0.83360106, 0.78736584, 0.81883688, 0.77486564, 0.77883659])

In [47]:
np.mean(scores_train)

np.float64(0.798701201829498)

In [48]:
scores_test = cross_val_score(model1, sx_test, y_test, cv=5, scoring = "r2")
scores_test

array([0.91043604, 0.67385435, 0.82690119, 0.87586562, 0.81900668])

In [49]:
np.mean(scores_test)

np.float64(0.8212127774449447)

In [50]:
training_accuracy = model1.score(sx_train, y_train)
testing_accuracy = model1.score(sx_test, y_test)
print(F'Training Accuracy = {training_accuracy}')
print(F'Testing Accuracy = {testing_accuracy}')

Training Accuracy = 0.81164944987152
Testing Accuracy = 0.8405521993271514


ANN Implementation

In [51]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [54]:
model = Sequential([
    Dense(64, activation = 'relu', input_shape = (x_train.shape[1],)), ## HL1
    Dense(32, activation = 'relu'), ## HL2
    Dense(1), ## Output layer
]
)

In [55]:
model.summary()

In [57]:
import tensorflow
opt = tensorflow.keras.optimizers.Adam(learning_rate = 0.01)
loss = tensorflow.keras.losses.MeanSquaredError()
loss

<LossFunctionWrapper(<function mean_squared_error at 0x00000151AE6C7100>, kwargs={})>

In [58]:
model.compile(optimizer = opt, loss = "mean_squared_error", metrics = ['accuracy'])

In [59]:
from tensorflow.keras.callbacks import EarlyStopping

In [60]:
## Set up Early Stopping

early_stopping_callbacks = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)

In [61]:
## Train the model
history = model.fit(
    sx_train, y_train, validation_data = (sx_test, y_test), epochs = 100,
    callbacks = [early_stopping_callbacks]
)

Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.0000e+00 - loss: 64.0920 - val_accuracy: 0.0000e+00 - val_loss: 12.8123
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0000e+00 - loss: 13.9735 - val_accuracy: 0.0000e+00 - val_loss: 5.3117
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0000e+00 - loss: 7.4303 - val_accuracy: 0.0000e+00 - val_loss: 4.1109
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0000e+00 - loss: 4.4125 - val_accuracy: 0.0000e+00 - val_loss: 3.1277
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0000e+00 - loss: 3.1775 - val_accuracy: 0.0000e+00 - val_loss: 2.0139
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0000e+00 - loss: 2.7070 - val_accuracy: 0.0000e

In [64]:
model.save('vehicle_model.keras')