In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score


In [2]:
df = pd.read_csv('car.csv')
df.head(398)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1
394,44.0,4,97.0,52,2130,24.6,82,0
395,32.0,4,135.0,84,2295,11.6,82,1
396,28.0,4,120.0,79,2625,18.6,82,1


In [3]:
list(df.columns)

['MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model Year',
 'US Made']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    398 non-null    object 
 4   Weight        398 non-null    int64  
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   US Made       398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [5]:
df.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Weight,Acceleration,Model Year,US Made
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0


In [6]:
 df['Horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [7]:
df.loc[df['Horsepower'] == '?']

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
32,25.0,4,98.0,?,2046,19.0,71,1
126,21.0,6,200.0,?,2875,17.0,74,1
330,40.9,4,85.0,?,1835,17.3,80,0
336,23.6,4,140.0,?,2905,14.3,80,1
354,34.5,4,100.0,?,2320,15.8,81,0
374,23.0,4,151.0,?,3035,20.5,82,1


In [8]:
df = df.drop(labels = [32,126,330,336,354,374] , axis = 0 )

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64  
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    object 
 4   Weight        392 non-null    int64  
 5   Acceleration  392 non-null    float64
 6   Model Year    392 non-null    int64  
 7   US Made       392 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 27.6+ KB


In [10]:
df['Horsepower'] = pd.to_numeric(df['Horsepower'])

In [11]:
cols = df.columns

In [12]:
# copy the data
df_min_max_scaled = df.copy()
  
# apply normalization techniques by Column 

for i in cols:
    df_min_max_scaled[i] = (df_min_max_scaled[i] - df_min_max_scaled[i].min()) / (df_min_max_scaled[i].max() - df_min_max_scaled[i].min())    
  
# view normalized data
display(df_min_max_scaled)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
0,0.239362,1.0,0.617571,0.456522,0.536150,0.238095,0.0,1.0
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,1.0
2,0.239362,1.0,0.645995,0.565217,0.516870,0.178571,0.0,1.0
3,0.186170,1.0,0.609819,0.565217,0.516019,0.238095,0.0,1.0
4,0.212766,1.0,0.604651,0.510870,0.520556,0.148810,0.0,1.0
...,...,...,...,...,...,...,...,...
393,0.478723,0.2,0.186047,0.217391,0.333711,0.452381,1.0,1.0
394,0.930851,0.2,0.074935,0.032609,0.146583,0.988095,1.0,0.0
395,0.611702,0.2,0.173127,0.206522,0.193365,0.214286,1.0,1.0
396,0.505319,0.2,0.134367,0.179348,0.286929,0.630952,1.0,1.0


In [13]:
df1 = df_min_max_scaled.reset_index()
df1.drop( labels = ['index'] , axis = 1)


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
0,0.239362,1.0,0.617571,0.456522,0.536150,0.238095,0.0,1.0
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,1.0
2,0.239362,1.0,0.645995,0.565217,0.516870,0.178571,0.0,1.0
3,0.186170,1.0,0.609819,0.565217,0.516019,0.238095,0.0,1.0
4,0.212766,1.0,0.604651,0.510870,0.520556,0.148810,0.0,1.0
...,...,...,...,...,...,...,...,...
387,0.478723,0.2,0.186047,0.217391,0.333711,0.452381,1.0,1.0
388,0.930851,0.2,0.074935,0.032609,0.146583,0.988095,1.0,0.0
389,0.611702,0.2,0.173127,0.206522,0.193365,0.214286,1.0,1.0
390,0.505319,0.2,0.134367,0.179348,0.286929,0.630952,1.0,1.0


In [14]:
df1.describe()

Unnamed: 0,index,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,197.579082,0.3842,0.494388,0.326646,0.317768,0.386897,0.448888,0.498299,0.625
std,114.534637,0.20758,0.341157,0.270398,0.209191,0.240829,0.164218,0.306978,0.484742
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,98.75,0.212766,0.2,0.095607,0.157609,0.173589,0.34375,0.25,0.0
50%,197.5,0.365691,0.2,0.21447,0.258152,0.337539,0.446429,0.5,1.0
75%,295.25,0.531915,1.0,0.536822,0.434783,0.56755,0.537202,0.75,1.0
max,397.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
x = df1.drop(labels = ['MPG'],axis = 1)
y = df1['MPG']

In [16]:
x

Unnamed: 0,index,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,US Made
0,0,1.0,0.617571,0.456522,0.536150,0.238095,0.0,1.0
1,1,1.0,0.728682,0.646739,0.589736,0.208333,0.0,1.0
2,2,1.0,0.645995,0.565217,0.516870,0.178571,0.0,1.0
3,3,1.0,0.609819,0.565217,0.516019,0.238095,0.0,1.0
4,4,1.0,0.604651,0.510870,0.520556,0.148810,0.0,1.0
...,...,...,...,...,...,...,...,...
387,393,0.2,0.186047,0.217391,0.333711,0.452381,1.0,1.0
388,394,0.2,0.074935,0.032609,0.146583,0.988095,1.0,0.0
389,395,0.2,0.173127,0.206522,0.193365,0.214286,1.0,1.0
390,396,0.2,0.134367,0.179348,0.286929,0.630952,1.0,1.0


In [17]:
 X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [18]:
reg = LinearRegression().fit(X_train, y_train)


In [19]:
y_pred = reg.predict(X_test)
y_pred

array([ 0.48242696,  0.47351222,  0.68914428,  0.39995785,  0.49932814,
        0.5634025 , -0.01172452,  0.56836784,  0.31817665,  0.55115386,
        0.09826889,  0.41838694,  0.20784106,  0.54654518,  0.36178779,
        0.54433329,  0.30820313,  0.64336458,  0.45572776,  0.58457427,
        0.2561787 ,  0.71135597,  0.70662224,  0.16802093,  0.55098963,
        0.47104219,  0.352858  ,  0.19378526,  0.56922936,  0.4381071 ,
        0.12574469,  0.38514796,  0.31178512,  0.60186243,  0.10324436,
        0.72155827,  0.04181078,  0.41319708,  0.06083121, -0.0703491 ,
        0.1015693 ,  0.52197191,  0.73492597,  0.49427633,  0.08568169,
        0.0301852 ,  0.22222285,  0.62296741,  0.39682321,  0.6052787 ,
        0.07895267,  0.47732474,  0.37906897,  0.69723541,  0.50508643,
        0.23156016,  0.30148779,  0.36004377,  0.39151163,  0.45879092,
       -0.00619458,  0.3661581 ,  0.42309167,  0.40840616,  0.54464063,
        0.57947541,  0.44068759,  0.57580545,  0.34891401,  0.03

In [20]:
y_test

78     0.452128
274    0.335106
246    0.720745
55     0.452128
387    0.478723
         ...   
79     0.345745
310    0.750000
352    0.635638
248    0.276596
153    0.159574
Name: MPG, Length: 118, dtype: float64

In [21]:
mse = mean_squared_error(y_test, y_pred).round(2)
rmse = (mse**.5).round(2)
print(mse)
print(rmse)

0.01
0.1


In [24]:
mae = mean_absolute_error(y_test, y_pred).round(2)
mae

0.07

In [25]:
r2_score(y_test, y_pred)

0.8074857456612935