In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model,model_selection

In [2]:
cardataset = pd.read_csv('autompg.csv')

In [3]:
cardataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


# Dropping the columns not contributing in the outcome 

In [4]:
car = cardataset.drop(['car name','origin','model year'],axis=1)

In [5]:
car.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
0,18.0,8,307.0,130,3504,12.0
1,15.0,8,350.0,165,3693,11.5
2,18.0,8,318.0,150,3436,11.0
3,16.0,8,304.0,150,3433,12.0
4,17.0,8,302.0,140,3449,10.5


In [6]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 18.8+ KB


# Missing Value Treatment 

In [7]:
car.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
dtype: int64

In [8]:
car[car['horsepower'] =='?']

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
32,25.0,4,98.0,?,2046,19.0
126,21.0,6,200.0,?,2875,17.0
330,40.9,4,85.0,?,1835,17.3
336,23.6,4,140.0,?,2905,14.3
354,34.5,4,100.0,?,2320,15.8
374,23.0,4,151.0,?,3035,20.5


In [9]:
car[car['horsepower'] =='?']['horsepower'].value_counts()

?    6
Name: horsepower, dtype: int64

In [10]:
car['horsepower'].value_counts()

150    22
90     20
88     19
110    18
100    17
       ..
138     1
82      1
142     1
149     1
103     1
Name: horsepower, Length: 94, dtype: int64

In [11]:
car['horsepower'].replace('?',150,inplace=True)  #replaced the '?' to the max occurring value in horsepower column

In [12]:
car['horsepower'] = car['horsepower'].astype(str).astype(int)  #converting the object type to string type

In [13]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int32  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
dtypes: float64(3), int32(1), int64(2)
memory usage: 17.2 KB


In [14]:
car['horsepower'].value_counts()

150    28
90     20
88     19
110    18
100    17
       ..
103     1
94      1
93      1
91      1
102     1
Name: horsepower, Length: 93, dtype: int64

In [15]:
car['horsepower'].describe()

count    398.000000
mean     105.155779
std       38.600986
min       46.000000
25%       76.000000
50%       95.000000
75%      130.000000
max      230.000000
Name: horsepower, dtype: float64

# Converting to array and reshaping it 

In [16]:
x = np.array(car.drop('mpg',axis=1))

In [17]:
y = np.array(car['mpg'])

In [18]:
x.shape

(398, 5)

In [19]:
y.shape

(398,)

In [20]:
y = y.reshape(len(y),1)

In [21]:
y.shape

(398, 1)

# Splitting data into train and test data

In [22]:
lr = linear_model.LinearRegression()

In [23]:
x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size = 0.20)

In [24]:
x_train.shape

(318, 5)

In [25]:
x_test.shape

(80, 5)

# Applying linear regression

In [26]:
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
y_pred = lr.predict(x_test)

In [28]:
y_pred[5]

array([25.52268175])

In [29]:
y_test[5]  #residue error of 25.52 - 24 = 1.52

array([24.])

# R^2

In [30]:
lr.score(x_test,y_test)  #R^2 should be closer to 1 for the most optimal solution

0.7191751095635452

# Coefficients and Intercept

In [31]:
lr.coef_  #the various values of m1,m2,m3,m4,m5

array([[-0.10812512, -0.00798639, -0.01303601, -0.00577446,  0.22099423]])

In [32]:
lr.coef_.shape

(1, 5)

In [33]:
lr.intercept_  #the value of intercept

array([40.62162863])

In [34]:
lr.intercept_.shape

(1,)

In [35]:
car.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
0,18.0,8,307.0,130,3504,12.0


# Final Equation

In [36]:
print('The equation generated in this modeling is: \n\n Miles per Gallon = {0} * CYLINDER + {1} * DISPLACEMENT + {2} * HORSEPOWER + {3} * WEIGHT + {4} * ACCELERATION + {5}'.format(lr.coef_[0,0],lr.coef_[0,1],lr.coef_[0,2],lr.coef_[0,3],lr.coef_[0,4],lr.intercept_[0]))

The equation generated in this modeling is: 

 Miles per Gallon = -0.1081251165350353 * CYLINDER + -0.007986391542518165 * DISPLACEMENT + -0.013036006473626549 * HORSEPOWER + -0.005774455480094679 * WEIGHT + 0.2209942317504866 * ACCELERATION + 40.621628632541345
