In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

In [38]:
# Location of the mpg CSV (a public GitHub gist), named so it is easy to spot and swap.
MPG_CSV_URL = 'https://gist.githubusercontent.com/omarish/5687264/raw/7e5c814ce6ef33e25d5259c1fe79463c190800d9/mpg.csv'
df = pd.read_csv(MPG_CSV_URL)

## Target variable - mpg


In [39]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


### data cleaning


In [40]:
# Check the dtype of every column; horsepower showing up as `object` rather than
# a numeric dtype is what prompts the cleaning steps that follow.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model_year        int64
origin            int64
name             object
dtype: object

In [41]:
# List the distinct horsepower entries: the placeholder '?' appears among the
# numeric strings, which is why the column was read as `object`.
df['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [42]:
# Convert horsepower to a numeric dtype; entries that cannot be parsed
# (the '?' placeholders) are coerced to NaN.
df = df.assign(horsepower=pd.to_numeric(df['horsepower'], errors='coerce'))

In [43]:
# Per column: does it contain at least one missing value?
df.isna().any()

mpg             False
cylinders       False
displacement    False
horsepower       True
weight          False
acceleration    False
model_year      False
origin          False
name            False
dtype: bool

In [44]:
# Show the rows whose horsepower is missing. The column was already coerced to
# numeric in the earlier cell, so a plain null check is enough (no second
# pd.to_numeric pass). Leaving the frame as the cell's last expression lets the
# notebook render it as a table instead of wrapped print() text.
df[df['horsepower'].isna()]

      mpg  cylinders  displacement  horsepower  weight  acceleration  \
32   25.0          4          98.0         NaN    2046          19.0   
126  21.0          6         200.0         NaN    2875          17.0   
330  40.9          4          85.0         NaN    1835          17.3   
336  23.6          4         140.0         NaN    2905          14.3   
354  34.5          4         100.0         NaN    2320          15.8   
374  23.0          4         151.0         NaN    3035          20.5   

     model_year  origin                  name  
32           71       1            ford pinto  
126          74       1         ford maverick  
330          80       2  renault lecar deluxe  
336          80       1    ford mustang cobra  
354          81       2           renault 18i  
374          82       1        amc concord dl  


In [45]:
# drop the rows that have NaN values

In [46]:
# Keep only fully populated rows: a NaN in any column removes the row.
df = df.dropna(axis=0, how='any')

In [48]:
# (rows, columns) of the frame after removing the rows with missing values.
df.shape

(392, 9)

In [49]:
# drop the columns that will not be used as model features

In [53]:
# Remove the columns that are not used as predictors in this analysis.
# errors='ignore' keeps the cell safe to re-run: `df` is overwritten here, so a
# second execution would otherwise raise KeyError because the columns are
# already gone.
df = df.drop(columns=['name', 'origin', 'model_year'], errors='ignore')

In [54]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
0,18.0,8,307.0,130.0,3504,12.0
1,15.0,8,350.0,165.0,3693,11.5
2,18.0,8,318.0,150.0,3436,11.0
3,16.0,8,304.0,150.0,3433,12.0
4,17.0,8,302.0,140.0,3449,10.5


In [58]:
df.info() # prints the index range, per-column non-null counts and dtypes, and memory usage

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 6 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null int64
acceleration    392 non-null float64
dtypes: float64(4), int64(2)
memory usage: 21.4 KB


In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) for each remaining column.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
count,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864
min,9.0,3.0,68.0,46.0,1613.0,8.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775
50%,22.75,4.0,151.0,93.5,2803.5,15.5
75%,29.0,8.0,275.75,126.0,3614.75,17.025
max,46.6,8.0,455.0,230.0,5140.0,24.8


## now the data is cleaned and we can apply linear regression

In [65]:
# Target: mpg, kept as a one-column DataFrame. Predictors: every other column.
y = df[['mpg']]
x = df.drop(columns='mpg')

In [None]:
# now split the dataset into training and test sets

In [69]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [95]:
# Simple linear regression: fit mpg against horsepower alone on the training rows.
reg = LinearRegression()
reg.fit(X=x_train.loc[:, ['horsepower']], y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [96]:
# Intercept of the model currently bound to `reg`.
print('intercept :', reg.intercept_, sep=' ')

intercept : [39.50084926]


In [97]:
# Coefficient(s) of the model currently bound to `reg`.
print('slope :', reg.coef_, sep=' ')

slope : [[-0.15722932]]


In [98]:
# Predict mpg for the held-out rows using the horsepower column only.
y_pred = reg.predict(X=x_test.loc[:, ['horsepower']])

In [94]:
# R^2 of `y_pred` against the held-out targets.
# NOTE(review): the stored output carries execution count 94, i.e. it was
# produced before the fit/predict cells above (95-98) last ran, and it agrees
# with the three-feature R^2 printed at the end of the notebook to 15 digits.
# It looks stale; restart the kernel and run all cells in order to confirm.
r2_score(y_true=y_test, y_pred=y_pred)

0.7365927620841707

In [113]:
# Multiple linear regression on three predictors, scored on the held-out rows.
# The feature list is defined once so fit and predict cannot drift apart.
multi_features = ['horsepower', 'weight', 'cylinders']
reg = LinearRegression()
reg.fit(x_train[multi_features], y_train)
y_predicted = reg.predict(x_test[multi_features])
print('mean squared error :', mean_squared_error(y_test, y_predicted))
print('r2_score:', r2_score(y_test, y_predicted))


mean squared error : 18.78809912757255
r2_score: 0.7365927620841709
