In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [45]:
df = pd.read_csv("../datasets/auto-mpg.csv")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [46]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [47]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [48]:
df.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320


In [49]:
df['origin'] = df['origin'].replace({1:'america', 2:'europe', 3:'asia'})

In [50]:
df.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,america,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,america,buick skylark 320


In [51]:
df = df.drop('car name',axis = 1)

In [52]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [53]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

In [55]:
hpisDigit = pd.DataFrame(df.horsepower.str.isdigit())

In [57]:
df[hpisDigit['horsepower'] == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,?,2046,19.0,71,america
126,21.0,6,200.0,?,2875,17.0,74,america
330,40.9,4,85.0,?,1835,17.3,80,europe
336,23.6,4,140.0,?,2905,14.3,80,america
354,34.5,4,100.0,?,2320,15.8,81,europe
374,23.0,4,151.0,?,3035,20.5,82,america


In [64]:
df = df.replace('?', np.nan)
df[hpisDigit['horsepower'] == False]


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,,2046,19.0,71,america
126,21.0,6,200.0,,2875,17.0,74,america
330,40.9,4,85.0,,1835,17.3,80,europe
336,23.6,4,140.0,,2905,14.3,80,america
354,34.5,4,100.0,,2320,15.8,81,europe
374,23.0,4,151.0,,3035,20.5,82,america


In [67]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [73]:
medianCheack = df.drop('origin', axis = 1)
medianCheack

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307.0,130,3504,12.0,70
1,15.0,8,350.0,165,3693,11.5,70
2,18.0,8,318.0,150,3436,11.0,70
3,16.0,8,304.0,150,3433,12.0,70
4,17.0,8,302.0,140,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82
394,44.0,4,97.0,52,2130,24.6,82
395,32.0,4,135.0,84,2295,11.6,82
396,28.0,4,120.0,79,2625,18.6,82


In [74]:
medianCheack.median()

mpg               23.0
cylinders          4.0
displacement     148.5
horsepower        93.5
weight          2803.5
acceleration      15.5
model year        76.0
dtype: object

# to replace NaN value with -> median

In [75]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [76]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [80]:
df[hpisDigit['horsepower'] == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,93.5,2046,19.0,71,america
126,21.0,6,200.0,93.5,2875,17.0,74,america
330,40.9,4,85.0,93.5,1835,17.3,80,europe
336,23.6,4,140.0,93.5,2905,14.3,80,america
354,34.5,4,100.0,93.5,2320,15.8,81,europe
374,23.0,4,151.0,93.5,3035,20.5,82,america


# above values are replaced with median value

In [79]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

# we have to conver Horsepower -> object to --> float

In [81]:
df['horsepower'] = df['horsepower'].astype('float64')

In [83]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

# to create dummies variables

In [84]:
df = pd.get_dummies(df,['origin'])

In [85]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,True,False,False
1,15.0,8,350.0,165.0,3693,11.5,70,True,False,False
2,18.0,8,318.0,150.0,3436,11.0,70,True,False,False
3,16.0,8,304.0,150.0,3433,12.0,70,True,False,False
4,17.0,8,302.0,140.0,3449,10.5,70,True,False,False
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,True,False,False
394,44.0,4,97.0,52.0,2130,24.6,82,False,False,True
395,32.0,4,135.0,84.0,2295,11.6,82,True,False,False
396,28.0,4,120.0,79.0,2625,18.6,82,True,False,False


In [86]:
df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower        float64
weight              int64
acceleration      float64
model year          int64
origin_america       bool
origin_asia          bool
origin_europe        bool
dtype: object

In [88]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,True,False,False
1,15.0,8,350.0,165.0,3693,11.5,70,True,False,False
2,18.0,8,318.0,150.0,3436,11.0,70,True,False,False
3,16.0,8,304.0,150.0,3433,12.0,70,True,False,False
4,17.0,8,302.0,140.0,3449,10.5,70,True,False,False
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,True,False,False
394,44.0,4,97.0,52.0,2130,24.6,82,False,False,True
395,32.0,4,135.0,84.0,2295,11.6,82,True,False,False
396,28.0,4,120.0,79.0,2625,18.6,82,True,False,False


In [89]:
x = df.drop('mpg', axis = 1)
y = df['mpg']

In [92]:
x.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,8,307.0,130.0,3504,12.0,70,True,False,False
1,8,350.0,165.0,3693,11.5,70,True,False,False
2,8,318.0,150.0,3436,11.0,70,True,False,False
3,8,304.0,150.0,3433,12.0,70,True,False,False
4,8,302.0,140.0,3449,10.5,70,True,False,False


In [93]:
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [94]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=0)

In [95]:
y_test

65     14.0
132    25.0
74     13.0
78     21.0
37     18.0
       ... 
286    17.6
263    17.7
146    28.0
259    20.8
63     14.0
Name: mpg, Length: 100, dtype: float64

In [96]:
model = LinearRegression()
model

In [97]:
model.fit(x_train, y_train)

In [102]:
model.score(x_train,y_train)*100

82.20975257022599

In [101]:
model.score(x_test, y_test)*100

82.09521517724224

In [103]:
model.coef_

array([-0.3238232 ,  0.02488104, -0.02168627, -0.00671939,  0.20518937,
        0.75768197, -1.96630356,  1.30485497,  0.66144859])

In [104]:
model.intercept_

-17.391065915907518

In [108]:
model.predict(x)

array([14.82656052, 13.76486545, 14.91825598, 14.795269  , 14.54707526,
       10.35286884, 10.20525534, 10.14497204,  9.84981739, 12.54743716,
       15.04324559, 13.47075275, 14.46691386, 18.84708667, 23.5470326 ,
       18.74806974, 19.12602251, 20.77026026, 24.82423867, 28.30501328,
       21.49962834, 22.37045177, 23.17251212, 23.12662467, 20.02187543,
        7.24705397,  8.06477765,  7.77350594,  6.46799678, 25.58192064,
       22.64206472, 25.06711801, 23.70414997, 21.06746174, 15.88872395,
       17.35831467, 17.79997357, 17.18595115, 11.15793452, 10.36908345,
       12.12040157, 11.6515163 ,  6.53845172,  8.68524023,  5.92936743,
       19.39613932, 22.78298759, 17.57153152, 18.69004551, 22.26882041,
       25.31232253, 26.2832414 , 25.41158081, 28.75597389, 29.5640236 ,
       28.4580228 , 24.95885903, 25.79661431, 23.79422577, 27.44702887,
       23.25291137, 23.49915945, 11.47885585, 11.76019227, 12.24973658,
       12.94347372, 14.60210296,  9.89449624, 10.47148066, 10.67

In [111]:
import seaborn as sns