In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the Auto MPG table; the path is resolved relative to the working directory.
data = pd.read_csv("auto-mpg.csv")

In [3]:
# Peek at the first five rows to check what was loaded.
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Discard the 'car name' column so it is not carried into the feature matrix.
data = data.drop(columns=['car name'])

In [5]:
# horsepower: flag, row by row, whether the entry consists solely of digits.
temp = data['horsepower'].str.isdigit().to_frame()
# Show the rows whose horsepower entry is not a plain number.
data.loc[temp['horsepower'] == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,?,2046,19.0,71,1
126,21.0,6,200.0,?,2875,17.0,74,1
330,40.9,4,85.0,?,1835,17.3,80,2
336,23.6,4,140.0,?,2905,14.3,80,1
354,34.5,4,100.0,?,2320,15.8,81,2
374,23.0,4,151.0,?,3035,20.5,82,1


In [6]:
# Swap the '?' placeholder for NaN so pandas treats those cells as missing.
data = data.replace(to_replace='?', value=np.nan)
# Re-display the rows flagged by `temp` to confirm the replacement.
data.loc[temp['horsepower'] == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,,2046,19.0,71,1
126,21.0,6,200.0,,2875,17.0,74,1
330,40.9,4,85.0,,1835,17.3,80,2
336,23.6,4,140.0,,2905,14.3,80,1
354,34.5,4,100.0,,2320,15.8,81,2
374,23.0,4,151.0,,3035,20.5,82,1


In [7]:
def medianfiller(x):
    """Return ``x`` with its missing values replaced by the median of ``x``."""
    return x.fillna(x.median())

In [8]:
# 'horsepower' was read as text because of the '?' placeholders. Make it numeric
# *before* imputing: otherwise the median is requested on a column of strings,
# which newer pandas releases reject with a TypeError. Any entry that is not a
# number becomes NaN here and is then filled like every other missing value.
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')
# Fill the gaps of every column with that column's own median.
data = data.apply(medianfiller)

In [9]:
# Cast first, then inspect: a notebook cell only renders its last expression,
# so the row check has to come last to actually be shown.
data['horsepower'] = data['horsepower'].astype('float64')
# The rows flagged by `temp` should now carry the imputed horsepower value.
data[temp['horsepower'] == False]

In [10]:
# Feature matrix: every remaining column except the target 'mpg'.
x = data.drop(columns=['mpg'])

In [11]:
# Target vector: the 'mpg' column.
y = data.loc[:, 'mpg']

In [12]:
# PCA by hand — start by standardising the features.
# StandardScaler (default settings) centres each column and scales it to unit variance.
# NOTE(review): the scaler is fitted on the full table, before the train/test
# split further down — confirm that this is acceptable for the evaluation.
s = StandardScaler()
x_std = s.fit(x).transform(x)

In [13]:
# Covariance between the standardised features (columns are the variables).
cov_matrix = np.cov(x_std, rowvar=False)

In [14]:
# Display the feature-by-feature covariance matrix.
cov_matrix

array([[ 1.00251889,  0.95311615,  0.84340357,  0.89827376, -0.50669259,
        -0.34962425, -0.56396033],
       [ 0.95311615,  1.00251889,  0.89803457,  0.93517383, -0.54505356,
        -0.37109656, -0.61094444],
       [ 0.84340357,  0.89803457,  1.00251889,  0.86461476, -0.68831918,
        -0.41477495, -0.45323458],
       [ 0.89827376,  0.93517383,  0.86461476,  1.00251889, -0.41850885,
        -0.30733654, -0.58248745],
       [-0.50669259, -0.54505356, -0.68831918, -0.41850885,  1.00251889,
         0.28886274,  0.20639158],
       [-0.34962425, -0.37109656, -0.41477495, -0.30733654,  0.28886274,
         1.00251889,  0.18111726],
       [-0.56396033, -0.61094444, -0.45323458, -0.58248745,  0.20639158,
         0.18111726,  1.00251889]])

In [15]:
# Eigen-decomposition of the covariance matrix.
# Column i of `eigenvectors` belongs to eigenvalues[i].
# NOTE(review): cov_matrix is symmetric, so np.linalg.eigh would also apply
# here — consider switching (it returns eigenvalues in ascending order).
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

In [16]:
# Print the eigenvalues of the covariance matrix, in the order np.linalg.eig returned them.
print("eigen values:",eigenvalues)

eigen values: [4.61375285 0.94263114 0.75056808 0.48239402 0.13258963 0.0336822
 0.06201432]


In [17]:
# Print the eigenvector matrix; each column pairs with the eigenvalue at the same position.
print("eigen vectors",eigenvectors)

eigen vectors [[ 0.43855264  0.1144845  -0.02689113 -0.2430079  -0.69425676 -0.45266412
  -0.21884163]
 [ 0.45386128  0.10580212 -0.02465943 -0.16082161 -0.17706048  0.84999716
  -0.0499858 ]
 [ 0.43755724 -0.14144964 -0.17639693 -0.11774532  0.5934737  -0.15543464
  -0.6059578 ]
 [ 0.43219091  0.20336634  0.00434321 -0.33489636  0.33523145 -0.20773606
   0.71003121]
 [-0.29772584  0.48640295  0.54872031 -0.53485615  0.12655828  0.02384225
  -0.26649758]
 [-0.21488225  0.63315133 -0.73994607 -0.02603235  0.02044359  0.00996896
  -0.06503704]
 [-0.29769303 -0.52617954 -0.34484761 -0.70874202 -0.07504026  0.06795576
   0.05504373]]


In [18]:
# Pair every eigenvalue with its eigenvector. The eigenvectors are the columns
# of the matrix, so iterate over the transpose to walk them one by one.
eig_pairs = list(zip(eigenvalues, eigenvectors.T))

In [19]:
# Order the pairs by eigenvalue, smallest first.
# Sort on the eigenvalue only: a bare tuple sort falls through to comparing the
# eigenvector arrays whenever two eigenvalues are equal, and NumPy raises
# "The truth value of an array with more than one element is ambiguous" there.
eig_pairs.sort(key=lambda pair: pair[0])

In [20]:
# Inspect the (eigenvalue, eigenvector) pairs after the sort.
print(eig_pairs)

[(0.033682201103135065, array([-0.45266412,  0.84999716, -0.15543464, -0.20773606,  0.02384225,
        0.00996896,  0.06795576])), (0.06201432270087993, array([-0.21884163, -0.0499858 , -0.6059578 ,  0.71003121, -0.26649758,
       -0.06503704,  0.05504373])), (0.13258963399280732, array([-0.69425676, -0.17706048,  0.5934737 ,  0.33523145,  0.12655828,
        0.02044359, -0.07504026])), (0.4823940178801017, array([-0.2430079 , -0.16082161, -0.11774532, -0.33489636, -0.53485615,
       -0.02603235, -0.70874202])), (0.7505680761984045, array([-0.02689113, -0.02465943, -0.17639693,  0.00434321,  0.54872031,
       -0.73994607, -0.34484761])), (0.9426311428862395, array([ 0.1144845 ,  0.10580212, -0.14144964,  0.20336634,  0.48640295,
        0.63315133, -0.52617954])), (4.613752847052041, array([ 0.43855264,  0.45386128,  0.43755724,  0.43219091, -0.29772584,
       -0.21488225, -0.29769303]))]


In [21]:
# Put the largest eigenvalue first. Sorting in descending order (rather than
# flipping the list in place) yields the same order however many times this
# cell is run, and the key keeps the comparison away from the eigenvector arrays.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

In [22]:
# Inspect the pairs again, now that the largest eigenvalue comes first.
print(eig_pairs)

[(4.613752847052041, array([ 0.43855264,  0.45386128,  0.43755724,  0.43219091, -0.29772584,
       -0.21488225, -0.29769303])), (0.9426311428862395, array([ 0.1144845 ,  0.10580212, -0.14144964,  0.20336634,  0.48640295,
        0.63315133, -0.52617954])), (0.7505680761984045, array([-0.02689113, -0.02465943, -0.17639693,  0.00434321,  0.54872031,
       -0.73994607, -0.34484761])), (0.4823940178801017, array([-0.2430079 , -0.16082161, -0.11774532, -0.33489636, -0.53485615,
       -0.02603235, -0.70874202])), (0.13258963399280732, array([-0.69425676, -0.17706048,  0.5934737 ,  0.33523145,  0.12655828,
        0.02044359, -0.07504026])), (0.06201432270087993, array([-0.21884163, -0.0499858 , -0.6059578 ,  0.71003121, -0.26649758,
       -0.06503704,  0.05504373])), (0.033682201103135065, array([-0.45266412,  0.84999716, -0.15543464, -0.20773606,  0.02384225,
        0.00996896,  0.06795576]))]


In [23]:
# Eigenvalues only, in the order the pairs are currently stored.
eigenvalues_sorted = [value for value, vector in eig_pairs]

In [24]:
# Display the eigenvalues extracted from the ordered pairs.
eigenvalues_sorted

[4.613752847052041,
 0.9426311428862395,
 0.7505680761984045,
 0.4823940178801017,
 0.13258963399280732,
 0.06201432270087993,
 0.033682201103135065]

In [26]:
# Eigenvectors only, kept in the same order as their eigenvalues.
eigenvectors_sorted = [pair[1] for pair in eig_pairs]

In [27]:
# Display the eigenvectors extracted from the ordered pairs.
eigenvectors_sorted

[array([ 0.43855264,  0.45386128,  0.43755724,  0.43219091, -0.29772584,
        -0.21488225, -0.29769303]),
 array([ 0.1144845 ,  0.10580212, -0.14144964,  0.20336634,  0.48640295,
         0.63315133, -0.52617954]),
 array([-0.02689113, -0.02465943, -0.17639693,  0.00434321,  0.54872031,
        -0.73994607, -0.34484761]),
 array([-0.2430079 , -0.16082161, -0.11774532, -0.33489636, -0.53485615,
        -0.02603235, -0.70874202]),
 array([-0.69425676, -0.17706048,  0.5934737 ,  0.33523145,  0.12655828,
         0.02044359, -0.07504026]),
 array([-0.21884163, -0.0499858 , -0.6059578 ,  0.71003121, -0.26649758,
        -0.06503704,  0.05504373]),
 array([-0.45266412,  0.84999716, -0.15543464, -0.20773606,  0.02384225,
         0.00996896,  0.06795576])]

In [28]:
# Build the projection matrix: one eigenvector per row, taken from the front
# of the ordered list.
# NOTE(review): the covariance matrix is 7x7, so taking 7 vectors keeps every
# component and no dimension is actually dropped — confirm the intended count
# (the downstream name `x_std_4_dim` hints at 4).
p_reduce = np.array(eigenvectors_sorted[:7])

In [29]:
# Project the standardised data onto the selected eigenvectors.
x_std_4_dim = x_std @ p_reduce.T

In [30]:
# Wrap the projected array in a DataFrame (default integer column labels).
prj_data = pd.DataFrame(data=x_std_4_dim)

In [31]:
# Peek at the first five projected rows.
prj_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,2.667504,-0.964253,0.55678,0.412127,-0.765829,0.069401,-0.082557
1,3.406706,-1.09311,0.286226,0.260029,-0.242882,-0.29976,0.075157
2,3.018035,-1.220138,0.262181,0.554572,-0.546509,-0.213672,-0.066162
3,2.847389,-1.05848,0.46471,0.383185,-0.477944,-0.30623,-0.171039
4,2.894375,-1.284515,0.21263,0.702075,-0.69259,0.012035,-0.163561


In [32]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    prj_data,
    y,
    test_size=0.30,
    random_state=1,
)

In [33]:
from sklearn.linear_model import LinearRegression

In [34]:
# Fit a linear regression of mpg on the projected features (training rows only).
linearregression = LinearRegression()
linearregression.fit(X=x_train, y=y_train)

LinearRegression()

In [35]:
# Score the fitted model on the held-out test rows.
linearregression.score(X=x_test, y=y_test)

0.8472274567567304

In [36]:
# Score the fitted model on the training rows, for comparison with the test score.
linearregression.score(X=x_train, y=y_train)

0.8081802739111359