# Feature Selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("cars.csv")

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
df["normalized-losses"].replace("?",np.nan,inplace=True)
df["normalized-losses"] = df["normalized-losses"].astype(float)
losses_mean = df["normalized-losses"].mean()
df["normalized-losses"].fillna(losses_mean,inplace=True)

In [5]:
df["horsepower"].replace("?",np.nan,inplace=True)
df["horsepower"] = df["horsepower"].astype(float)
horsepower_mean = df["horsepower"].mean()
df["horsepower"].fillna(horsepower_mean,inplace=True)

In [6]:
df_num = df.select_dtypes(["int64","float64"])
df_cat = df.select_dtypes("object")

In [7]:
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
df_new = pd.concat([df_num,df_cat],axis=1)

In [9]:
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [10]:
X = df_new.drop("price",axis=1)
y = df_new["price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [11]:
# Baseline model
lin = LinearRegression()

lin.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
lin.score(X_test,y_test)

0.7965566780397383

## Chi2 and Annova Test

In [13]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [14]:
annova = SelectKBest(score_func=f_regression,k=10)

# Perform chi2 on non-negative values
#chi = SelectKBest(score_func=chi2,k=7)

In [15]:
X_train_f = annova.fit_transform(X_train,y_train)

In [16]:
X_test_f = annova.transform(X_test)

In [17]:
lin_annova = LinearRegression()

In [18]:
lin_annova.fit(X_train_f,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
lin_annova.score(X_test_f,y_test)

0.7886118767839588

## Wrapper method (Forward selection) 

In [20]:
def adjusted_r2(k,r2,n):
    adj_r2 = 1 - (1-r2)*((n-1)/(n-(k+1)))
    return adj_r2

In [21]:
columns = []
i = 1
for col in X:
    columns.append(col)
    
    X_new = df_new[columns]
    
    X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.3,random_state=1)
    
    lin = LinearRegression()
    
    lin.fit(X_train,y_train)
    
    r2 = lin.score(X_test,y_test)
    
    n = len(X_test)
    
    adj_r2 = adjusted_r2(i,r2,n)
    
    print("cols: ",i,"r2 score: ",r2,"adjusted r2: ",adj_r2)
    
    i += 1
    

cols:  1 r2 score:  -0.0017837050450488778 adjusted r2:  -0.018480100129133037
cols:  2 r2 score:  0.041612508167990114 adjusted r2:  0.00912479658046439
cols:  3 r2 score:  0.617173756908543 adjusted r2:  0.5973723995072608
cols:  4 r2 score:  0.6183283499764456 adjusted r2:  0.5915443745361961
cols:  5 r2 score:  0.7589560567217415 adjusted r2:  0.737434276071897
cols:  6 r2 score:  0.7741546338326668 adjusted r2:  0.7495169575235032
cols:  7 r2 score:  0.776896144188328 adjusted r2:  0.7479752739905187
cols:  8 r2 score:  0.7777776382689499 adjusted r2:  0.7442346402718103
cols:  9 r2 score:  0.7924572281432338 adjusted r2:  0.7565363637834089
cols:  10 r2 score:  0.7948545416112252 adjusted r2:  0.7546299419271516
cols:  11 r2 score:  0.7972954924014395 adjusted r2:  0.7527005007297561
cols:  12 r2 score:  0.8135463420259217 adjusted r2:  0.7678842217057392
cols:  13 r2 score:  0.7934456703438276 adjusted r2:  0.7375038727286143
cols:  14 r2 score:  0.7965566780397383 adjusted r2: 

## Principal component analysis 

In [22]:
from sklearn.decomposition import PCA

In [23]:
pc = PCA(n_components=4,random_state=1)

In [24]:
X = df_new.drop("price",axis=1)
y = df_new["price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [25]:
X_train_pc = pc.fit_transform(X_train,y_train)
X_test_pc = pc.transform(X_test)

In [26]:
lin = LinearRegression()
lin.fit(X_train_pc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [27]:
lin.score(X_test_pc,y_test)

0.7764931460093277