In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("cars.csv")

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
# Removing junk data from the dataset
df["normalized-losses"].replace("?",np.nan,inplace=True)
df["normalized-losses"] = df["normalized-losses"].astype(float)
losses_mean = df["normalized-losses"].mean()
df["normalized-losses"].fillna(losses_mean,inplace=True)

In [5]:
df["horsepower"].replace("?",np.nan,inplace=True)
df["horsepower"] = df["horsepower"].astype(float)
horsepower_mean = df["horsepower"].mean()
df["horsepower"].fillna(horsepower_mean,inplace=True)

In [6]:
# Adding Labels to categorical data
df_num = df.select_dtypes(["int64","float64"])
df_cat = df.select_dtypes("object")
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
# Creating new dataframe with changed values of the categorical data
df_new = pd.concat([df_num,df_cat],axis=1)

In [8]:
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [9]:
#Splitting dataset for the baseline model
X = df_new.drop("price",axis=1)
y = df_new["price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [10]:
# Baseline model
lin = LinearRegression()
lin.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
lin.score(X_test,y_test)

0.7965566780397382

### Filter Methods
1. Pearson's Corelation
2. Chi Squared test
3. ANOVA test

In [13]:
#score_func of Chi Squared Test
from sklearn.feature_selection import chi2
# score_func of Annova
from sklearn.feature_selection import f_regression
# class that accepts the score_func parameters
from sklearn.feature_selection import SelectKBest

In [14]:
pc = SelectKBest(score_func=f_regression,k=10)

In [15]:
X_train_pc = pc.fit_transform(X_train,y_train)
X_test_pc = pc.transform(X_test)
lin_PC = LinearRegression()
lin_PC.fit(X_train_pc, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
lin_PC.score(X_test_pc, y_test)

0.7886118767839589

In [19]:
chi = SelectKBest(score_func=chi2,k=7)

# this will throw an error as it works on non-negative values
X_train_chi = chi.fit_transform(X_train,y_train)

X_test_chi = chi.transform(X_test)
lin_chi = LinearRegression()
lin_chi.fit(X_train_chi, y_train)

ValueError: Input X must be non-negative.