In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression,Ridge
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# Load the raw dataset from the CSV file next to the notebook.
DATA_PATH = "data.csv"
Data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the raw frame.
Data.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
# Keep only the columns used in the rest of the notebook (original CSV names).
kept_columns = [
    "Make", "Model", "Year",
    "Engine HP", "Engine Cylinders",
    "Transmission Type", "Vehicle Style",
    "highway MPG", "city mpg",
    "MSRP",
]
Data = Data[kept_columns]

In [5]:
# Replace every missing value in the frame with 0.
Data = Data.fillna(value=0)

In [6]:
# Normalise column names: spaces become underscores, everything lower-cased.
Data.columns = [name.replace(' ', '_').lower() for name in Data.columns]

In [7]:
# Rename the target column: any 'msrp' occurrence in a column name becomes 'price'.
Data.columns = [name.replace('msrp', 'price') for name in Data.columns]

In [8]:
# Preview the first five rows after column selection and renaming.
Data.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# 1

In [9]:
# Frequency of each transmission type, most common first.
Data["transmission_type"].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

# 2

In [10]:
# Names of the numerical feature columns.
Numerical = [
    "year",
    "engine_hp",
    "engine_cylinders",
    "highway_mpg",
    "city_mpg",
]

In [11]:
# Pairwise correlation matrix of the numerical features.
numeric_features = Data[Numerical]
numeric_features.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


# Data Splitting

In [12]:
# Binary target: 1 when the price is strictly above the mean price, else 0.
mean_price = Data["price"].mean()
Data["above_average"] = (Data["price"] > mean_price).astype(int)

In [13]:
# Classification frame: the nine features plus the binary target (price itself is left out).
classification_columns = [
    "make", "model", "year",
    "engine_hp", "engine_cylinders",
    "transmission_type", "vehicle_style",
    "highway_mpg", "city_mpg",
    "above_average",
]
NewData = Data[classification_columns]

In [14]:
# Two-stage split with a fixed seed: 20% held out as test, then 25% of the
# remaining 80% as validation (i.e. 60/20/20 overall).
split_seed = 42
df_full_train, df_test = train_test_split(NewData, random_state=split_seed, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=split_seed, test_size=0.25)

In [15]:

# Pull the target out of each split as a NumPy array; pop() also removes the
# column from the frame so it cannot be used as a feature.
y_train = df_train.pop('above_average').values
y_val = df_val.pop('above_average').values
y_test = df_test.pop('above_average').values

# 3

In [16]:
def mutual_info_score_round(series):
    """Return the mutual information between `series` and `y_train`, rounded to 2 decimals.

    Relies on the notebook-level `y_train` array, so `series` must be aligned
    row-for-row with the training split.
    """
    score = mutual_info_score(series, y_train)
    return round(score, 2)

In [17]:
# Score every training column against the target, then show them highest first.
mi = df_train.apply(mutual_info_score_round, axis=0)
mi_ranked = mi.sort_values(ascending=False)
mi_ranked

model                0.46
engine_hp            0.36
make                 0.24
engine_cylinders     0.12
vehicle_style        0.08
year                 0.07
city_mpg             0.06
highway_mpg          0.04
transmission_type    0.02
dtype: float64

# 4

In [18]:
# Fit the vectorizer on the training split only.
# df_full_train must not be used here: it still contains the `above_average`
# target column (it was only deleted from df_train/df_val/df_test) and it
# includes the validation rows, so fitting on it would put the target into the
# feature vocabulary and fit preprocessing on validation data.
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Categories not seen during fit are ignored by transform().
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [19]:
# Logistic regression classifier with a fixed seed for reproducibility.
logreg_params = dict(solver='liblinear', C=10, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)

In [20]:
# Train the classifier on the vectorized training split.
model.fit(X=X_train, y=y_train)

In [21]:
# Predicted probability of the positive class (column 1) for each validation row.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [22]:
# Hard class decision: positive when the predicted probability reaches the threshold.
threshold = 0.5
decision = y_pred >= threshold

In [23]:
# Validation accuracy, rounded to two decimals for display.
is_correct = (y_val == decision)
round(is_correct.mean(), 2)

0.95

In [24]:
# Unrounded baseline accuracy (all features); used as the reference for feature elimination.
baseline_hits = (y_val == decision)
Acc = baseline_hits.mean()

# 5 

In [25]:
# Features to exclude one at a time in the elimination experiment.
F_list = [
    "make",
    "model",
    "year",
    "engine_hp",
    "engine_cylinders",
    "transmission_type",
    "vehicle_style",
    "highway_mpg",
    "city_mpg",
]

In [26]:
# Feature elimination: retrain without one feature at a time and report how far
# validation accuracy falls below the all-features baseline `Acc`.

# The split is deterministic (fixed random_state), so it is done once up front
# instead of being repeated for every excluded feature.
df_full_train, df_test = train_test_split(NewData, test_size=0.2,random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25,random_state=42)

y_train = df_train.above_average.values
y_val = df_val.above_average.values

for feature in F_list:

    # Build fresh frames without the target and without the excluded feature;
    # drop() returns new frames, so df_train/df_val stay intact between iterations.
    excluded_columns = ['above_average', feature]
    train_dict = df_train.drop(columns=excluded_columns).to_dict(orient='records')
    val_dict = df_val.drop(columns=excluded_columns).to_dict(orient='records')

    # Fit the vectorizer on the training rows only, so neither the target
    # column nor the validation rows contribute to the feature vocabulary.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    decision = (y_pred >= 0.5)
    print("Acc diff:",Acc - (y_val == decision).mean()," Excluded feature: ",feature)

Acc diff: 0.0020981955518253326  Excluded feature:  make
Acc diff: 0.023080151070079657  Excluded feature:  model
Acc diff: -0.0008392782207302663  Excluded feature:  year
Acc diff: 0.022240872849349502  Excluded feature:  engine_hp
Acc diff: 0.0  Excluded feature:  engine_cylinders
Acc diff: 0.0020981955518253326  Excluded feature:  transmission_type
Acc diff: 0.009232060428031819  Excluded feature:  vehicle_style
Acc diff: 0.0004196391103650221  Excluded feature:  highway_mpg
Acc diff: 0.0004196391103650221  Excluded feature:  city_mpg


# 6

In [27]:
# Regression frame: the nine features plus the raw price as the target.
regression_columns = [
    "make", "model", "year",
    "engine_hp", "engine_cylinders",
    "transmission_type", "vehicle_style",
    "highway_mpg", "city_mpg",
    "price",
]
NewData = Data[regression_columns]

In [28]:
def rmse(y, y_pred):
    """Root mean squared error between targets `y` and predictions `y_pred`.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    residuals = y - y_pred
    mean_squared = (residuals ** 2).mean()
    return np.sqrt(mean_squared)

In [29]:
# Same 60/20/20 split as before, now on the regression frame.
df_full_train, df_test = train_test_split(NewData, test_size=0.2,random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25,random_state=42)

# Regression target is log(1 + price).
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

# Remove the target from the feature frames.
del df_train['price']
del df_val['price']
del df_test['price']


# Fit the vectorizer on the training split only.
# df_full_train must not be used here: it still contains the `price` column
# and the validation rows, so fitting on it would put the target into the
# feature vocabulary and fit preprocessing on validation data.
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Categories not seen during fit are ignored by transform().
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [30]:
# Sweep the Ridge regularisation strength and report validation RMSE for each value.
alphas = [0, 0.01, 0.1, 1, 10]
for alpha_value in alphas:
    model = Ridge(alpha=alpha_value, solver='sag', max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_rmse = round(rmse(y_val, val_pred), 3)
    print("RMSE: ",val_rmse," Alpha: ",alpha_value)
    
    



RMSE:  0.487  Alpha:  0




RMSE:  0.487  Alpha:  0.01




RMSE:  0.487  Alpha:  0.1




RMSE:  0.487  Alpha:  1
RMSE:  0.487  Alpha:  10


