# Data preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


In [3]:
hw = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']
df = df[hw]
df = df.fillna('0')
df.isnull().sum()

Make                 0
Model                0
Year                 0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Vehicle Style        0
highway MPG          0
city mpg             0
MSRP                 0
dtype: int64

In [4]:
df.columns = df.columns.str.replace(" ","_").str.lower()
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [5]:
df.rename(columns={'msrp':'price'}, inplace=True)
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [6]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

# Binary price

In [7]:
mean_price = df.price.mean()
df['above_average'] = (df.price > mean_price).astype(int)
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


# Correlation matrix

In [8]:
df_numerical = df.select_dtypes(include=np.number)
df_numerical

Unnamed: 0,year,highway_mpg,city_mpg,price,above_average
0,2011,26,19,46135,1
1,2011,28,19,40650,1
2,2011,28,20,36350,0
3,2011,28,18,29450,0
4,2011,28,18,34500,0
...,...,...,...,...,...
11909,2012,23,16,46120,1
11910,2012,23,16,56670,1
11911,2012,23,16,50620,1
11912,2013,23,16,50920,1


In [9]:
corr_matrix = df_numerical.corr()
corr_matrix

Unnamed: 0,year,highway_mpg,city_mpg,price,above_average
year,1.0,0.25824,0.198171,0.22759,0.318311
highway_mpg,0.25824,1.0,0.886829,-0.160043,-0.13344
city_mpg,0.198171,0.886829,1.0,-0.157676,-0.149233
price,0.22759,-0.160043,-0.157676,1.0,0.481489
above_average,0.318311,-0.13344,-0.149233,0.481489,1.0


# Validation framework

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
np.random.seed(42)
df_full_train , df_test = train_test_split(df, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, test_size=0.25)

In [32]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [13]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

# Mutual score info

In [14]:
from sklearn.metrics import mutual_info_score

In [17]:
for column in df_train[['make', 'model', 'transmission_type', 'vehicle_style']]:
    print(column)
    print(mutual_info_score(df_train[column], y_train))

make
0.24124550337806772
model
0.465818783986574
transmission_type
0.020424575994791604
vehicle_style
0.08469319196603653


# Logistic regression

In [18]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [19]:
def linear_regression(xi):
    result = w0
    
    for j in range(len(w)):
        result += xi[j] * w[j]
    return result

In [20]:
def logistic_regression(xi):
    result = w0
    for j in range(len(w)):
        result = result + xi[j] * w[j]
    return sigmoid(result)

# One-hot encoding

In [25]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [38]:
dvt = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dvt.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dvt.fit_transform(val_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

len(X_train), len(X_val), len(y_val)

(7148, 2383, 2383)

In [31]:
y_pred = model.predict_proba(X_val)[:, 1]
price_above = (y_pred >= 0.5)

ValueError: X has 766 features, but LogisticRegression is expecting 960 features as input.

In [None]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = price_above.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual
df_pred.correct.mean()