In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge 
from sklearn.metrics import accuracy_score, mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split

In [101]:
df = pd.read_csv("../02_regression/data/data.csv")
print(df.shape)
df.info()  

(11914, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64

In [102]:
cols_to_leave = ["Make","Model","Year","Engine HP","Engine Cylinders","Transmission Type","Vehicle Style","highway MPG","city mpg","MSRP"]
df = df[cols_to_leave]
df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.fillna(0)
df = df.rename(columns={"MSRP".lower(): "price"})

## Q1

In [79]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

## Q2

In [80]:
df[['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']].corr()

Unnamed: 0,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
engine_hp,1.0,0.338714,0.774851,-0.415707,-0.424918
year,0.338714,1.0,-0.040708,0.25824,0.198171
engine_cylinders,0.774851,-0.040708,1.0,-0.614541,-0.587306
highway_mpg,-0.415707,0.25824,-0.614541,1.0,0.886829
city_mpg,-0.424918,0.198171,-0.587306,0.886829,1.0


## Q3

In [81]:
df["above_average"] = (df.price > df.price.mean()).astype(int)
df = df.drop("price", axis=1)
print(df.above_average.value_counts())

0    8645
1    3269
Name: above_average, dtype: int64


In [82]:
df_full_train, dfts = train_test_split(df, test_size=0.2, random_state=42)
dftr, dfvl = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(dftr.shape, dfvl.shape, dfts.shape)

(7148, 10) (2383, 10) (2383, 10)


In [83]:
def my_mutual_info_score(series):
    return f"{mutual_info_score(series, dftr.above_average):.2f}"

dftr[['make', 'model', 'transmission_type', 'vehicle_style']].apply(
    my_mutual_info_score
).sort_values(ascending=False)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: object

## Q4

In [84]:
dv = DictVectorizer(sparse=False)

train_dicts = dftr.drop("above_average", axis=1).to_dict(orient="records")
xtr = dv.fit_transform(train_dicts)

val_dicts = dfvl.drop("above_average", axis=1).to_dict(orient="records")
xvl = dv.transform(val_dicts)

ytr = dftr.above_average.values
yvl = dfvl.above_average.values

print(xtr.shape, ytr.shape, xvl.shape, yvl.shape)

(7148, 943) (7148,) (2383, 943) (2383,) (2383, 943) (2383,)


In [93]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(xtr, ytr)
print(f"{accuracy_score(yvl, model.predict(xvl)):.2f}")

0.93


## Q5

In [98]:
feats = ['year', 'engine_hp', 'transmission_type', 'city_mpg']
stats = {}

for f in feats:
    dv = DictVectorizer(sparse=False)

    train_dicts = dftr.drop(["above_average", f], axis=1).to_dict(orient="records")
    xtr = dv.fit_transform(train_dicts)

    val_dicts = dfvl.drop(["above_average", f], axis=1).to_dict(orient="records")
    xvl = dv.transform(val_dicts)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(xtr, ytr)
    stats[f] = f"{model.score(xvl, yvl):.2f}"


print(stats)

{'year': '0.95', 'engine_hp': '0.93', 'transmission_type': '0.95', 'city_mpg': '0.95'}


## Q6

In [107]:
df.price = df.price.apply(lambda x: np.log(x))

df_full_train, dfts = train_test_split(df, test_size=0.2, random_state=42)
dftr, dfvl = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(dftr.shape, dfvl.shape, dfts.shape)

dv = DictVectorizer(sparse=False)

train_dicts = dftr.drop("price", axis=1).to_dict(orient="records")
xtr = dv.fit_transform(train_dicts)

val_dicts = dfvl.drop("price", axis=1).to_dict(orient="records")
xvl = dv.transform(val_dicts)

ytr = dftr.price.values
yvl = dfvl.price.values

print(xtr.shape, ytr.shape, xvl.shape, yvl.shape)

stats = {}
alphas = [0, 0.01, 0.1, 1, 10]
for alph in alphas:
    ridge = Ridge(solver="sag", alpha=alph)
    ridge.fit(xtr, ytr)
    rmse = mean_squared_error(yvl, ridge.predict(xvl))
    print(alph, rmse)
    stats[alph] = f"{rmse:.3f}"

print(stats)
    

(7148, 10) (2383, 10) (2383, 10)
(7148, 943) (7148,) (2383, 943) (2383,)




0 0.002751007907400734




0.01 0.002750970692017939




0.1 0.002751015207291067




1 0.002751240195538074
10 0.0027534049228147856
{0: '0.003', 0.01: '0.003', 0.1: '0.003', 1: '0.003', 10: '0.003'}


