In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data.csv')
data = data[["Make",
"Model",
"Year",
"Engine HP",
"Engine Cylinders",
"Transmission Type",
"Vehicle Style",
"highway MPG",
"city mpg","MSRP"]]

### Data preparation

In [3]:
data.columns = data.columns.str.replace(' ', '_').str.lower()

In [4]:
data.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [5]:
data.engine_hp.fillna(0,inplace=True)
data.engine_cylinders.fillna(0,inplace=True)

In [6]:
data.rename(columns={'msrp':'price'},inplace=True)

**Question 1**
- `What is the most frequent observation (mode) for the column transmission_type?`

In [7]:
data.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

The most frequent observation is Automatic

**Question 2**<br/>
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset. What are the two features that have the biggest correlation in this dataset?
- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- `highway_mpg` and `city_mpg`

In [8]:
import numpy as np
data[list(data.select_dtypes([np.number]).columns)].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


`highway_mpg` and `city_mpg` have the biggest correlation in the dataset

**Make price binary**
- Now we need to turn the price variable from numeric into a binary format.
- Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [9]:
mean_price = data['price'].mean()
data['above_average'] = np.where(data['price'] > mean_price,1,0)

**Split the data**
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (price) is not in your dataframe.

In [10]:
data_backup = data.copy()
data.drop('price',axis=1,inplace=True)

In [11]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

**Question 3**
- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).<br/>

`Which of these variables has the lowest mutual information score?`

In [14]:
cat_columns = ["make"
,"model"
,"transmission_type"
,"vehicle_style"]

In [15]:
import sys
from sklearn.metrics import mutual_info_score
minimum = sys.maxsize
index = None
mutual_scores=[]
for i,u in enumerate(cat_columns):
    mutual_score = round(mutual_info_score(X_train.loc[:,u],y_train),2)
    mutual_scores.append(mutual_score)
    if minimum > mutual_score:
        minimum = mutual_score
        index=i  

In [16]:
mutual_scores

[0.24, 0.46, 0.02, 0.08]

In [16]:
minimum,cat_columns[index]

(0.02, 'transmission_type')

`transmission_type` has the lowest mutual information score

**Question 4**
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [17]:
from sklearn.feature_extraction import DictVectorizer
train_dict = X_train[list(X_train.columns)].to_dict(orient='records')
#train_dict

In [18]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [19]:
train_x = dv.transform(train_dict)

In [20]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [21]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(train_x, y_train)

In [22]:
val_dict = X_val[list(X_val.columns)].to_dict(orient='records')
val_x = dv.transform(val_dict)

In [23]:
y_pred_val = model.predict(val_x)

In [24]:
from sklearn.metrics import accuracy_score
round(accuracy_score(y_val,y_pred_val),2)

0.94

Accuracy is coming close to ~0.94 closest option is 0.95

**Question 5**
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
<br/><br/>Which of following feature has the smallest difference?<br/><br/>

- `year`
- `engine_hp`
- `transmission_type`
- `city_mpg`

In [25]:
def train_val_pipeline(feature_to_be_removed):
    train_dict = X_train[list(set(list(X_train.columns))-set([feature_to_be_removed]))].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    train_x = dv.transform(train_dict)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(train_x, y_train)
    val_dict = X_val[list(set(list(X_val.columns))-set([feature_to_be_removed]))].to_dict(orient='records')
    val_x = dv.transform(val_dict)
    y_pred_val = model.predict(val_x)
    return abs(0.94-round(accuracy_score(y_val,y_pred_val),2))

In [26]:
features_to_be_removed = ['year','engine_hp','transmission_type','city_mpg']
for feature_to_be_removed in features_to_be_removed:
    print(feature_to_be_removed)
    print(train_val_pipeline(feature_to_be_removed))

year
0.010000000000000009
engine_hp
0.009999999999999898
transmission_type
0.010000000000000009
city_mpg
0.0


`city_mpg` has the smallest difference of 0.0

**Question 6**
- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.
<br/><br/>Which of these alphas leads to the best RMSE on the validation set?<br/><br/>

- 0
- 0.01
- 0.1
- 1
- 10

In [30]:
X = data_backup.iloc[:,:-2]
y = data_backup.iloc[:,-2]

0        46135
1        40650
2        36350
3        29450
4        34500
         ...  
11909    46120
11910    56670
11911    50620
11912    50920
11913    28995
Name: price, Length: 11914, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)
y_test = np.log1p(y_test)

In [36]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
alphas = [0,0.01,0.1,1,10]
minimum = sys.maxsize
rmses=[]
index = None
for i,alpha in enumerate(alphas):
    train_dict = X_train[list(X_train.columns)].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    train_x = dv.transform(train_dict)
    ridge_model = Ridge(alpha=alpha,solver='sag',random_state=42)
    ridge_model.fit(train_x,y_train)
    val_dict = X_val[list(X_val.columns)].to_dict(orient='records')
    val_x = dv.transform(val_dict)
    y_pred_val = ridge_model.predict(val_x)
    rmse = round(mean_squared_error(y_val,y_pred_val,squared=False),3)
    rmses.append(rmse)
    if minimum > rmse:
        minimum = rmse
        index = i
print(alphas[i])



10




In [37]:
rmses

[0.483, 0.483, 0.483, 0.483, 0.483]

In [35]:
minimum

0.483

Alpha value 0 leads to the best RMSE on the validation set