In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

In [33]:
df = pd.read_csv('../data/car_fuel_efficiency.csv')


In [34]:
# Preparing the dataset
# Preparation:

# Fill missing values with zeros.
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
df = df.fillna(0)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

dv = DictVectorizer(sparse=True)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
test_dicts = df_test.to_dict(orient='records')
X_test = dv.transform(test_dicts)
print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.mean(), y_val.mean(), y_test.mean())

(5822, 14) (1941, 14) (1941, 14)
14.993604027528097 14.95391385579406 14.991494510134533


In [35]:
# Question 1
# Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

# Train a model with max_depth=1.
# Which feature is used for splitting the data?


dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

# Get feature names from DictVectorizer and find the most important feature
feature_names = dv.get_feature_names_out()
most_important_feature = feature_names[dt.feature_importances_.argmax()]
print(f"Most important feature: {most_important_feature}")
print(f"Feature importance: {dt.feature_importances_.max():.4f}")

Most important feature: vehicle_weight
Feature importance: 1.0000


In [36]:
# Question 2
# Train a random forest regressor with these parameters:

# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_val_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {rmse:.2f}")

Validation RMSE: 0.46


In [37]:
# Question 3
# Now let's experiment with the n_estimators parameter

# Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.
# After which value of n_estimators does RMSE stop improving?
# Consider 3 decimal places for calculating the answer.


best_rmse = float('inf')
best_n_estimators = 0
for n_estimators in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_val_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"n_estimators: {n_estimators}, Validation RMSE: {rmse:.3f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_n_estimators = n_estimators
    else:
        break
print(f"Best n_estimators: {best_n_estimators} with RMSE: {best_rmse:.3f}")





n_estimators: 10, Validation RMSE: 0.460
n_estimators: 20, Validation RMSE: 0.454
n_estimators: 20, Validation RMSE: 0.454
n_estimators: 30, Validation RMSE: 0.452
n_estimators: 30, Validation RMSE: 0.452
n_estimators: 40, Validation RMSE: 0.449
n_estimators: 40, Validation RMSE: 0.449
n_estimators: 50, Validation RMSE: 0.447
n_estimators: 50, Validation RMSE: 0.447
n_estimators: 60, Validation RMSE: 0.445
n_estimators: 60, Validation RMSE: 0.445
n_estimators: 70, Validation RMSE: 0.445
n_estimators: 70, Validation RMSE: 0.445
n_estimators: 80, Validation RMSE: 0.445
n_estimators: 80, Validation RMSE: 0.445
n_estimators: 90, Validation RMSE: 0.445
n_estimators: 90, Validation RMSE: 0.445
n_estimators: 100, Validation RMSE: 0.445
n_estimators: 100, Validation RMSE: 0.445
n_estimators: 110, Validation RMSE: 0.444
n_estimators: 110, Validation RMSE: 0.444
n_estimators: 120, Validation RMSE: 0.444
Best n_estimators: 110 with RMSE: 0.444
n_estimators: 120, Validation RMSE: 0.444
Best n_esti

In [None]:
# Question 4
# Let's select the best max_depth:

# Try different values of max_depth: [10, 15, 20, 25]
# For each of these values,
# try different values of n_estimators from 10 till 200 (with step 10)
# calculate the mean RMSE
# Fix the random seed: random_state=1
# What's the best max_depth, using the mean RMSE?


best_overall_rmse = float('inf')
best_overall_max_depth = 0
for max_depth in [10, 15, 20, 25]:
    for n_estimators in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_val_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        if rmse < best_overall_rmse:
            best_overall_rmse = rmse
            best_overall_max_depth = max_depth
print(f"Best max_depth: {best_overall_max_depth} with RMSE: {best_overall_rmse:.3f}")

In [38]:
# Question 5
# We can extract feature importance information from tree-based models.

# At each step of the decision tree learning algorithm, it finds the best split.
# When doing it, we can calculate "gain" - the reduction in impurity before and after the split.
# This gain is quite useful in understanding what are the important features for tree-based models.

# In Scikit-Learn, tree-based models contain this information in the
# feature_importances_
# field.

# For this homework question, we'll find the most important feature:

# Train the model with these parameters:
# n_estimators=10,
# max_depth=20,
# random_state=1,
# n_jobs=-1 (optional)
# Get the feature importance information from this model
# What's the most important feature (among these 4)?



rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_
most_important_feature = feature_names[importances.argmax()]
print(f"Most important feature: {most_important_feature}")


Most important feature: vehicle_weight


In [39]:
# Question 6
# Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

# Install XGBoost
# Create DMatrix for train and validation
# Create a watchlist
# Train a model with these parameters for 100 rounds:
# xgb_params = {
#     'eta': 0.3, 
#     'max_depth': 6,
#     'min_child_weight': 1,
    
#     'objective': 'reg:squarederror',
#     'nthread': 8,
    
#     'seed': 1,
#     'verbosity': 1,
# }
# Now change eta from 0.3 to 0.1.

# Which eta leads to the best RMSE score on the validation dataset?

# 0.3
# 0.1
# Both give equal value


dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]
for eta in [0.3, 0.1]:
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)
    y_val_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"eta: {eta}, Validation RMSE: {rmse:.3f}")


[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[11]	train-rmse:0.36361	val-rmse:0.