In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("car_fuel_efficiency.csv")
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


1. Fill missing values with zeros.
2. Do train/validation/test split with 60%/20%/20% distribution.
3. Use the train_test_split function and set the random_state parameter to 1.
4. Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [6]:
df = df.fillna(0)
df.isna().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.fuel_efficiency_mpg).values
y_val = (df_val.fuel_efficiency_mpg).values
y_test = (df_test.fuel_efficiency_mpg).values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [16]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')

X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)


In [20]:
# Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.
# Train a model with max_depth=1.
# Which feature is used for splitting the data?
from sklearn.tree import export_text
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

feature_names = dv.get_feature_names_out()
root_feature_idx = dt.tree_.feature[0]
root_threshold   = dt.tree_.threshold[0]

print(export_text(dt, feature_names=list(feature_names)))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [25]:
# Train a random forest regressor with these parameters:
# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)
# What's the RMSE of this model on the validation data?
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 0.4586615458484907


In [27]:
# Now let's experiment with the n_estimators parameter
# Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.
# After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

rsme_list = []

for n in range(10, 210, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
    rsme_list.append((n, rmse))

print(sorted(rsme_list, key=lambda x: x[1]))

[(150, np.float64(0.443)), (160, np.float64(0.443)), (170, np.float64(0.443)), (180, np.float64(0.443)), (190, np.float64(0.443)), (200, np.float64(0.443)), (110, np.float64(0.444)), (120, np.float64(0.444)), (130, np.float64(0.444)), (140, np.float64(0.444)), (60, np.float64(0.445)), (70, np.float64(0.445)), (80, np.float64(0.445)), (90, np.float64(0.445)), (100, np.float64(0.445)), (50, np.float64(0.446)), (40, np.float64(0.448)), (30, np.float64(0.451)), (20, np.float64(0.454)), (10, np.float64(0.459))]


In [28]:
# Let's select the best max_depth:
# Try different values of max_depth: [10, 15, 20, 25]
# For each of these values,
# try different values of n_estimators from 10 till 200 (with step 10)
# calculate the mean RMSE
# Fix the random seed: random_state=1
# What's the best max_depth, using the mean RMSE?

mean_list = []

for depth in [10, 15, 20, 25]:
    rmse_list = []
    for n in range(10, 210, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
        rmse_list.append((n, rmse))
    mean_rmse = round(np.mean(rmse_list), 3)
    mean_list.append((depth, mean_rmse))

best_depth, best_rmse = min(mean_list, key=lambda x: x[1])
print(f"Best max_depth = {best_depth}, mean RMSE = {best_rmse}")

Best max_depth = 10, mean RMSE = 52.721


In [29]:
# We can extract feature importance information from tree-based models.
# At each step of the decision tree learning algorithm, it finds the best split. 
# When doing it, we can calculate "gain" - the reduction in impurity before and after the split. 
# This gain is quite useful in understanding what are the important features for tree-based models.
# In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.
# For this homework question, we'll find the most important feature:
# Train the model with these parameters:
# n_estimators=10,
# max_depth=20,
# random_state=1,
# n_jobs=-1 (optional)
# Get the feature importance information from this model
# What's the most important feature (among these 4)?

rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_

feature_names = dv.get_feature_names_out()
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

print(importance_df.head(10))

                feature  importance
13       vehicle_weight    0.959153
6            horsepower    0.016066
0          acceleration    0.011490
3   engine_displacement    0.003279
7            model_year    0.003170
8         num_cylinders    0.002333
9             num_doors    0.001618
12           origin=USA    0.000546
11        origin=Europe    0.000513
10          origin=Asia    0.000454


Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
- Install XGBoost
- Create DMatrix for train and validation
- Create a watchlist
- Train a model with these parameters for 100 rounds:


Now change eta from 0.3 to 0.1.
Which eta leads to the best RMSE score on the validation dataset?

In [31]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [34]:
!pip freeze

anyio==4.10.0
appnope==0.1.4
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.3.0
asttokens==3.0.0
async-lru==2.0.5
attrs==25.3.0
babel==2.17.0
beautifulsoup4==4.13.5
bleach==6.2.0
blinker==1.9.0
certifi==2025.8.3
cffi==2.0.0
charset-normalizer==3.4.3
click==8.3.0
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.16
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.1
fastjsonschema==2.21.2
Flask==3.1.2
fonttools==4.60.0
fqdn==1.5.1
gunicorn==23.0.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
ipykernel==6.30.1
ipython==9.5.0
ipython_pygments_lexers==1.1.1
ipywidgets==8.1.7
isoduration==20.11.0
itsdangerous==2.2.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.2
json5==0.12.1
jsonpointer==3.0.0
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.12.0
jupyter-lsp==2.3.0
jupyter_client==8.6.3
jupyter_core==5.8.1
jupyter_server==2.17.0
jupyter_server_terminals==0.5.3
jupyterlab==4.4.7
jupyterlab_pygments==0.3.0
jupy