In [8]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,auc,roc_auc_score,root_mean_squared_error
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

Dataset
In this homework, we continue using the fuel efficiency dataset. Download it from here.

You can do it with wget:

wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
The goal of this homework is to create a regression model for predicting the car fuel efficiency (column 'fuel_efficiency_mpg').

Preparing the dataset
Preparation:



In [18]:
df = pd.read_csv("car_fuel_efficiency.csv")
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


checking for missing values 

In [20]:
#Checking for missing values
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

Fill missing values with zeros.

In [22]:
df = df.fillna(0)

In [26]:
# 3. Split into train/validation/test (60% / 20% / 20%)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)  # 0.25 * 0.8 = 0.2

In [28]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
# 4. Initialize DictVectorizer
dv = DictVectorizer(sparse=True)

In [32]:
# 5. Convert dataframes into dictionaries
train_dicts = df_train.to_dict(orient="records")
val_dicts = df_val.to_dict(orient="records")
test_dicts = df_test.to_dict(orient="records")

In [34]:
# 6. Fit DictVectorizer on train set and transform all
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

In [40]:
# 7. Display results
print("Shapes:")
print("X_train:", X_train.shape)
print("X_val:", X_val.shape)
print("X_test:", X_test.shape)

Shapes:
X_train: (5822, 15)
X_val: (1941, 15)
X_test: (1941, 15)


Question 1
Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

Train a model with max_depth=1.
Which feature is used for splitting the data?

'vehicle_weight'
'model_year'
'origin'
'fuel_type'

In [46]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

# 8. Identify numerical and categorical columns
num_cols = df_train.select_dtypes(include=["int64", "float64"]).columns.drop("fuel_efficiency_mpg")
cat_cols = df_train.select_dtypes(include=["object"]).columns

# 9. Define preprocessing pipeline
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols)
])

# 10. Build pipeline with Decision Tree Regressor
dt = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor(max_depth=1, random_state=1))
])

# 11. Train model
X_train_df = df_train.drop(columns=["fuel_efficiency_mpg"])
y_train = df_train["fuel_efficiency_mpg"]
dt.fit(X_train_df, y_train)

# 12. Extract feature used for the split
tree = dt.named_steps["model"]
feature_names = dt.named_steps["preprocessor"].get_feature_names_out()
split_feature = feature_names[tree.tree_.feature[0]]

print("Feature used for splitting:", split_feature)

Feature used for splitting: num__vehicle_weight


Question 2
Train a random forest regressor with these parameters:

n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?

0.045
0.45
4.5
45.0

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
# Reset indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

# . Identify numerical and categorical columns
target = "fuel_efficiency_mpg"
num_cols = df_train.select_dtypes(include=["int64", "float64"]).columns.drop(target)
cat_cols = df_train.select_dtypes(include=["object"]).columns

# . Define preprocessing pipeline
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols)
])

# . Create Random Forest pipeline
rf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=10,
        random_state=1,
        n_jobs=-1
    ))
])

# . Train the model
X_train = df_train.drop(columns=[target])
y_train = df_train[target]
rf.fit(X_train, y_train)

# . Evaluate on validation set
X_val = df_val.drop(columns=[target])
y_val = df_val[target]
y_pred = rf.predict(X_val)

# . Compute RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", round(rmse, 3))

RMSE: 0.459


Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

10
25
80
200
If it doesn't stop improving, use the latest iteration number in your answer.

In [50]:
# Prepare training/validation data
X_train = df_train.drop(columns=[target])
y_train = df_train[target]
X_val = df_val.drop(columns=[target])
y_val = df_val[target]

#  Try different n_estimators values
rmse_scores = {}

for n in range(10, 201, 10):
    rf = Pipeline([
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=n,
            random_state=1,
            n_jobs=-1
        ))
    ])
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores[n] = round(rmse, 3)
    print(f"n_estimators={n:3d} => RMSE: {rmse_scores[n]}")

# Find point where RMSE stops improving
prev_rmse = None
best_n = None
for n, rmse in rmse_scores.items():
    if prev_rmse is not None and rmse >= prev_rmse:
        best_n = n
        break
    prev_rmse = rmse

if best_n:
    print(f"\nRMSE stops improving after n_estimators = {best_n}")
else:
    print("\nRMSE keeps improving up to n_estimators = 200")

n_estimators= 10 => RMSE: 0.459
n_estimators= 20 => RMSE: 0.453
n_estimators= 30 => RMSE: 0.452
n_estimators= 40 => RMSE: 0.448
n_estimators= 50 => RMSE: 0.446
n_estimators= 60 => RMSE: 0.446
n_estimators= 70 => RMSE: 0.445
n_estimators= 80 => RMSE: 0.446
n_estimators= 90 => RMSE: 0.446
n_estimators=100 => RMSE: 0.445
n_estimators=110 => RMSE: 0.444
n_estimators=120 => RMSE: 0.445
n_estimators=130 => RMSE: 0.444
n_estimators=140 => RMSE: 0.444
n_estimators=150 => RMSE: 0.444
n_estimators=160 => RMSE: 0.444
n_estimators=170 => RMSE: 0.443
n_estimators=180 => RMSE: 0.443
n_estimators=190 => RMSE: 0.443
n_estimators=200 => RMSE: 0.443

RMSE stops improving after n_estimators = 60


Question 4
Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1
What's the best max_depth, using the mean RMSE?

10
15
20
25

In [52]:
# Grid search for best max_depth and n_estimators
depth_values = [10, 15, 20, 25]
n_values = range(10, 201, 10)
results = {}

for depth in depth_values:
    rmse_list = []
    for n in n_values:
        rf = Pipeline([
            ("preprocessor", preprocessor),
            ("model", RandomForestRegressor(
                n_estimators=n,
                max_depth=depth,
                random_state=1,
                n_jobs=-1
            ))
        ])
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    
    mean_rmse = np.mean(rmse_list)
    results[depth] = round(mean_rmse, 3)
    print(f"max_depth={depth} => mean RMSE: {mean_rmse:.3f}")

# Find best max_depth
best_depth = min(results, key=results.get)
print("\nBest max_depth:", best_depth)

max_depth=10 => mean RMSE: 0.442
max_depth=15 => mean RMSE: 0.446
max_depth=20 => mean RMSE: 0.446
max_depth=25 => mean RMSE: 0.446

Best max_depth: 10


Question 5
We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

vehicle_weight
horsepower
acceleration
engine_displacement

In [54]:
# 7. Transform training data
X_train_prepared = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

# 8. Train Random Forest model
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train_prepared, y_train)

# 9. Extract feature importances
importances = rf.feature_importances_
feature_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# 10. Display top features
print(feature_importance.head(10))

# 11. Focus on the given 4 features
important_subset = feature_importance[
    feature_importance.index.str.contains("vehicle_weight|horsepower|acceleration|engine_displacement")
]
print("\nFeature importance (selected features):")
print(important_subset)

num__vehicle_weight         0.959189
num__horsepower             0.016063
num__acceleration           0.011412
num__engine_displacement    0.003276
num__model_year             0.003246
num__num_cylinders          0.002362
num__num_doors              0.001572
cat__origin_USA             0.000529
cat__origin_Europe          0.000504
cat__origin_Asia            0.000457
dtype: float64

Feature importance (selected features):
num__vehicle_weight         0.959189
num__horsepower             0.016063
num__acceleration           0.011412
num__engine_displacement    0.003276
dtype: float64


6.Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:

In [69]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB 108.9 kB/s eta 0:11:01
   ---------------------------------------- 0.0/72.0 MB 131.3 kB/s eta 0:09:09
   ---------------------------------------- 0.0/72.0 MB 140.3 kB/s eta 0:08:33
   ---------------------------------------- 0.1/72.0 MB 231.0 kB/s eta 0:05:12
   ---------------------------------------- 0.2/72.0 MB 437.1 kB/s eta 0:02:45
   ---------------------------------------- 0.2/72.0 MB 600.7 kB/s eta 0:02:00
   --------------------------

In [70]:
import xgboost as xgb
print("XGBoost version:", xgb.__version__)


XGBoost version: 3.1.1


In [76]:
# Create DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train_prepared, label=y_train)
dval = xgb.DMatrix(X_val_prepared, label=y_val)

# 8. Create watchlist
watchlist = [(dtrain, "train"), (dval, "val")]

# Define a function to train and evaluate for different eta values
def evaluate_eta(eta_value):
    params = {
        'eta': eta_value,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    }
    
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=100,
        evals=watchlist,
        verbose_eval=False  # suppress training logs
    )
    
    y_pred = model.predict(dval)
    rmse = np.sqrt(((y_val - y_pred) ** 2).mean())
    return rmse

# Compare RMSE for eta = 0.3 and 0.1
rmse_03 = evaluate_eta(0.3)
rmse_01 = evaluate_eta(0.1)

print(f"RMSE for eta=0.3: {rmse_03:.3f}")
print(f"RMSE for eta=0.1: {rmse_01:.3f}")

# 11. Determine which eta performs better
if abs(rmse_03 - rmse_01) < 1e-3:
    print("\n✅ Both give equal value")
elif rmse_03 < rmse_01:
    print("\n✅ eta=0.3 gives better RMSE")
else:
    print("\n✅ eta=0.1 gives better RMSE")

RMSE for eta=0.3: 0.449
RMSE for eta=0.1: 0.427

✅ eta=0.1 gives better RMSE
