In [8]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

## Load and prepare the dataset
### Preparation:
* Fill missing values with zeros.
* Do train/validation/test split with 60%/20%/20% distribution.
* Use the train_test_split function and set the random_state parameter to 1.
* Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [10]:
# Load the dataset
data = "car_fuel_efficiency.csv"
df = pd.read_csv(data)

In [11]:
# --- 1. Fill missing values with zeros ---
df_filled = df.fillna(0)

In [12]:
# Separate features (X) and target (y)
target_column = 'fuel_efficiency_mpg'
y = df_filled[target_column]
X = df_filled.drop(columns=[target_column])

In [13]:
# --- 2. Do train/validation/test split with 60%/20%/20% distribution. ---
# --- 3. Use the train_test_split function and set the random_state parameter to 1. ---

# First split: 80% (train+val) and 20% (test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Second split: 75% of (train+val) for train (0.75 * 0.8 = 0.6) and 25% for validation (0.25 * 0.8 = 0.2)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=1
)

In [14]:
# --- 4. Use DictVectorizer(sparse=True) to turn the dataframes into matrices. ---

# Convert DataFrames to a list of dictionaries (records)
X_train_dict = X_train.to_dict('records')
X_val_dict = X_val.to_dict('records')
X_test_dict = X_test.to_dict('records')

# Initialize DictVectorizer with sparse=True
dv = DictVectorizer(sparse=True)

# Fit and transform the training data
X_train_matrix = dv.fit_transform(X_train_dict)

# Transform validation and test data
X_val_matrix = dv.transform(X_val_dict)
X_test_matrix = dv.transform(X_test_dict)

# Print shapes and data types for verification
print("Shapes of the resulting matrices:")
print(f"X_train_matrix shape: {X_train_matrix.shape}")
print(f"X_val_matrix shape: {X_val_matrix.shape}")
print(f"X_test_matrix shape: {X_test_matrix.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

print("\nData type of the matrices (sparse):")
print(f"X_train_matrix type: {type(X_train_matrix)}")
print(f"X_val_matrix type: {type(X_val_matrix)}")
print(f"X_test_matrix type: {type(X_test_matrix)}")

# Also print number of features for sanity check
print(f"\nNumber of features (columns) after DictVectorizer: {len(dv.feature_names_)}")

# Print initial data info to see the missing values and types before filling
print("\nInitial Data Info (before filling NAs):")
df.info()

# Print a few rows of the data after filling with 0
print("\nFirst 5 rows of data after filling NAs with 0:")
print(df_filled.head().to_markdown(index=False, numalign="left", stralign="left"))

Shapes of the resulting matrices:
X_train_matrix shape: (5822, 14)
X_val_matrix shape: (1941, 14)
X_test_matrix shape: (1941, 14)
y_train shape: (5822,)
y_val shape: (1941,)
y_test shape: (1941,)

Data type of the matrices (sparse):
X_train_matrix type: <class 'scipy.sparse._csr.csr_matrix'>
X_val_matrix type: <class 'scipy.sparse._csr.csr_matrix'>
X_test_matrix type: <class 'scipy.sparse._csr.csr_matrix'>

Number of features (columns) after DictVectorizer: 14

Initial Data Info (before filling NAs):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-nu

### 1. Train a decision tree regressor to predict the fuel_efficiency_mpg variable.

In [16]:
# 1. Train a Decision Tree Regressor with max_depth=1
dt_regressor = DecisionTreeRegressor(max_depth=1, random_state=1)
dt_regressor.fit(X_train_matrix, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
# 2. Extract the feature index used for the split
# The tree_ attribute gives access to the underlying structure.
# feature_[0] gives the index of the feature used at the root (first split).
split_feature_index = dt_regressor.tree_.feature[0]

In [18]:
# 3. Map the feature index back to the feature name
feature_names = dv.feature_names_
split_feature_name = feature_names[split_feature_index]

print(f"Index of the splitting feature: {split_feature_index}")
print(f"Name of the splitting feature: {split_feature_name}")

# Prepare the final answer by checking against the options
options = ['vehicle_weight', 'model_year', 'origin', 'fuel_type']
is_in_options = split_feature_name in options

print(f"Is the feature name in the options? {is_in_options}")
print(f"The feature names mapped by DictVectorizer are: {feature_names}")

Index of the splitting feature: 13
Name of the splitting feature: vehicle_weight
Is the feature name in the options? True
The feature names mapped by DictVectorizer are: ['acceleration', 'drivetrain=All-wheel drive', 'drivetrain=Front-wheel drive', 'engine_displacement', 'fuel_type=Diesel', 'fuel_type=Gasoline', 'horsepower', 'model_year', 'num_cylinders', 'num_doors', 'origin=Asia', 'origin=Europe', 'origin=USA', 'vehicle_weight']


### 2. Train a random forest regressor with these parameters:

* n_estimators=10
* random_state=1
* n_jobs=-1 (optional - to make training faster)

In [20]:
# 1. Train a Random Forest Regressor
rf_regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1  # Use all available cores for faster training
)
rf_regressor.fit(X_train_matrix, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# 2. Make predictions on the validation data
y_val_pred = rf_regressor.predict(X_val_matrix)

# 3. Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_val, y_val_pred)

# 4. Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Print the results
print(f"Mean Squared Error (MSE) on validation data: {mse:.4f}")
print(f"Root Mean Squared Error (RMSE) on validation data: {rmse:.4f}")

Mean Squared Error (MSE) on validation data: 0.2112
Root Mean Squared Error (RMSE) on validation data: 0.4596


### 3. Experiment with the n_estimators parameter

* Try different values of this parameter from 10 to 200 with step 10.
* Set random_state to 1.
* Evaluate the model on the validation dataset.

In [22]:
# Define the range of n_estimators
n_estimators_values = range(10, 201, 10)

In [23]:
# List to store results
scores = []
best_rmse = np.inf
n_after_which_improvement_stops = None
threshold = 0.001  # Improvement must be more than 0.001 to be significant

for n in n_estimators_values:
    # Train the Random Forest Regressor
    rf_regressor = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )
    rf_regressor.fit(X_train_matrix, y_train)

    # Make predictions on the validation data
    y_val_pred = rf_regressor.predict(X_val_matrix)

    # Calculate RMSE, rounded to 4 decimal places for analysis
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_rounded = round(rmse, 4)

    scores.append((n, rmse_rounded))

    # Check for improvement against the best RMSE found so far
    # The condition for *significant* improvement is: current_rmse < best_rmse - threshold
    if rmse < best_rmse - threshold:
        best_rmse = rmse
        # The improvement happened *at* this n_estimators.
        # The next iteration will check if the improvement stops *after* this value.
        n_improving_last = n
    elif rmse >= best_rmse - threshold and n_after_which_improvement_stops is None and best_rmse != np.inf:
        # The improvement is not significant (or it got worse) compared to the best.
        # This means improvement stopped *after* the last n that provided a significant improvement.
        n_after_which_improvement_stops = n_improving_last

# If the loop finished and improvement was still significant at the end,
# use the latest iteration number (200) as the answer.
if n_after_which_improvement_stops is None:
    n_after_which_improvement_stops = n_estimators_values[-1]

print("n_estimators vs. Validation RMSE:")
for n, rmse in scores:
    print(f"n_estimators={n}: RMSE={rmse:.4f}")

print(f"\nThe smallest n_estimators after which the RMSE stopped improving (improvement < {threshold}) is: {n_after_which_improvement_stops}")

n_estimators vs. Validation RMSE:
n_estimators=10: RMSE=0.4596
n_estimators=20: RMSE=0.4536
n_estimators=30: RMSE=0.4517
n_estimators=40: RMSE=0.4487
n_estimators=50: RMSE=0.4467
n_estimators=60: RMSE=0.4455
n_estimators=70: RMSE=0.4451
n_estimators=80: RMSE=0.4450
n_estimators=90: RMSE=0.4449
n_estimators=100: RMSE=0.4447
n_estimators=110: RMSE=0.4436
n_estimators=120: RMSE=0.4439
n_estimators=130: RMSE=0.4437
n_estimators=140: RMSE=0.4434
n_estimators=150: RMSE=0.4429
n_estimators=160: RMSE=0.4428
n_estimators=170: RMSE=0.4428
n_estimators=180: RMSE=0.4424
n_estimators=190: RMSE=0.4425
n_estimators=200: RMSE=0.4425

The smallest n_estimators after which the RMSE stopped improving (improvement < 0.001) is: 60


##### The smallest $\text{n\_estimators}$ after which the RMSE stopped improving (improvement $< 0.001$) is 60.

##### Since $\text{n\_estimators}=60$ is not an option, we must look for the closest option after the stabilization point. The closest option to the point where performance stabilizes is c. $80$, which is already past the point of diminishing returns.

### 4. Best max_depth:

* Try different values of max_depth: [10, 15, 20, 25]
* For each of these values,
   * Try different values of n_estimators from 10 till 200 (with step 10)
   * Calculate the mean RMSE
* Fix the random seed: random_state=1

In [31]:
# Parameters to test
max_depths = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)

results = {}

for max_d in max_depths:
    rmse_scores = []
    for n in n_estimators_range:
        rf_temp = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_d,
            random_state=1,
            n_jobs=-1
        )
        rf_temp.fit(X_train_matrix, y_train)
        y_pred_temp = rf_temp.predict(X_val_matrix)
        rmse_temp = np.sqrt(mean_squared_error(y_val, y_pred_temp))
        rmse_scores.append(rmse_temp)
    
    mean_rmse = np.mean(rmse_scores)
    results[max_d] = mean_rmse
    print(f"max_depth={max_d}: mean RMSE = {mean_rmse:.3f}")

# Find the best max_depth
best_max_depth = min(results, key=results.get)
print(f"\nBest max_depth: {best_max_depth}")

max_depth=10: mean RMSE = 0.442
max_depth=15: mean RMSE = 0.445
max_depth=20: mean RMSE = 0.446
max_depth=25: mean RMSE = 0.446

Best max_depth: 10


### 5. Feature extraction
* Train the model with these parameters:
  * n_estimators=10,
  * max_depth=20,
  * random_state=1,
  * n_jobs=-1 (optional)
* Get the feature importance information from this model

In [24]:
# 1. Train a Random Forest Regressor with specified parameters
rf_regressor_final = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf_regressor_final.fit(X_train_matrix, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# 2. Get the feature importance information
feature_importances = rf_regressor_final.feature_importances_

In [26]:
# 3. Map the feature importance to feature names
feature_names = dv.feature_names_
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
})

In [27]:
# 4. Find the most important feature
most_important_feature = importance_df.sort_values(by='importance', ascending=False).iloc[0]

print("Top 5 most important features:")
print(importance_df.sort_values(by='importance', ascending=False).head(5).to_markdown(index=False, numalign="left", stralign="left"))

print(f"\nMost important feature: {most_important_feature['feature']} with importance {most_important_feature['importance']:.4f}")

Top 5 most important features:
| feature             | importance   |
|:--------------------|:-------------|
| vehicle_weight      | 0.95915      |
| horsepower          | 0.0159979    |
| acceleration        | 0.0114797    |
| engine_displacement | 0.00327279   |
| model_year          | 0.0032123    |

Most important feature: vehicle_weight with importance 0.9591


In [28]:
# 5. Check against the options
options_to_check = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']

# Filter the importance to only include the options
filtered_importance = importance_df[importance_df['feature'].isin(options_to_check)]
most_important_among_options = filtered_importance.sort_values(by='importance', ascending=False).iloc[0]

print("\nMost important feature among the options:")
print(most_important_among_options.to_markdown(index=False, numalign="left", stralign="left"))


Most important feature among the options:
| 13                 |
|:-------------------|
| vehicle_weight     |
| 0.9591499647407432 |


### 6. Train an XGBoost model! For this question, we'll tune the eta parameter:

* Install XGBoost
* Create DMatrix for train and validation
* Create a watchlist
* Train a model with these parameters for 100 rounds

In [36]:
# Note: XGBoost is pre-installed in this environment, so installation is not required.

# 1. Create DMatrix for train and validation
# (Assuming X_train_matrix, y_train, X_val_matrix, y_val are loaded from previous steps)
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dval = xgb.DMatrix(X_val_matrix, label=y_val)

In [37]:
# 2. Create a watchlist
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [38]:
# 3. Define the number of rounds
num_boost_round = 100

In [39]:
# --- 4. Train with eta=0.3 ---

# Define parameters for eta=0.3
xgb_params_0_3 = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Dictionary to store evaluation results for eta=0.3
evals_result_0_3 = {}

# Train the model
model_0_3 = xgb.train(
    xgb_params_0_3,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    evals_result=evals_result_0_3,
    verbose_eval=False  # Suppress output for each round
)

# Get the final validation RMSE
rmse_0_3 = evals_result_0_3['val']['rmse'][-1]
print(f"Final Validation RMSE with eta=0.3 (at 100 rounds): {rmse_0_3:.4f}")

Final Validation RMSE with eta=0.3 (at 100 rounds): 0.4502


In [40]:
# --- 5. Train with eta=0.1 ---

# Define parameters for eta=0.1
xgb_params_0_1 = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Dictionary to store evaluation results for eta=0.1
evals_result_0_1 = {}

# Train the model
model_0_1 = xgb.train(
    xgb_params_0_1,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    evals_result=evals_result_0_1,
    verbose_eval=False # Suppress output for each round
)

# Get the final validation RMSE
rmse_0_1 = evals_result_0_1['val']['rmse'][-1]
print(f"Final Validation RMSE with eta=0.1 (at 100 rounds): {rmse_0_1:.4f}")

Final Validation RMSE with eta=0.1 (at 100 rounds): 0.4262


In [41]:
# --- 6. Comparison ---
if rmse_0_1 < rmse_0_3:
    print("eta=0.1 leads to the best RMSE score.")
elif rmse_0_3 < rmse_0_1:
    print("eta=0.3 leads to the best RMSE score.")
else:
    print("Both give equal value.")

eta=0.1 leads to the best RMSE score.
