In [11]:
import pandas as pd
import numpy as np

In [90]:
df = pd.read_csv('../car_fuel_efficiency.csv')
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [91]:
numerical = ["engine_displacement", "num_cylinders", "horsepower", "vehicle_weight", "acceleration", "model_year", "num_doors"]
categorical = ["origin", "fuel_type", "drivetrain"]

In [92]:
df[numerical] = df[numerical].fillna(0)
df[categorical] = df[categorical].fillna("NA")

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [95]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

In [99]:
y_train_full = df_train_full.fuel_efficiency_mpg.values
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']

In [100]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [101]:
from sklearn.feature_extraction import DictVectorizer

In [102]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [103]:
X_train = dv.transform(train_dict)
X_val =  dv.transform(val_dict)

Q1

In [109]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score

In [119]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [118]:
y_pred = dt.predict(X_val)

In [112]:
from sklearn.tree import export_text

In [113]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



Q2

In [128]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [132]:
model = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)

# Train the model
model.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [135]:
y_pred = model.predict(X_val)

# Calculate evaluation metrics
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

Mean Squared Error (MSE): 0.2089
RMSE: 0.4571


Q3

In [150]:
rmses = []

for i in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=i, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print('%s -> %.3f' % (i, rmse))
    rmses.append(rmse)

10 -> 0.457
20 -> 0.452
30 -> 0.450
40 -> 0.448
50 -> 0.446
60 -> 0.445
70 -> 0.445
80 -> 0.445
90 -> 0.444
100 -> 0.444
110 -> 0.443
120 -> 0.443
130 -> 0.443
140 -> 0.443
150 -> 0.442
160 -> 0.442
170 -> 0.442
180 -> 0.442
190 -> 0.442
200 -> 0.442


Q4

In [155]:
all_rmses = {}

for depth in [10, 15, 20, 25]:
    print('depth: %s' % depth)
    rmses = []

    for i in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=i, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        print('%s -> %.3f' % (i, rmse))
        rmses.append(rmse)

    print("mean")
    print(np.mean(rmses))

    all_rmses[depth] = rmses    
    print()

depth: 10
10 -> 0.451
20 -> 0.448
30 -> 0.445
40 -> 0.443
50 -> 0.442
60 -> 0.441
70 -> 0.441
80 -> 0.441
90 -> 0.441
100 -> 0.441
110 -> 0.440
120 -> 0.441
130 -> 0.440
140 -> 0.440
150 -> 0.440
160 -> 0.440
170 -> 0.440
180 -> 0.440
190 -> 0.440
200 -> 0.440
mean
0.44177370392969184

depth: 15
10 -> 0.457
20 -> 0.452
30 -> 0.450
40 -> 0.448
50 -> 0.446
60 -> 0.445
70 -> 0.444
80 -> 0.445
90 -> 0.444
100 -> 0.444
110 -> 0.443
120 -> 0.443
130 -> 0.443
140 -> 0.443
150 -> 0.442
160 -> 0.442
170 -> 0.442
180 -> 0.442
190 -> 0.442
200 -> 0.442
mean
0.4450240433962002

depth: 20
10 -> 0.458
20 -> 0.453
30 -> 0.452
40 -> 0.449
50 -> 0.447
60 -> 0.446
70 -> 0.445
80 -> 0.445
90 -> 0.445
100 -> 0.445
110 -> 0.444
120 -> 0.444
130 -> 0.444
140 -> 0.444
150 -> 0.443
160 -> 0.443
170 -> 0.443
180 -> 0.443
190 -> 0.443
200 -> 0.443
mean
0.4459547840883615

depth: 25
10 -> 0.457
20 -> 0.452
30 -> 0.451
40 -> 0.448
50 -> 0.447
60 -> 0.445
70 -> 0.445
80 -> 0.445
90 -> 0.445
100 -> 0.445
110 -> 0.4

Q5

In [157]:
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)

# Train the model
model.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [159]:
print(model.feature_importances_)

[1.14536287e-02 3.96650944e-04 3.11283259e-04 3.28170182e-03
 3.69466267e-04 3.01551010e-04 1.60423301e-02 3.16452511e-03
 2.33145848e-03 1.62794233e-03 4.85137573e-04 4.93652614e-04
 5.64584351e-04 9.59176087e-01]


In [165]:
importances = model.feature_importances_

feature_names = dv.get_feature_names_out()

# Print or visualize the importances
for i, importance in enumerate(importances):
    print(f"Feature {feature_names[i]}: {importance:.4f}")

Feature acceleration: 0.0115
Feature drivetrain=All-wheel drive: 0.0004
Feature drivetrain=Front-wheel drive: 0.0003
Feature engine_displacement: 0.0033
Feature fuel_type=Diesel: 0.0004
Feature fuel_type=Gasoline: 0.0003
Feature horsepower: 0.0160
Feature model_year: 0.0032
Feature num_cylinders: 0.0023
Feature num_doors: 0.0016
Feature origin=Asia: 0.0005
Feature origin=Europe: 0.0005
Feature origin=USA: 0.0006
Feature vehicle_weight: 0.9592


Q6

In [171]:
!pip install xgboost
import xgboost as xgb

Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1


In [175]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.feature_names_)

In [176]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [198]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [199]:
%%capture output

model = xgb.train(xgb_params, dtrain,
                  num_boost_round=100,
                  evals=watchlist, verbose_eval=1)

In [200]:
def parse_xgb_output(output):
    tree = []
    aucs_train = []
    aucs_val = []

    for line in output.stdout.strip().split('\n'):
        it_line, train_line, val_line = line.split('\t')

        it = int(it_line.strip('[]'))
        train = float(train_line.split(':')[1])
        val = float(val_line.split(':')[1])

        tree.append(it)
        aucs_train.append(train)
        aucs_val.append(val)

    return tree, aucs_train, aucs_val

In [201]:
tree, _, aucs_val_eta_03 = parse_xgb_output(output)
print(max(aucs_val_eta_03))

1.85444


In [202]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [203]:
%%capture output

model = xgb.train(xgb_params, dtrain,
                  num_boost_round=100,
                  evals=watchlist, verbose_eval=1)

In [204]:
tree, _, aucs_val_eta_01 = parse_xgb_output(output)
print(max(aucs_val_eta_01))

2.34561
