In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:
null_cols = df.columns[df.isnull().any()].tolist()
print(null_cols)

['num_cylinders', 'horsepower', 'acceleration', 'num_doors']


In [4]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
numeric = ['engine_displacement', 'horsepower', 'vehicle_weight', 'num_doors', 'num_cylinders']
na_cols = ['num_doors', 'num_cylinders', 'horsepower', 'acceleration']
for c in na_cols:
    df[c] = df[c].fillna(0)

In [6]:
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [9]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [10]:
# y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
# y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
# y_test = np.log1p(df_test.fuel_efficiency_mpg.values)
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

In [11]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [13]:
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [14]:
dt = DecisionTreeRegressor(max_depth=1)

In [15]:
y_train

array([15.3014754 , 15.33121466, 15.33667895, ..., 15.18828665,
       17.3967514 , 16.16090373], shape=(5822,))

In [16]:
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [18]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [19]:
from sklearn.metrics import root_mean_squared_error

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
rt = RandomForestRegressor(random_state=1, n_estimators=10, n_jobs=-1)

In [22]:
rt.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
y_pred = rt.predict(X_val)
y_pred

array([18.6542484 , 15.26730256, 18.09860353, ..., 14.83453826,
       13.47299213, 16.19681562], shape=(1941,))

In [24]:
rmse = root_mean_squared_error(y_val, y_pred)
rmse

0.45866154584849067

In [25]:
scores = []
for ne in range(10,210,10):
    rt = RandomForestRegressor(random_state=1, n_estimators=ne)
    rt.fit(X_train, y_train)
    y_pred = rt.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(ne, round(rmse, 3))
    scores.append(rmse)

print(scores)


10 0.459
20 0.454
30 0.451
40 0.448
50 0.446
60 0.445
70 0.445
80 0.445
90 0.445
100 0.445
110 0.444
120 0.444
130 0.444
140 0.444
150 0.443
160 0.443
170 0.443
180 0.443
190 0.443
200 0.443
[0.45866154584849067, 0.45367991021440807, 0.45117160299870146, 0.4483573590280684, 0.4461792293825761, 0.44529984913635234, 0.44467446277442296, 0.4449936429447336, 0.44520450225183406, 0.4448958321700875, 0.4437175613239899, 0.44410058154615867, 0.4437729440618521, 0.4435015066774846, 0.4430200410161778, 0.44278873172443844, 0.4428940618484624, 0.44254812253330855, 0.442606534659465, 0.44251983445804555]


In [26]:
max_depths = [5, 10, 15, 20, 25]
final_ne = 10
final_rmse = 1000.0
final_depth = 1
for ne in range(10,210,10):
    best_depth = 1
    best_rmse = 1000.0
    print("ne:", ne)
    for depth in max_depths:
        rt = RandomForestRegressor(max_depth=depth, random_state=1, n_estimators=ne)
        rt.fit(X_train, y_train)
        y_pred = rt.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        rmse = round(rmse,3)
        if (rmse < best_rmse):
            best_rmse = rmse
            best_depth = depth
    if best_rmse < final_rmse:
        final_ne = ne
        final_depth = best_depth
        final_rmse = best_rmse
    #print("ne:", ne, "depth:", best_depth, "rmse:", best_rmse)
print(final_ne, final_depth, final_rmse)
    

ne: 10
ne: 20
ne: 30
ne: 40
ne: 50
ne: 60
ne: 70
ne: 80
ne: 90
ne: 100
ne: 110
ne: 120
ne: 130
ne: 140
ne: 150
ne: 160
ne: 170
ne: 180
ne: 190
ne: 200
140 10 0.44


In [27]:
estimators=10
depth=20
rt = RandomForestRegressor(random_state=1, n_estimators=estimators, max_depth=20, n_jobs=-1)
rt.fit(X_train, y_train)
y_pred = rt.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)
print(ne, round(rmse, 3))

200 0.459


In [28]:
rt.feature_importances_

array([1.14896605e-02, 3.51259186e-04, 3.39119766e-04, 3.27947028e-03,
       3.36412907e-04, 3.50440533e-04, 1.60658310e-02, 3.17018227e-03,
       2.33341526e-03, 1.61769744e-03, 4.53975778e-04, 5.12930005e-04,
       5.46431327e-04, 9.59153174e-01])

In [29]:
np.shape(rt.feature_importances_)

(14,)

In [30]:
rt.n_features_in_

14

In [31]:
rt.n_features_in_

14

In [32]:
df_train.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors'],
      dtype='object')

In [33]:
# feature_names = df_train.columns
# print(np.shape(feature_names))
# importances = rt.feature_importances_
# print(np.shape(importances))
# feature_importance_df = pd.DataFrame({
#     'feature': feature_names,
#     'importance': importances
# }).sort_values(by='importance', ascending=False)
print(np.round(np.array(rt.feature_importances_), 3))
print(type(rt.feature_importances_))

[0.011 0.    0.    0.003 0.    0.    0.016 0.003 0.002 0.002 0.    0.001
 0.001 0.959]
<class 'numpy.ndarray'>


In [34]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

dv = DictVectorizer(sparse=True)

X_train_dicts = df_train.to_dict(orient='records')
X_val_dicts = df_val.to_dict(orient='records')

print(np.shape(X_train_dicts))
print(np.shape(df_train))
      
X_train = dv.fit_transform(X_train_dicts)
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))
print(np.shape(X_train))
X_val = dv.transform(X_val_dicts)   # <-- use transform only, not fit_transform

model = RandomForestRegressor(random_state=1, n_estimators=10, max_depth=20, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)


(5822,)
(5822, 10)
|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]

(5822, 14)


In [35]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 58220 stored elements and shape (5822, 14)>

In [36]:
model.feature_importances_

array([1.14896605e-02, 3.51259186e-04, 3.39119766e-04, 3.27947028e-03,
       3.36412907e-04, 3.50440533e-04, 1.60658310e-02, 3.17018227e-03,
       2.33341526e-03, 1.61769744e-03, 4.53975778e-04, 5.12930005e-04,
       5.46431327e-04, 9.59153174e-01])

In [37]:
np.shape(X_train)

(5822, 14)

In [38]:
# Get expanded feature names and importances
feature_names = dv.get_feature_names_out()
importances = model.feature_importances_

# Combine into a DataFrame
fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

In [39]:
fi_df

Unnamed: 0,feature,importance
0,acceleration,0.01149
1,drivetrain=All-wheel drive,0.000351
2,drivetrain=Front-wheel drive,0.000339
3,engine_displacement,0.003279
4,fuel_type=Diesel,0.000336
5,fuel_type=Gasoline,0.00035
6,horsepower,0.016066
7,model_year,0.00317
8,num_cylinders,0.002333
9,num_doors,0.001618


In [41]:
!uv pip install xgboost

[2mUsing Python 3.13.7 environment at: zoomenv[0m
[2mAudited [1m1 package[0m [2min 20ms[0m[0m


In [42]:
import xgboost as xgb

In [52]:
features = dv.get_feature_names_out().tolist()
print(type(features))
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

<class 'list'>


In [53]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [54]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [55]:
y_pred = model.predict(dval)

In [60]:
rmse = root_mean_squared_error(y_val, y_pred)
rmse

0.45017755678087246

In [61]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
rmse = root_mean_squared_error(y_val, y_pred)
rmse

0.42622800553359225