In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
!wget = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

--2025-11-02 18:52:21--  http://=/
Resolving = (=)... failed: Name or service not known.
wget: unable to resolve host address ‘=’
--2025-11-02 18:52:21--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-02 18:52:21 (56.5 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]

FINISHED --2025-11-02 18:52:21--
Total wall clock time: 0.09s
Downloaded: 1 files, 854K in 0.01s (56.5 MB/s)


In [2]:
#data preperation and cleanup
df= pd.read_csv('car_fuel_efficiency.csv')
df.dtypes
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols]= df[num_cols].fillna(0.0)
df.isna().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [3]:
#split the data for train,test and validation
#60% train ,20% validation, 20% test, random_state=1
df_train_data_full, df_test_data = train_test_split(df, test_size=0.2, random_state=1)
df_train_data, df_val_data = train_test_split(df_train_data_full, test_size=0.25,random_state=1)  
#extract and remove the target values from the data split
y_train= df_train_data.fuel_efficiency_mpg.values
y_val= df_val_data.fuel_efficiency_mpg.values
y_test = df_test_data.fuel_efficiency_mpg.values
del df_train_data['fuel_efficiency_mpg']
del df_test_data['fuel_efficiency_mpg']
del df_val_data['fuel_efficiency_mpg']

In [4]:
df_train_data.shape, df_val_data.shape, df_test_data.shape

((5822, 10), (1941, 10), (1941, 10))

In [12]:
#use dictvictorizer to create a sparse matrix for training a decision tree regression
dv = DictVectorizer(sparse=True)
train_dict = df_train_data.to_dict(orient='records')
val_dict = df_val_data.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
X_val = dv.fit_transform(val_dict)

In [6]:
#use decision tree classifier to train the model
dt =DecisionTreeRegressor(max_depth=1)
dt.fit(X_train,y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [9]:
#get the feature name for the 'Decision stump' we created
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [None]:
# the feature that is being used in th decision tree is "vehicle_weight"

In [7]:
#train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=10,random_state=1, n_jobs =-1)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_val)

In [21]:
#find the RMS values between y_pred And y_val
def RMSE_func(y_target,y_pred):
    se = (y_target-y_pred)**2
    mse = se.mean()
    return np.sqrt(mse)
RMS = RMSE_func(y_val,y_pred)
print('RMSE:',round(RMS,3))

RMSE: 0.46


In [13]:
# experiment with the n_estimators values
estimates = np.arange(10,200,10)
def RandomForest_func(estimate):
    rf = RandomForestRegressor(n_estimators= estimate,random_state=1,n_jobs=-1)
    rf.fit(X_train,y_train)
    y_pred= rf.predict(X_val)
    RMS = RMSE_func(y_val,y_pred)
    print('RMSE:',round(RMS,3))
    
for estimate in estimates:
    print('estimate:', estimate)
    RandomForest_func(estimate)

estimate: 10
RMSE: 0.46
estimate: 20
RMSE: 0.454
estimate: 30
RMSE: 0.452
estimate: 40
RMSE: 0.449
estimate: 50
RMSE: 0.447
estimate: 60
RMSE: 0.445
estimate: 70
RMSE: 0.445
estimate: 80
RMSE: 0.445
estimate: 90
RMSE: 0.445
estimate: 100
RMSE: 0.445
estimate: 110
RMSE: 0.444
estimate: 120
RMSE: 0.444
estimate: 130
RMSE: 0.444
estimate: 140
RMSE: 0.443
estimate: 150
RMSE: 0.443
estimate: 160
RMSE: 0.443
estimate: 170
RMSE: 0.443
estimate: 180
RMSE: 0.442
estimate: 190
RMSE: 0.442


In [None]:
#given apporximately around 200, the RMSE values started to flatten and not imporve

In [None]:
#explore different max_depth parameters and comare the RMSE values

In [15]:
max_depths =[10,15,20,25]
def Randomforest_func_with_depth(estimate,max_depth):
    rf = RandomForestRegressor(n_estimators= estimate, max_depth=max_depth, random_state =1, n_jobs=-1)
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_val)
    RMSE = RMSE_func(y_val,y_pred)
    print('RMSE:',round(RMSE,3))
for max_depth in max_depths:
    for estimate in estimates:
        print('estimate:','max_depth', estimate, max_depth)
        Randomforest_func_with_depth(estimate,max_depth)

estimate: max_depth 10 10
RMSE: 0.45
estimate: max_depth 20 10
RMSE: 0.447
estimate: max_depth 30 10
RMSE: 0.445
estimate: max_depth 40 10
RMSE: 0.443
estimate: max_depth 50 10
RMSE: 0.442
estimate: max_depth 60 10
RMSE: 0.442
estimate: max_depth 70 10
RMSE: 0.441
estimate: max_depth 80 10
RMSE: 0.441
estimate: max_depth 90 10
RMSE: 0.442
estimate: max_depth 100 10
RMSE: 0.441
estimate: max_depth 110 10
RMSE: 0.441
estimate: max_depth 120 10
RMSE: 0.441
estimate: max_depth 130 10
RMSE: 0.441
estimate: max_depth 140 10
RMSE: 0.44
estimate: max_depth 150 10
RMSE: 0.44
estimate: max_depth 160 10
RMSE: 0.44
estimate: max_depth 170 10
RMSE: 0.44
estimate: max_depth 180 10
RMSE: 0.44
estimate: max_depth 190 10
RMSE: 0.44
estimate: max_depth 10 15
RMSE: 0.458
estimate: max_depth 20 15
RMSE: 0.453
estimate: max_depth 30 15
RMSE: 0.451
estimate: max_depth 40 15
RMSE: 0.449
estimate: max_depth 50 15
RMSE: 0.446
estimate: max_depth 60 15
RMSE: 0.445
estimate: max_depth 70 15
RMSE: 0.445
estimate:

In [None]:
#0.441 is the minimum RMSE which is achieved at max_depth =10

In [9]:
# train a decision tree model with specific parameter and find the feature importance 
#n_estimators=10,max_depth=20,random_state=1,n_jobs=-1 (optional)
rf = RandomForestRegressor(n_estimators=10,max_depth =20,random_state=1, n_jobs =-1)
rf.fit(X_train,y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
#get feature importances
importances = rf.feature_importances_
# show as dataframe
feature_importance_df = pd.DataFrame({
    'Feature': dv.get_feature_names_out(),
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                         Feature  Importance
13                vehicle_weight    0.959150
6                     horsepower    0.015998
0                   acceleration    0.011480
3            engine_displacement    0.003273
7                     model_year    0.003212
8                  num_cylinders    0.002343
9                      num_doors    0.001635
12                    origin=USA    0.000540
11                 origin=Europe    0.000519
10                   origin=Asia    0.000462
5             fuel_type=Gasoline    0.000360
1     drivetrain=All-wheel drive    0.000357
2   drivetrain=Front-wheel drive    0.000345
4               fuel_type=Diesel    0.000325


In [None]:
# according to the featureImportance dataframe, vehicle weight is the most important dataset

In [26]:
#Train XGBoost Model and do performance tuning with the eta 0.3 perimeter 
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(X_train, label= y_train, feature_names = features)
dval = xgb.DMatrix(X_val, label =y_val, feature_names = features)
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params,dtrain,num_boost_round=100)
y_predict = model.predict(dval)
round(RMSE_func(y_val,y_predict),3)

np.float64(0.45)

In [27]:
#Train XGBoost Model and do performance tuning with the eta 0.1 perimeter 
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(X_train, label= y_train, feature_names = features)
dval = xgb.DMatrix(X_val, label =y_val, feature_names = features)
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params,dtrain,num_boost_round=100)
y_predict = model.predict(dval)
round(RMSE_func(y_val,y_predict),3)

np.float64(0.426)

In [None]:
# According to the values calculated eta 0.1 gives a better RMSE values