In [2]:
import pandas as pd

# Load the dataset.
# The path is a raw string so every backslash is taken literally. The previous
# literal mixed escaped separators ('\\') with one bare '\M', which is an
# invalid escape sequence that newer Python versions warn about. The resulting
# path is character-for-character the same as before.
# NOTE(review): this is an absolute, machine-specific location; consider a
# path relative to the notebook so others can re-run it.
df = pd.read_csv(r'E:\HCL_Technical_Lead\SIE_OSIV_HCL_TRAINING\PYTHON_DS_DE\Machine_learning_zoomcamp_DATAtalks\week-06-trees\data\jamb_exam_results.csv')

# Normalise column names: lower-case, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Remove the student_id column, then replace every missing value with 0
df = df.drop(columns=['student_id']).fillna(0)


In [3]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 40% first, then halve that hold-out,
# giving 60% train / 20% validation / 20% test (seeded with random_state=1).
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# Pull the target 'jamb_score' out of each partition as an array
y_train, y_val, y_test = (
    part['jamb_score'].values for part in (df_train, df_val, df_test)
)

# Keep only the feature columns (the target is removed from every partition)
df_train, df_val, df_test = (
    part.drop(columns=['jamb_score']) for part in (df_train, df_val, df_test)
)


In [4]:
from sklearn.feature_extraction import DictVectorizer

# A single vectorizer is shared by all three partitions; it is fitted on the
# training records only and merely applied to validation and test.
dv = DictVectorizer(sparse=True)

# Training partition: rows -> list of dicts -> fit + transform
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Validation partition: transform with the already-fitted vectorizer
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Test partition: same treatment as validation
test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)


In [5]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-1 regression tree, i.e. a tree with a single split
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Map the feature index stored for node 0 of the fitted tree back to its
# column name in the vectorizer's feature list
feature_names = dv.get_feature_names_out()
feature_used = feature_names[dt.tree_.feature[0]]
print("Feature used for splitting:", feature_used)


Feature used for splitting: study_hours_per_week


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline forest: 10 trees, fixed seed, using all available cores
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Validation RMSE = square root of the mean squared error
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", rmse)


RMSE: 43.157758977963624


In [7]:
# Sweep the forest size from 10 to 200 trees in steps of 10, refitting from
# scratch each time, and collect (n_estimators, validation RMSE) pairs.
rmse_values = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values += [(n, rmse)]

# Print every pair; the point where RMSE stops improving is read off this list
print("n_estimators and RMSE values:", rmse_values)


n_estimators and RMSE values: [(10, 43.157758977963624), (20, 41.79043981582391), (30, 41.555818472133225), (40, 41.075631652173044), (50, 40.9571573818301), (60, 40.77368529456223), (70, 40.587805985220214), (80, 40.5027042403498), (90, 40.43492224596255), (100, 40.36491034549687), (110, 40.347525479439874), (120, 40.30191844844362), (130, 40.285789466741), (140, 40.26346078629849), (150, 40.25426440073703), (160, 40.1996656828838), (170, 40.187325737485885), (180, 40.13596272032919), (190, 40.15216599857013), (200, 40.138465594427)]


In [8]:
# For each candidate max_depth, run the same 10..200 (step 10) n_estimators
# sweep and store the mean validation RMSE across that sweep.
depth_rmse = {}
for depth in (10, 15, 20, 25):
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    depth_rmse[depth] = np.mean(rmse_list)

# The best depth is the one with the lowest mean RMSE
best_depth = min(depth_rmse, key=lambda d: depth_rmse[d])
print("Best max_depth:", best_depth)


Best max_depth: 10


In [9]:
# Forest with fixed settings: 10 trees, depth capped at 20, seed 1
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Pair each vectorizer feature name with its importance score and pick the
# name with the largest score
importances = rf.feature_importances_
feature_importances = {
    name: score for name, score in zip(dv.get_feature_names_out(), importances)
}
most_important_feature = max(feature_importances, key=lambda name: feature_importances[name])
print("Most important feature:", most_important_feature)


Most important feature: study_hours_per_week


In [10]:
import xgboost as xgb

# Wrap the train / validation matrices and their targets for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets evaluated during training, each with a display name
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Booster settings for the eta=0.3 run
params_03 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)
# Same settings for the second run; only the learning rate differs (eta=0.1)
params_01 = {**params_03, 'eta': 0.1}

# Up to 100 boosting rounds per run, with early_stopping_rounds=10 and
# per-round evaluation printing switched off
model_03 = xgb.train(
    params_03,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)
model_01 = xgb.train(
    params_01,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)

# Best score recorded by each booster
best_rmse_03 = model_03.best_score
best_rmse_01 = model_01.best_score

# Report both runs side by side
print("Best RMSE with eta=0.3:", best_rmse_03)
print("Best RMSE with eta=0.1:", best_rmse_01)


Best RMSE with eta=0.3: 40.68821969954672
Best RMSE with eta=0.1: 40.166449496198915
