In [1]:
import pandas as pd
import numpy as np

import requests

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

In [3]:
url = "https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv"
response = requests.get(url)

# saving
with open("jamb_exam_results.csv", "wb") as file:
    file.write(response.content)

df = pd.read_csv("jamb_exam_results.csv")
df.head(2)

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1


In [4]:
# cleaning df
df.columns = df.columns.str.lower().str.replace(' ', '_')

strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# remove student id
df.drop(columns=['student_id'], inplace=True)

# missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])
df.fillna(0, inplace=True)

parent_education_level    891
dtype: int64


In [7]:
# unique value of each column
for col in df.columns:
    print(col)
    print(df[col].nunique())

jamb_score
220
study_hours_per_week
41
attendance_rate
51
teacher_quality
5
distance_to_school
201
school_type
2
school_location
2
extra_tutorials
2
access_to_learning_materials
2
parent_involvement
3
it_knowledge
3
age
8
gender
2
socioeconomic_status
3
parent_education_level
4
assignments_completed
5


In [9]:
df.head(2)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,public,urban,yes,yes,high,medium,17,male,low,tertiary,2
1,207,14,88,4,2.7,public,rural,no,yes,high,high,15,male,high,0,1


In [11]:
jamb_scores = df['jamb_score']

# eda 
max_score = jamb_scores.max()
min_score = jamb_scores.min()
mean_score = jamb_scores.mean()
std_score = jamb_scores.std()

print(f"Max: {max_score}")
print(f"Min: {min_score}")
print(f"Mean: {mean_score}")
print(f"Standard Deviation: {std_score}")

Max: 367
Min: 100
Mean: 174.0746
Standard Deviation: 47.6164771966609


In [13]:
# splitting df
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [15]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [17]:
y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

# Remove target from features
del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [19]:
# convert df to dictionaries
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')

# initialize DictVectorizer
dv = DictVectorizer(sparse=True)

# fit and transform 
X_train = dv.fit_transform(train_dict)

# transform validation and test data
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

In [21]:
# Question 1

In [23]:
# initialize model
dt = DecisionTreeRegressor(max_depth=1, random_state=1)

#train
dt.fit(X_train, y_train)

#showing split feature
feature_index = dt.tree_.feature[0]
split_feature = dv.feature_names_[feature_index]

print(f"Feature used to split data: {split_feature}")

Feature used to split data: study_hours_per_week


In [25]:
export_text(dt)

'|--- feature_27 <= 18.50\n|   |--- value: [155.24]\n|--- feature_27 >  18.50\n|   |--- value: [188.59]\n'

In [27]:
# Question 2

In [29]:
# defining model
model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model.fit(X_train, y_train)

# train model
y_pred = model.predict(X_val)

# calculate rmse for the validation set
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)
    
# or rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

rmse_val = rmse(y_val, y_pred)

print(f"The RMSE of this model on validation is: {rmse_val:.3f}")

The RMSE of this model on validation is: 42.137


In [31]:
# Question 3

In [33]:
#list for saving results
results = []

# loop for n_estimators
for n in range(10, 201, 10):
    # model
    model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    
    # train
    model.fit(X_train, y_train)
    
    # predicting and validating
    y_pred_val = model.predict(X_val)
    
    # rmse
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    
    # saving in list
    results.append((n, rmse_val))
    print(f"n_estimators: {n}, RMSE: {rmse_val:.3f}")

# sort and print best parameter
best_n, best_rmse = min(results, key=lambda x: x[1])
print(f"\nBest parameter: n_estimators={best_n} with RMSE={best_rmse:.3f}")


n_estimators: 10, RMSE: 42.137
n_estimators: 20, RMSE: 41.461
n_estimators: 30, RMSE: 41.106
n_estimators: 40, RMSE: 40.917
n_estimators: 50, RMSE: 40.852
n_estimators: 60, RMSE: 40.784
n_estimators: 70, RMSE: 40.677
n_estimators: 80, RMSE: 40.539
n_estimators: 90, RMSE: 40.504
n_estimators: 100, RMSE: 40.517
n_estimators: 110, RMSE: 40.593
n_estimators: 120, RMSE: 40.625
n_estimators: 130, RMSE: 40.651
n_estimators: 140, RMSE: 40.595
n_estimators: 150, RMSE: 40.597
n_estimators: 160, RMSE: 40.604
n_estimators: 170, RMSE: 40.628
n_estimators: 180, RMSE: 40.641
n_estimators: 190, RMSE: 40.631
n_estimators: 200, RMSE: 40.601

Best parameter: n_estimators=90 with RMSE=40.504


In [34]:
# Maximum score in the dataset
max_score = 367  

# Minimum score in the dataset
min_score = 100  

# Mean (average) score in the dataset
mean_score = 174.0746  

# Standard deviation of the scores
std_dev = 47.6165  

# RMSE value from the model
rmse_val = 40.504 
# Compare RMSE to Mean
# RMSE < Mean indicates that the model predictions are, on average, accurate.
# The model's predictions are typically within 40 points of the average score.

# Compare RMSE to Standard Deviation
# RMSE < Standard Deviation suggests that predictions are reasonably close to actual scores
# relative to the variability in the scores.
# Range of scores
score_range = max_score - min_score  # Range = 367 - 100 = 267

# Percentage of RMSE in relation to the score range
rmse_percentage = (rmse_val / score_range) * 100  # RMSE is approximately 15.17% of the score range

# Conclusion:
# The RMSE of 40.504 is low relative to the score range and mean, indicating good model performance.
# Model predictions are reliable, but further tuning could improve accuracy if desired.

In [35]:
# Question 4

In [36]:
# list for results
results = []

# loop for max_depth and storing rmse for each n_estimators
for depth in [10, 15, 20, 25]:
    rmse_list = [] 

    # loop for n_estimators
    for n in range(10, 201, 10):
        # model, train, pred
        model = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        
        # calculate and save rmse for this specific max_depth and n_estimators
        rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
        rmse_list.append(rmse_val)
        print(f"max_depth: {depth}, n_estimators: {n}, RMSE: {rmse_val:.3f}")

    # calculate mean rmse for this max_depth
    mean_rmse = np.mean(rmse_list)
    results.append((depth, mean_rmse))
    print(f"Mean RMSE for max_depth {depth}: {mean_rmse:.3f}")

# best max_depth based on mean rmse
best_depth, best_mean_rmse = min(results, key=lambda x: x[1])
print(f"\nBest max_depth: {best_depth} with Mean RMSE: {best_mean_rmse:.3f}")


max_depth: 10, n_estimators: 10, RMSE: 41.258
max_depth: 10, n_estimators: 20, RMSE: 40.881
max_depth: 10, n_estimators: 30, RMSE: 40.625
max_depth: 10, n_estimators: 40, RMSE: 40.270
max_depth: 10, n_estimators: 50, RMSE: 40.317
max_depth: 10, n_estimators: 60, RMSE: 40.277
max_depth: 10, n_estimators: 70, RMSE: 40.285
max_depth: 10, n_estimators: 80, RMSE: 40.210
max_depth: 10, n_estimators: 90, RMSE: 40.174
max_depth: 10, n_estimators: 100, RMSE: 40.250
max_depth: 10, n_estimators: 110, RMSE: 40.286
max_depth: 10, n_estimators: 120, RMSE: 40.315
max_depth: 10, n_estimators: 130, RMSE: 40.329
max_depth: 10, n_estimators: 140, RMSE: 40.300
max_depth: 10, n_estimators: 150, RMSE: 40.314
max_depth: 10, n_estimators: 160, RMSE: 40.354
max_depth: 10, n_estimators: 170, RMSE: 40.360
max_depth: 10, n_estimators: 180, RMSE: 40.364
max_depth: 10, n_estimators: 190, RMSE: 40.354
max_depth: 10, n_estimators: 200, RMSE: 40.325
Mean RMSE for max_depth 10: 40.392
max_depth: 15, n_estimators: 10, R

In [None]:
# Question 5

In [51]:
# model, train
model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
model.fit(X_train, y_train)

# extracting feature importance
feature_importances = model.feature_importances_
features = dv.get_feature_names_out()
# save list in tuple
feature_importance_pairs = list(zip(features, feature_importances))
# sorting
feature_importance_pairs.sort(key=lambda x: x[1], reverse=True)
# print list
for feature, importance in feature_importance_pairs:
    print(f"{feature}: {importance:.4f}")

# getting the most important one
most_important_feature = max(importance_dict, key=importance_dict.get)
print(f"The most important feature is: {most_important_feature}")

study_hours_per_week: 0.2484
attendance_rate: 0.1497
distance_to_school: 0.1365
teacher_quality: 0.0827
age: 0.0693
assignments_completed: 0.0315
socioeconomic_status=high: 0.0257
parent_involvement=high: 0.0229
it_knowledge=high: 0.0177
parent_education_level=secondary: 0.0170
parent_education_level=primary: 0.0155
parent_education_level=tertiary: 0.0145
extra_tutorials=no: 0.0135
parent_involvement=low: 0.0134
it_knowledge=low: 0.0124
access_to_learning_materials=no: 0.0123
parent_involvement=medium: 0.0115
socioeconomic_status=low: 0.0107
socioeconomic_status=medium: 0.0106
gender=male: 0.0104
access_to_learning_materials=yes: 0.0103
school_location=rural: 0.0096
gender=female: 0.0093
school_location=urban: 0.0092
it_knowledge=medium: 0.0091
extra_tutorials=yes: 0.0091
school_type=private: 0.0090
school_type=public: 0.0084
parent_education_level: 0.0000
The most important feature is: study_hours_per_week


In [105]:
# Question 6

In [109]:
# transform data in dmatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# list
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# param for XGBoost-Model
xgb_params_1 = {
    'eta': 0.3,               # learning rate - controls step size for each boosting step
    'max_depth': 6,           # maximum depth of each tree
    'min_child_weight': 1,    # minimum weight threshold for a split to occur
    'objective': 'reg:squarederror', # objective function: minimize squared error (regression)
    'nthread': 8,             # number of threads (parallel processing)
    'seed': 1,                # random seed for reproducibility
    'verbosity': 1            # verbosity level for training output
}

# train eta=0.3
model_1 = xgb.train(params=xgb_params_1, dtrain=dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# pred and rmse eta=0.3
y_pred_1 = model_1.predict(dval)
rmse_1 = np.sqrt(mean_squared_error(y_val, y_pred_1))
print(f"RMSE for eta=0.3: {rmse_1:.3f}")

# change eta=0.1 
xgb_params_2 = xgb_params_1.copy()
xgb_params_2['eta'] = 0.1

# train model eta=0.1
model_2 = xgb.train(params=xgb_params_2, dtrain=dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# pred and rmse eta=0.1
y_pred_2 = model_2.predict(dval)
rmse_2 = np.sqrt(mean_squared_error(y_val, y_pred_2))
print(f"RMSE for eta=0.1: {rmse_2:.3f}")

# print best eta-option 
best_eta = 0.3 if rmse_1 < rmse_2 else 0.1
print(f"Best eta: {best_eta} with RMSE: {min(rmse_1, rmse_2):.3f}")

[0]	train-rmse:42.69384	eval-rmse:44.89114
[1]	train-rmse:39.83326	eval-rmse:43.07010
[2]	train-rmse:37.94542	eval-rmse:42.00332
[3]	train-rmse:36.56125	eval-rmse:41.46452
[4]	train-rmse:35.44252	eval-rmse:40.88896
[5]	train-rmse:34.57756	eval-rmse:40.69096
[6]	train-rmse:33.84230	eval-rmse:40.59315
[7]	train-rmse:33.25929	eval-rmse:40.47993
[8]	train-rmse:32.79415	eval-rmse:40.45326
[9]	train-rmse:32.16019	eval-rmse:40.43929
[10]	train-rmse:31.63404	eval-rmse:40.48319
[11]	train-rmse:31.17673	eval-rmse:40.68201
[12]	train-rmse:30.87313	eval-rmse:40.63522
[13]	train-rmse:30.30310	eval-rmse:40.70983
[14]	train-rmse:30.00098	eval-rmse:40.78133
[15]	train-rmse:29.41497	eval-rmse:40.86107
[16]	train-rmse:29.25816	eval-rmse:40.96580
[17]	train-rmse:28.59378	eval-rmse:41.12190
[18]	train-rmse:28.27990	eval-rmse:41.14360
RMSE for eta=0.3: 41.228
[0]	train-rmse:45.49999	eval-rmse:47.00533
[1]	train-rmse:44.12948	eval-rmse:45.92344
[2]	train-rmse:42.94858	eval-rmse:44.98366
[3]	train-rmse:41.90