In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('jamb_exam_results.csv')  # Update with your file path

# Selecting relevant features and target variable
features = ['Study_Hours_Per_Week', 'Attendance_Rate', 'Teacher_Quality', 'Distance_To_School']
target = 'JAMB_Score'

# Splitting data into train and validation sets
X = data[features]
y = data[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Question 1: Decision Tree with max_depth=1
dt_regressor = DecisionTreeRegressor(max_depth=1, random_state=1)
dt_regressor.fit(X_train, y_train)
split_feature_index = dt_regressor.tree_.feature[0]
split_feature = features[split_feature_index]
print("Question 1 - Feature used for splitting:", split_feature)

# Question 2: Random Forest Regressor with specified parameters
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_regressor.fit(X_train, y_train)
y_pred_val = rf_regressor.predict(X_val)
rmse_q2 = np.sqrt(mean_squared_error(y_val, y_pred_val))
print("Question 2 - RMSE:", round(rmse_q2, 2))

# Question 3: Experiment with n_estimators from 10 to 200
rmse_scores = {}
for n in range(10, 210, 10):
    rf_regressor = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf_regressor.fit(X_train, y_train)
    y_pred_val = rf_regressor.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_scores[n] = rmse
print("Question 3 - RMSE scores by n_estimators:", rmse_scores)

# Question 4: Finding best max_depth with varying n_estimators
mean_rmse_scores = {}
for depth in [10, 15, 20, 25]:
    rmse_values = []
    for n in range(10, 210, 10):
        rf_regressor = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf_regressor.fit(X_train, y_train)
        y_pred_val = rf_regressor.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        rmse_values.append(rmse)
    mean_rmse_scores[depth] = np.mean(rmse_values)
print("Question 4 - Mean RMSE by max_depth:", mean_rmse_scores)

# Question 5: Most important feature from Random Forest
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_regressor.fit(X_train, y_train)
importances = rf_regressor.feature_importances_
most_important_feature = features[np.argmax(importances)]
print("Question 5 - Most important feature:", most_important_feature)


Question 1 - Feature used for splitting: Study_Hours_Per_Week
Question 2 - RMSE: 43.7
Question 3 - RMSE scores by n_estimators: {10: 43.704090960305614, 20: 43.04851905048051, 30: 42.83364041831483, 40: 42.40820026967277, 50: 42.30824089577979, 60: 42.2257491649391, 70: 42.18014291181805, 80: 42.329709067987054, 90: 42.29325717041695, 100: 42.24103202803394, 110: 42.294431447078914, 120: 42.250345344590414, 130: 42.23354460116254, 140: 42.265737055977716, 150: 42.28150262943436, 160: 42.25470024749987, 170: 42.2304964478161, 180: 42.27740769665238, 190: 42.295158951616045, 200: 42.30277083880091}
Question 4 - Mean RMSE by max_depth: {10: 41.194472318902584, 15: 41.964583666313686, 20: 42.383041078309674, 25: 42.4279517848748}
Question 5 - Most important feature: Distance_To_School


In [2]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Preparing DMatrix for train and validation data
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Watchlist for tracking training and validation error
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Function to train XGBoost with given eta and return RMSE
def train_xgboost(eta_value):
    xgb_params = {
        'eta': eta_value,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    }
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        early_stopping_rounds=10,
        verbose_eval=False
    )
    # Predict and calculate RMSE on validation data
    y_pred_val = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    return rmse

# Training with eta = 0.3
rmse_03 = train_xgboost(0.3)
print("RMSE with eta=0.3:", rmse_03)

# Training with eta = 0.1
rmse_01 = train_xgboost(0.1)
print("RMSE with eta=0.1:", rmse_01)

# Determining the best eta
if rmse_03 < rmse_01:
    print("Best eta: 0.3")
elif rmse_01 < rmse_03:
    print("Best eta: 0.1")
else:
    print("Both eta values give equal RMSE")


RMSE with eta=0.3: 41.7023388950155
RMSE with eta=0.1: 41.04660679062348
Best eta: 0.1
