In [41]:
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [42]:
# Load Students_Performance_data_set.xlsx.
# The workbook location can be overridden through the STUDENTS_DATA_PATH
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset, the original local path is used unchanged.
path = os.environ.get(
    "STUDENTS_DATA_PATH",
    r"C:\Users\ASUS\Desktop\Campus\3rd Year\second\Nature Inspired Algorithms\Dataset\Students_Performance_data_set.xlsx",
)
df = pd.read_excel(path)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,University Admission year,Gender,Age,H.S.C passing year,Program,Current Semester,Do you have meritorious scholarship ?,Do you use University transportation?,How many hour do you study daily?,How many times do you seat for study in a day?,...,What is you interested area?,What is your relationship status?,Are you engaged with any co-curriculum activities?,With whom you are living with?,Do you have any health issues?,What was your previous SGPA?,Do you have any physical disabilities?,What is your current CGPA?,How many Credit did you have completed?,What is your monthly family income?
0,2018,Male,24,2016,BCSE,12,Yes,No,3,2,...,Data Schince,Single,Yes,Bachelor,No,2.68,No,3.15,75,25000
1,2021,Male,22,2020,BCSE,4,Yes,Yes,3,2,...,Event management,Single,Yes,Family,No,2.68,No,3.15,36,100000
2,2020,Female,21,2019,BCSE,5,No,No,3,3,...,Software,Single,No,Bachelor,No,2.68,No,3.15,50,50000
3,2021,Male,20,2020,BCSE,4,Yes,No,1,3,...,Artificial Intelligence,Single,No,Bachelor,Yes,2.68,No,3.15,36,62488
4,2021,Male,22,2019,BCSE,4,Yes,No,3,1,...,Software,Relationship,No,Bachelor,Yes,2.68,No,3.15,36,50000


In [43]:
# Column to predict, and the single categorical column that is one-hot
# encoded (instead of label-encoded) further down.
target_col = "What is your current CGPA?"
one_hot_col = "Status of your English language proficiency"

In [44]:
# Separate column types.
# The target and the one-hot column get dedicated handling in later cells, so
# they are filtered out of BOTH lists regardless of the dtype pandas inferred
# for them. (list.remove() raised ValueError whenever the target was not
# numeric or the one-hot column was not read as object dtype.)
special_cols = {target_col, one_hot_col}
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in special_cols
]
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in special_cols
]

In [45]:
# Fill gaps: median for the numeric features, most frequent value for every
# text column (the one-hot column is imputed together with the others).
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/validation/test split, so held-out rows contribute to the fill
# values -- consider fitting on the training portion only.
median_imputer = SimpleImputer(strategy='median')
df[numerical_cols] = median_imputer.fit_transform(df[numerical_cols])

text_cols = categorical_cols + [one_hot_col]
mode_imputer = SimpleImputer(strategy='most_frequent')
df[text_cols] = mode_imputer.fit_transform(df[text_cols])

In [46]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Label-encode each categorical feature; values are cast to str first so the
# encoder always receives one uniform type.
for col in categorical_cols:
    as_text = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(as_text)

# One-hot encode the English-proficiency column (also cast to str first);
# the generated columns carry the "English" prefix.
df = pd.get_dummies(
    df.astype({one_hot_col: str}),
    columns=[one_hot_col],
    prefix="English",
)


In [47]:
# Min-max scale every numeric feature together with the target.
# NOTE(review): the scaler is fitted on all rows (before the split) and it
# also rescales the target, so the metrics computed later are expressed in
# scaled-CGPA units rather than raw CGPA points.
scaled_cols = numerical_cols + [target_col]
scaler = MinMaxScaler()
scaler.fit(df[scaled_cols])
df[scaled_cols] = scaler.transform(df[scaled_cols])

### Step 4: Train / Validation / Test Split (70% / 15% / 15%)

In [48]:
# Label is the target column; features are every other column.
y = df[target_col]
X = df.drop(columns=[target_col])

# First carve off 70 % for training, then halve the remaining 30 % into
# validation and test (15 % each). Both splits use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

### Step 5: Model Training and Evaluation using XGBoost, LightGBM, MLP, Random Forest, SVM


In [49]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate_model(name, y_true, y_pred):
    """Print and return the regression metrics of one model.

    Parameters
    ----------
    name : label shown in the printed line and stored under "Model".
    y_true : ground-truth target values.
    y_pred : predicted values, aligned with ``y_true``.

    Returns
    -------
    dict with the keys "Model", "MSE", "RMSE" and "R2".
    """
    mse_value = mean_squared_error(y_true, y_pred)
    rmse_value = np.sqrt(mse_value)  # RMSE is derived from the MSE above
    r2_value = r2_score(y_true, y_pred)

    summary_line = f"{name} => MSE: {mse_value:.4f}, RMSE: {rmse_value:.4f}, R²: {r2_value:.4f}"
    print(summary_line)

    return dict(Model=name, MSE=mse_value, RMSE=rmse_value, R2=r2_value)


### Train Models

In [54]:
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# Collects one metrics dict per evaluated model; the summary table is built
# from this list. Re-running this cell empties it again.
results = list()


In [55]:
# XGBoost: 100 boosted trees with depth capped at 4, scored on the test split.
xgb_model = xgb.XGBRegressor(max_depth=4, n_estimators=100)
xgb_model.fit(X_train, y_train)
xgb_test_pred = xgb_model.predict(X_test)
results.append(
    evaluate_model(name="XGBoost", y_true=y_test, y_pred=xgb_test_pred)
)


XGBoost => MSE: 0.0103, RMSE: 0.1015, R²: 0.6737


In [56]:
# LightGBM: 200 boosting rounds at learning rate 0.05, scored on the test split.
lgb_model = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=200)
lgb_model.fit(X_train, y_train)
lgb_test_pred = lgb_model.predict(X_test)
results.append(
    evaluate_model(name="LightGBM", y_true=y_test, y_pred=lgb_test_pred)
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 30
[LightGBM] [Info] Start training from score 0.788756
LightGBM => MSE: 0.0095, RMSE: 0.0974, R²: 0.6995


In [57]:
# MLP: one hidden layer of 64 units, at most 1000 training iterations.
# random_state pins the weight initialisation and batch shuffling, which
# otherwise differ on every run, so the reported metrics are reproducible.
# The seed value matches the one used for the data split.
mlp_model = MLPRegressor(hidden_layer_sizes=(64,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)
results.append(evaluate_model("MLP", y_test,
mlp_model.predict(X_test)))


MLP => MSE: 0.0193, RMSE: 0.1391, R²: 0.3874


In [58]:
# Random Forest: 100 trees.
# random_state pins the bootstrap sampling and per-split feature sampling,
# which otherwise differ on every run, so the reported metrics are
# reproducible. The seed value matches the one used for the data split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
results.append(evaluate_model("Random Forest", y_test,
rf_model.predict(X_test)))

Random Forest => MSE: 0.0095, RMSE: 0.0974, R²: 0.6993


In [59]:
# Support-vector regression with an RBF kernel, scored on the test split.
svm_model = SVR(kernel='rbf')
svm_model.fit(X_train, y_train)
svm_test_pred = svm_model.predict(X_test)
results.append(
    evaluate_model(name="SVM", y_true=y_test, y_pred=svm_test_pred)
)


SVM => MSE: 0.0297, RMSE: 0.1722, R²: 0.0610


### Step 6: Summary Table of Results

In [61]:
# Summary table: one row per evaluated model, in the order the metrics were
# appended to `results`.
summary_table = pd.DataFrame(data=results)
summary_table

Unnamed: 0,Model,MSE,RMSE,R2
0,XGBoost,0.010305,0.101514,0.673696
1,LightGBM,0.009491,0.097421,0.699479
2,MLP,0.019347,0.139095,0.387375
3,Random Forest,0.009496,0.09745,0.6993
4,SVM,0.029656,0.172208,0.060968
