In [13]:
import pandas as pd
pd.set_option("display.max_columns",None)

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from mrmr import mrmr_regression
from xgboost import XGBRegressor

def dataPreparation(df):
    X  = df.drop(columns=['StudentID', 'GPA', 'GradeClass'])
    y  = df['GPA']

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
    feature_list = mrmr_regression(X_train,y_train,K=8)
    print(f"Feature yang berpengaruh terhadap GPA :{','.join(feature_list)}")
    X_train  = X_train[feature_list]
    X_test   = X_test[feature_list]
    return X_train, X_test, y_train, y_test   

def processingPipeline():
    pipeline = Pipeline(steps=[
        ('preprocessor', MinMaxScaler()),
        ('regressor', XGBRegressor(random_state=42))
    ])

    return pipeline

def randomizedSearch(pipeline, X_train, y_train):
   
    param_distributions = {
        'regressor__max_depth': [3, 4, 5, 6,7,8,9,10],
        'regressor__learning_rate': [0.001, 0.01, 0.1],
    }

   
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=10,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42
    )

  
    random_search.fit(X_train, y_train)

    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Best Score: {random_search.best_score_}")

    return random_search.best_estimator_

def eval(model,X_train,X_test,y_train,y_test):
    pred_train = model.predict(X_train)
    pred_test  = model.predict(X_test)

    print(f"R2 score Train -> {r2_score(y_train,pred_train)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_train),np.exp(pred_train))}\n")
    print(f"R2 score test -> {r2_score(y_test,pred_test)}")
    print(f"RMSE Train -> {mean_squared_error(np.exp(y_test),np.exp(pred_test))}")

In [14]:
df = pd.read_csv(r"C:\Users\asus\Documents\GitHub\tugas_day27\artifacts\Student_performance_data _.csv")
df.describe()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


In [15]:
X_train, X_test, y_train, y_test = dataPreparation(df)

100%|██████████| 8/8 [00:00<00:00,  9.07it/s]

Feature yang berpengaruh terhadap GPA :Absences,ParentalSupport,Tutoring,StudyTimeWeekly,Extracurricular,Music,Sports,Ethnicity





In [16]:
pipeline = processingPipeline()
best_model = randomizedSearch(pipeline, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
Best Score: -0.04364841565698766


In [17]:
eval(best_model, X_train, X_test, y_train, y_test)

R2 score Train -> 0.962184775212753
RMSE Train -> 5.574898982894305

R2 score test -> 0.9464208915416917
RMSE Train -> 7.745236600072133


In [10]:
import pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
