In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,accuracy_score,recall_score,precision_score,f1_score,classification_report
from imblearn.over_sampling import SMOTE



df = pd.read_csv('StudentsPerformance.csv')

df.rename(columns={
    'race/ethnicity': 'Race',
    'parental level of education': 'Education',
    'test preparation course': 'Preparation'
}, inplace=True)

df.rename(columns={'race/ethnicity': 'Race'}, inplace=True)

df = pd.get_dummies(df, columns=['Race'])
df['Preparation'] = np.where(df['Preparation'].str.contains("none"),0,1)
df['lunch'] = df['lunch'].map({'standard': 0, 'free/reduced' : 1})
education_order = {
    "some high school": 0,
    "high school": 1,
    "some college": 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5
}

# Apply mapping
df['Education'] = df['Education'].map(education_order)
df['gender'] = df['gender'].map({'female': 0, 'male': 1})
race_cols = [col for col in df.columns if col.startswith('Race_')]
df[race_cols] = df[race_cols].astype(int)
#dependent and independent features
x = df.drop(columns=['Preparation'])
y = df['Preparation']
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
# train test split
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,test_size=0.25,random_state=42)


smote = SMOTE(random_state=42)
x_train_bal, y_train_bal = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_bal))

param_grid = {
    'C' : [0.01,0.1,1,10,100],
    'penalty': ['l1','l2'],
    'solver': ['liblinear','saga']
}
logreg = LogisticRegression(max_iter=5000)
grid_search = GridSearchCV(estimator=logreg,param_grid=param_grid,cv=5,scoring='accuracy')

grid_search.fit(x_train_bal,y_train_bal)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_pred))
print("Precision:", precision_score(y_train, y_pred))
print("Recall:", recall_score(y_train, y_pred))
print("F1 Score:", f1_score(y_train, y_pred))
print("\nClassification Report:\n", classification_report(y_train, y_pred))

r2_score = r2_score(y_train,y_pred)



Before SMOTE: [486 264]
After SMOTE: [486 486]
Accuracy: 0.7386666666666667
Precision: 0.6752577319587629
Recall: 0.4962121212121212
F1 Score: 0.5720524017467249

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.87      0.81       486
           1       0.68      0.50      0.57       264

    accuracy                           0.74       750
   macro avg       0.72      0.68      0.69       750
weighted avg       0.73      0.74      0.73       750

-0.14571642349420144
