In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load dataset
df = pd.read_csv('filtered_student_data.csv', low_memory = False)

In [6]:
# Separate features and target variable
X = df.drop(['dropout', 'birth_certificate_id'], axis=1)
y = df['dropout']

# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Preprocessing for numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())])

# Preprocessing for categorical columns 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Column transformer to apply the different preprocessors to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Logistic Regression model with Elastic Net regularization
model = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, C=1.0, random_state=0, solver='saga', max_iter=1000)

# Full pipeline including the preprocessor and the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 0.8831941505201931
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94    139169
           1       0.60      0.04      0.07     18657

    accuracy                           0.88    157826
   macro avg       0.74      0.52      0.50    157826
weighted avg       0.85      0.88      0.83    157826

Confusion Matrix:
 [[138728    441]
 [ 17994    663]]


In [7]:
# Extract and print coefficients for logistic regression to understand the relationship between variables and the outcome
feature_names = np.concatenate([numerical_cols, pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)])
coefficients = pipeline.named_steps['classifier'].coef_[0]
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef}")

is_orphan: 0.023562177547759112
is_never_been_to_school: 0.0015766704818040089
is_ethnic: -0.24226975604326423
parents_income: -1.2959683246566769
previous_dropout: 0.0760707889219914
received_any_treatment: 0.006117634341644241
newly_admitted: 0.08871534307016046
sex_ : 0.0
sex_ F: -0.017676851583240966
sex_Boy: -0.10911891344099976
sex_D: 0.0
sex_F: -0.6519014667473751
sex_F : -0.03460103186810164
sex_FEMAL: 0.6353179917400971
sex_FF: 0.0
sex_Famel: -0.028988785516131967
sex_Female: -0.7026254444276242
sex_Female : 0.11900334888247631
sex_G: -0.015614774549530232
sex_Girl: -0.031690470975402334
sex_M: -0.4960204003582942
sex_M : 0.0
sex_MALE: 0.3692635595157447
sex_MF: -0.0037014381373854507
sex_Male: -1.529936458068198
sex_Male : 0.23713150824800788
sex_N: 0.10767706356592216
sex_O: 0.18038042283882547
sex_f: -0.24912114586341433
sex_m: -0.39135266574894473
father_educational_attainment_ : -0.17022775224090508
father_educational_attainment_    Five: -0.002119374646958172
father_educ

In [11]:
print(categorical_cols)
print(numerical_cols)

['birth_certificate_id', 'sex', 'father_educational_attainment', 'mother_educational_attainment', 'grade_id', 'relation_with_guardian', 'pwd_type', 'pwd_degree', 'marital_status']
['is_orphan', 'is_never_been_to_school', 'is_ethnic', 'parents_income', 'previous_dropout', 'received_any_treatment', 'newly_admitted']
