# AI-based Drop-Out Prediction and Counseling System

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset.
# The CSV is read from the working directory. The interactive Colab upload is
# triggered only when the file is not there yet, so "Restart & Run All" does not
# block on a file picker once the data has been uploaded, and the notebook also
# runs outside Colab when the CSV is already present.
DATA_PATH = "newDataset.csv"

uploaded = {}
if not os.path.exists(DATA_PATH):
    from google.colab import files  # Colab-only module, imported lazily

    uploaded = files.upload()
    if DATA_PATH not in uploaded:
        # Fail early with a clear message instead of a bare read_csv error.
        raise FileNotFoundError(
            f"Expected an upload named {DATA_PATH!r}, got: {sorted(uploaded)}"
        )

# Then read the file
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows (pandas' default is 5) to sanity-check that the CSV parsed as expected.
dataset.head()

In [None]:
# Column names, dtypes and non-null counts: shows which columns are categorical and which have missing values.
dataset.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
dataset.describe()

In [None]:
# Mean of the "dropout" column; for a 0/1 target this is the share of positives
# (assumes the column is encoded as 0/1 — TODO confirm against the data dictionary).
dropout_rate = dataset["dropout"].mean()
print("Dropout rate:", dropout_rate)

# Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Feature matrix: everything except the identifier, the target itself and the
# two other dropout_* columns.
# NOTE(review): "dropout_probability" and "dropout_months_from_enrollment" are
# presumably derived from the outcome and would leak the target — confirm.
X = dataset.drop(columns=["student_id", "dropout", "dropout_probability", "dropout_months_from_enrollment"])
y = dataset["dropout"]

X = pd.get_dummies(X, drop_first=True)  # handle categorical

# This split is shared by every model cell below.
# stratify=y keeps the dropout / non-dropout ratio the same in the train and
# test sets, so per-class metrics are not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline linear model; max_iter is raised from the default so the solver has
# room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Hyper-parameters of the tree, kept together so they are easy to tweak.
dt_params = {
    "criterion": "gini",         # "entropy" is the alternative split criterion
    "max_depth": 10,
    "random_state": 42,
    "class_weight": "balanced",
}

# Build the classifier and fit it on the shared training split.
dt_model = DecisionTreeClassifier(**dt_params)
dt_model.fit(X_train, y_train)

# Hard class predictions on the held-out split.
y_pred = dt_model.predict(X_test)

# Compute the metrics first, then report them.
dt_accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", dt_accuracy)
print("\nClassification Report:\n", dt_report)
print("\nConfusion Matrix:\n", dt_confusion)

In [None]:
# from sklearn.tree import plot_tree
# plot_tree(dt_model)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the decision tree's confusion matrix on the held-out split.
# The figure is titled so it can be told apart from the other models' matrices,
# and plt.show() keeps the display object's repr out of the cell output.
dt_cm_display = ConfusionMatrixDisplay.from_estimator(dt_model, X_test, y_test)
dt_cm_display.ax_.set_title("Decision Tree: Confusion Matrix")
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize Random Forest
rf_model = RandomForestClassifier(
    n_estimators=1000,        # number of trees
    max_depth=10,             # prevent overfitting
    random_state=42,
    class_weight="balanced",  # handles dropout imbalance
    n_jobs=-1                 # fit the 1000 trees on all cores; results are unchanged for a fixed random_state
)

# Train
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the random forest's confusion matrix on the held-out split.
# The figure is titled so it can be told apart from the other models' matrices,
# and plt.show() keeps the display object's repr out of the cell output.
rf_cm_display = ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test)
rf_cm_display.ax_.set_title("Random Forest: Confusion Matrix")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Impurity-based importances from the fitted forest, one value per column of X.
importances = rf_model.feature_importances_
features = X.columns

# Sort by importance (descending)
indices = importances.argsort()[::-1]

# Show at most 15 features. Clamp to the number of columns actually available,
# otherwise the bar positions and heights would have different lengths and
# matplotlib would raise on a narrow feature matrix.
top_n = min(15, len(features))
top_idx = indices[:top_n]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(top_n), importances[top_idx], align="center")
ax.set_xticks(range(top_n))
ax.set_xticklabels([features[i] for i in top_idx], rotation=45, ha="right")
ax.set_ylabel("Feature importance")
ax.set_title(f"Top {top_n} Features Influencing Dropout")
fig.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()

# XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Class-imbalance weight for the positive (dropout == 1) class, following the
# negative/positive ratio recommended by the XGBoost documentation.
# A fixed value of 1 would leave the imbalance uncorrected, unlike the
# class_weight="balanced" setting used by the other tree models.
# Assumes the target is encoded as 0/1 — TODO confirm.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
pos_weight = n_negative / n_positive if n_positive else 1.0  # fall back to neutral weight if no positives

# Initialize XGBoost
xgb_model = XGBClassifier(
    n_estimators=300,             # number of boosting rounds
    max_depth=6,                  # tree depth
    learning_rate=0.1,            # step size shrinkage
    subsample=0.8,                # random sampling of training data
    colsample_bytree=0.8,         # random sampling of features
    random_state=42,
    scale_pos_weight=pos_weight   # handles class imbalance (negatives / positives in the training split)
)

# Train
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# import lightgbm as lgb
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# # Initialize LightGBM Classifier
# lgb_model = lgb.LGBMClassifier(
#     n_estimators=1000,       # more boosting rounds
#     learning_rate=0.03,      # lower LR for better generalization
#     num_leaves=64,           # try 31, 64, 127
#     max_depth=10,            # limit tree depth
#     subsample=0.8,
#     colsample_bytree=0.8,
#     reg_alpha=0.1,           # L1 regularization
#     reg_lambda=0.1,          # L2 regularization
#     random_state=42,
#     class_weight="balanced"
# )

# # Train
# lgb_model.fit(X_train, y_train)

# # Predict
# y_pred = lgb_model.predict(X_test)

# # Evaluate
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Base model
# subsample_freq=1 turns row bagging on at every boosting iteration. LightGBM
# ignores `subsample` while subsample_freq is 0 (the default), which would make
# the two `subsample` values in the grid produce identical models and waste
# half of the fits.
lgb_model = lgb.LGBMClassifier(
    objective="binary",
    random_state=42,
    class_weight="balanced",
    subsample_freq=1
)

# Parameter grid (3 * 3 * 3 * 3 * 2 * 2 = 324 candidates)
param_grid = {
    "num_leaves": [31, 63, 127],
    "max_depth": [5, 10, 15],
    "learning_rate": [0.1, 0.05, 0.01],
    "n_estimators": [200, 500, 1000],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# GridSearchCV
# NOTE(review): candidates are ranked by plain accuracy while the model is
# trained with class_weight="balanced"; if the dropout class is a small
# minority, a metric such as "f1" or "balanced_accuracy" may be a better
# selection criterion — confirm with the project's goals.
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit model (5 folds per candidate; this is the expensive cell of the notebook)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model (refit on the full training split by GridSearchCV)
best_lgb = grid_search.best_estimator_

# Predictions
y_pred = best_lgb.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


### Build counseling rules

Based on the features, students at risk can be flagged for a specific type of counseling:

- Low GPA + low attendance → Academic counseling
- Financial hold + no scholarship → Financial counseling
- Part-time job + high absences → Time management counseling

In [None]:
# Rule-based risk flags. Each rule is a boolean mask over `dataset`; the rows
# selected by a mask are the students routed to that kind of counseling.
low_gpa = dataset["cumulative_gpa"] < 5.0
low_attendance = dataset["attendance_rate_pct"] < 50
on_financial_hold = dataset["financial_hold"] == 1
without_scholarship = dataset["scholarship"] == 0
has_part_time_job = dataset["part_time_job"] == 1

# Academic counseling: low GPA combined with low attendance
academic_risk = dataset[low_gpa & low_attendance]

# Financial counseling: financial hold and no scholarship
financial_risk = dataset[on_financial_hold & without_scholarship]

# Time management counseling: part-time job combined with low attendance
time_risk = dataset[has_part_time_job & low_attendance]

print("Students needing Academic Counseling:", len(academic_risk))
print("Students needing Financial Counseling:", len(financial_risk))
print("Students needing Time Management Counseling:", len(time_risk))

# Optional: unique counts (some students may fall into multiple categories).
# Stack the three groups, then collapse rows that appear more than once.
all_flagged = pd.concat([academic_risk, financial_risk, time_risk])
total_unique = all_flagged.drop_duplicates()
print("Total unique students needing support:", len(total_unique))