In [4]:
import os
import json
import yaml
import pickle
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import pyswarms as ps

# ============================================================
# PATHS
# ============================================================
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Student Dropout Risk Prediction"
DATA_PATH = r"C:\Users\NXTWAVE\Downloads\Student Dropout Risk Prediction\RS_Session_254_AU_352.B.csv"

MODEL_DIR = os.path.join(BASE_DIR, "models")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
LOG_DIR = os.path.join(BASE_DIR, "logs")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# ============================================================
# LOAD DATA
# ============================================================
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.lower()

# Drop non-numeric identifier columns
df = df.drop(columns=["sl. no.", "state/ut"], errors="ignore")

# ============================================================
# SAFETY: REPLACE ZERO WITH NaN (to avoid division by zero)
# ============================================================
df.replace(0, np.nan, inplace=True)

# ============================================================
# CREATE DROPOUT RISK (SAFE CALCULATION)
# ============================================================
df["drop_primary_to_upper"] = (
    df["primary level (i - v) - total"] -
    df["upper primary - total"]
) / df["primary level (i - v) - total"]

df["drop_upper_to_secondary"] = (
    df["upper primary - total"] -
    df["secondary (ix-x) - total"]
) / df["upper primary - total"]

df["drop_secondary_to_higher"] = (
    df["secondary (ix-x) - total"] -
    df["higher secondary - total"]
) / df["secondary (ix-x) - total"]

# ============================================================
# FINAL DROPOUT RISK SCORE (0‚Äì1)
# ============================================================
df["dropout_risk"] = df[
    [
        "drop_primary_to_upper",
        "drop_upper_to_secondary",
        "drop_secondary_to_higher"
    ]
].mean(axis=1)

# ============================================================
# CLEAN NaN & INF VALUES
# ============================================================
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN with column median (best practice)
df.fillna(df.median(numeric_only=True), inplace=True)

# Clip risk into valid range
df["dropout_risk"] = df["dropout_risk"].clip(0, 1)

# ============================================================
# FEATURES & TARGET
# ============================================================
X = df.drop(columns=["dropout_risk"])
y = df["dropout_risk"]

# ============================================================
# SCALING (NOW SAFE)
# ============================================================
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

with open(os.path.join(ARTIFACT_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

# ============================================================
# LSTM SHAPE
# ============================================================
X_lstm = X_scaled.reshape(X_scaled.shape[0], 1, X_scaled.shape[1])

X_train, X_test, y_train, y_test = train_test_split(
    X_lstm, y, test_size=0.2, random_state=42
)

# ============================================================
# LSTM MODEL
# ============================================================
def build_lstm(units, dropout):
    model = Sequential([
        LSTM(int(units), input_shape=(1, X_train.shape[2])),
        Dropout(dropout),
        Dense(1)
    ])
    model.compile(optimizer="adam", loss="mse")
    return model

# ============================================================
# PSO OBJECTIVE FUNCTION
# ============================================================
def pso_objective(particles):
    losses = []

    for p in particles:
        units = int(p[0])
        dropout = p[1]

        model = build_lstm(units, dropout)
        model.fit(
            X_train, y_train,
            epochs=10,
            batch_size=32,
            verbose=0
        )

        loss = model.evaluate(X_test, y_test, verbose=0)
        losses.append(loss)

    return np.array(losses)

# ============================================================
# PSO OPTIMIZATION
# ============================================================
bounds = (
    np.array([32, 0.1]),
    np.array([128, 0.5])
)

optimizer = ps.single.GlobalBestPSO(
    n_particles=8,
    dimensions=2,
    options={"c1": 1.5, "c2": 1.5, "w": 0.7},
    bounds=bounds
)

best_loss, best_pos = optimizer.optimize(pso_objective, iters=5)

best_units = int(best_pos[0])
best_dropout = float(best_pos[1])

# ============================================================
# FINAL MODEL TRAINING
# ============================================================
final_model = build_lstm(best_units, best_dropout)

early_stop = EarlyStopping(patience=5, restore_best_weights=True)

final_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

final_model.save(os.path.join(MODEL_DIR, "dropout_lstm_pso.h5"))

# ============================================================
# EVALUATION
# ============================================================
y_pred = final_model.predict(X_test).flatten()

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ============================================================
# SAVE RESULTS
# ============================================================
results = {
    "mse": float(mse),
    "r2_score": float(r2),
    "best_units": best_units,
    "best_dropout": best_dropout
}

with open(os.path.join(ARTIFACT_DIR, "results.json"), "w") as f:
    json.dump(results, f, indent=4)

with open(os.path.join(ARTIFACT_DIR, "config.yaml"), "w") as f:
    yaml.dump(results, f)

pd.DataFrame({
    "actual_risk": y_test.values,
    "predicted_risk": y_pred
}).to_csv(os.path.join(LOG_DIR, "predictions.csv"), index=False)

print("\n‚úÖ DROPOUT RISK MODEL TRAINED SUCCESSFULLY")
print("üìÅ Outputs saved in:", BASE_DIR)


2026-01-03 10:15:31,028 - pyswarms.single.global_best - INFO - Optimize for 5 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.7}
pyswarms.single.global_best:   0%|                                                                                              |0/5

























pyswarms.single.global_best: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà|5/5, best_cost=0.125
2026-01-03 10:17:38,647 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 0.1246049553155899, best pos: [74.61838452  0.27197388]


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


  saving_api.save_model(



‚úÖ DROPOUT RISK MODEL TRAINED SUCCESSFULLY
üìÅ Outputs saved in: C:\Users\NXTWAVE\Downloads\Student Dropout Risk Prediction
