In [1]:
# =========================
# Compute & Save Best Threshold (F1)
# =========================
import os, csv, json, pickle, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import tensorflow as tf

# Paths
# DATA_MATHS / DATA_PORT are the two input tables that get loaded and merged
# below. OUTPUT_DIR is where preprocessor.pkl and the saved model are read
# from and where threshold.json is written.
DATA_MATHS = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer\archive (1)\Maths.csv"
DATA_PORT  = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer\archive (1)\Portuguese.csv"
OUTPUT_DIR = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer"
# Ensure the output directory exists; exist_ok avoids an error when it already does.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Detect Excel disguised as CSV
def is_zip_or_xlsx(path):
    """Return True when the file at *path* starts with the bytes ``PK``.

    ``PK`` is the ZIP signature, so a match flags an Excel workbook that was
    saved under a ``.csv`` name. Any failure to open or read the file is
    reported as False rather than raised.
    """
    zip_magic = b"PK"
    try:
        with open(path, "rb") as stream:
            signature = stream.read(len(zip_magic))
    except Exception:
        return False
    return signature == zip_magic

def robust_read_any(path):
    """Load a tabular file into a DataFrame, guessing encoding and delimiter.

    Files that start with the ZIP signature are read as Excel workbooks.
    Everything else is parsed as delimited text by trying each encoding with
    each candidate delimiter until a parse yields more than one column.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed DataFrame, or None when *path* does not exist (an
        ``[ERROR]`` line is printed in that case).

    Raises:
        RuntimeError: No encoding/delimiter combination produced more than
            one column. The last parser error, if any, is attached as the
            cause so the failure can be diagnosed.
    """
    if not os.path.exists(path):
        print(f"[ERROR] Missing file: {path}")
        return None
    if is_zip_or_xlsx(path):
        # Imported only so a missing openpyxl fails here with ImportError.
        import openpyxl  # noqa: F401
        return pd.read_excel(path, engine="openpyxl")

    encodings = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
    delimiters = ["; ", ";", ",", "\t", "|"]

    # Best-effort hint: if the sniffer recognises one of our candidates, try
    # it first. Any failure here just leaves the default order in place.
    sniffed = None
    try:
        with open(path, "rb") as f:
            head = f.read(4096).decode("latin1", errors="ignore")
        sniffed = csv.Sniffer().sniff(head).delimiter
    except Exception:
        pass
    if sniffed in delimiters:
        delimiters = [sniffed] + [d for d in delimiters if d != sniffed]

    last_error = None
    for enc in encodings:
        for sep in delimiters:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
            except Exception as err:
                # Remember why this combination failed; keep trying the rest.
                last_error = err
                continue
            # A single column means the delimiter did not actually split rows.
            if df.shape[1] > 1:
                return df
    raise RuntimeError(f"Could not parse {path}") from last_error

# Load & combine
df_m = robust_read_any(DATA_MATHS)
df_p = robust_read_any(DATA_PORT)

# robust_read_any returns None for a missing file; fail here with a clear
# message instead of an AttributeError on `.columns` below.
missing_inputs = [
    src for src, frame in ((DATA_MATHS, df_m), (DATA_PORT, df_p)) if frame is None
]
if missing_inputs:
    raise FileNotFoundError(f"Missing input file(s): {missing_inputs}")

# Keep only the columns both tables share (sorted for a stable order), then
# stack the rows with a fresh 0..n-1 index.
common_cols = sorted(set(df_m.columns).intersection(set(df_p.columns)))
df = pd.concat([df_m[common_cols], df_p[common_cols]], ignore_index=True)

# Build target
# Coerce G3 to numeric BEFORE dropping missing values: a non-numeric G3
# becomes NaN, and NaN >= 10 is False, so dropping first would silently label
# such rows as class 0 instead of discarding them.
df["G3"] = pd.to_numeric(df["G3"], errors="coerce")
df = df.dropna(subset=["G3"]).copy()
# target is 1 when G3 is at least 10, otherwise 0.
df["target"] = (df["G3"] >= 10).astype(int)
# Remove the grade columns so they are not picked up as features below.
df = df.drop(columns=[c for c in ("G1", "G2", "G3") if c in df.columns])

# Split columns as per dtype
# Text columns are treated as categorical. Checking for string dtypes as well
# as plain object dtype keeps text columns out of the numeric path when pandas
# infers a dedicated string dtype (otherwise they would be coerced to NaN and
# every row dropped).
cat_cols = [
    c for c in df.columns
    if c != "target"
    and (pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]))
]
non_numeric = set(cat_cols) | {"target"}
num_cols = [c for c in df.columns if c not in non_numeric]

# Force the remaining columns to numeric and drop rows where that fails.
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=num_cols)

# Feature frame (categorical columns first, then numeric) and label vector.
X = df[cat_cols + num_cols]
y = df["target"].astype(int).values

# Same split seed as earlier
# Only X_test / y_test are used below; the split is repeated with the same
# parameters (20% test, random_state=42, stratified on y).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Load preprocessor and align columns
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load a preprocessor.pkl that you produced yourself.
with open(os.path.join(OUTPUT_DIR, "preprocessor.pkl"), "rb") as f:
    preproc = pickle.load(f)
# NOTE(review): assumes the fitted transformers are [categorical, numeric] in
# that order -- confirm against the script that built the preprocessor.
cat_cols_expected = list(preproc.transformers_[0][2])
num_cols_expected = list(preproc.transformers_[1][2])
expected = cat_cols_expected + num_cols_expected

# reindex puts the columns in the order the preprocessor expects and adds any
# expected column the data lacks (filled with NaN), without mutating X in place.
X = X.reindex(columns=expected)
# Re-select the test rows from the aligned frame by their original index.
X_test = X.loc[X_test.index]
X_test_proc = preproc.transform(X_test)

# Load model
# Try model.keras first; if loading it fails for any reason, fall back to
# model.h5 in the same directory.
keras_path = os.path.join(OUTPUT_DIR, "model.keras")
h5_path = os.path.join(OUTPUT_DIR, "model.h5")
try:
    model = tf.keras.models.load_model(keras_path)
except Exception:
    model = tf.keras.models.load_model(h5_path)

# Predict probabilities
# Flatten the model output to 1-D so it can be compared against a threshold
# element by element.
raw_scores = model.predict(X_test_proc)
y_prob = raw_scores.ravel()

# Search thresholds
# NOTE(review): the threshold is selected on the same test split it is scored
# on, so the reported best_f1 is an optimistic estimate.
candidates = np.linspace(0.1, 0.9, 81)  # 0.10 .. 0.90 step 0.01
f1_per_candidate = [
    f1_score(y_test, (y_prob >= cut).astype(int), zero_division=0)
    for cut in candidates
]
# argmax returns the first maximum, i.e. the lowest threshold among ties.
top = int(np.argmax(f1_per_candidate))
best_t = float(candidates[top])
best_f1 = f1_per_candidate[top]

# Persist the chosen threshold and its F1 next to the other artifacts.
threshold_path = os.path.join(OUTPUT_DIR, "threshold.json")
payload = {"best_threshold": best_t, "best_f1": float(best_f1)}
with open(threshold_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

print(f"[INFO] Saved threshold.json with best_threshold={best_t:.2f}, best_f1={best_f1:.4f}")


7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
[INFO] Saved threshold.json with best_threshold=0.35, best_f1=0.8981
