In [None]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# ==============================
# 1) Load data
# ==============================
# Read the raw table and trim surrounding whitespace from every header name,
# so that the by-name column look-ups further down match exactly.
df = pd.read_csv("data.csv")
df.columns = list(map(str.strip, df.columns))

# Features are every column except the label; the label is the "Target" column.
X = df.drop(columns="Target")
y = df["Target"]

# ==============================
# 2) Define columns
# ==============================
# Features routed through the categorical branch of the preprocessor
# (most-frequent imputation followed by one-hot encoding).
# NOTE(review): names must match the whitespace-stripped CSV header exactly;
# "Nacionality" is presumably the spelling used in the data file - verify.
cat_cols = [
    "Marital status","Application mode","Application order","Course",
    "Daytime/evening attendance","Previous qualification","Nacionality",
    "Mother's qualification","Father's qualification",
    "Mother's occupation","Father's occupation",
    "Displaced","Educational special needs","Debtor",
    "Tuition fees up to date","Gender","Scholarship holder","International"
]

# Fail fast with a readable message: without this check, a misspelled or
# absent column is only reported once the ColumnTransformer is fitted inside
# the grid search.
missing_cat_cols = [c for c in cat_cols if c not in X.columns]
if missing_cat_cols:
    raise ValueError(
        f"Categorical columns not found in the data: {missing_cat_cols}"
    )

# Every remaining feature is treated as numeric; the original column order is
# preserved. A set keeps the membership test constant-time.
_cat_lookup = set(cat_cols)
num_cols = [c for c in X.columns if c not in _cat_lookup]

# ==============================
# 3) Preprocessing
# ==============================
# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_tf = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(steps=categorical_steps)

# Send each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# ==============================
# 4) Model
# ==============================
# Random forest with a fixed seed so repeated runs build the same trees.
rf = RandomForestClassifier(random_state=42)

# Preprocessing and classifier are chained into one estimator, so the
# transformers are fitted only on whatever data the pipeline is fitted on.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", rf),
    ]
)

# ==============================
# 5) Train / Test split
# ==============================
# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the label so both parts keep the label's class proportions.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ==============================
# 6) Hyperparameter tuning
# ==============================
# Candidate settings for the "model" step of the pipeline
# (2 x 2 x 2 = 8 combinations).
param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [None, 20],
    "model__min_samples_split": [2, 5],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split; n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# The best-scoring pipeline found by the search.
best_model = grid.best_estimator_

# ==============================
# 7) Evaluation
# ==============================
# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Weighted average: per-class F1 weighted by each class's support.
test_f1 = f1_score(y_test, y_pred, average="weighted")
print("F1-score:", test_f1)

# ==============================
# 8) Save model
# ==============================
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=best_model, filename="final_model.pkl")

print("Model saved as final_model.pkl")
