In [None]:
# =========================
# KNN baseline + Viz + tuning + metrics + submission
# =========================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix
)

# -------------------------
# 1) LOAD
# -------------------------
# Read the train and test CSVs (train first, then test) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mock-test-2-mse-2/train.csv",
        "/kaggle/input/mock-test-2-mse-2/test.csv",
    )
)

# Report (rows, columns) for each frame as a quick sanity check.
for shape_label, frame in (("Train shape:", train), ("Test shape :", test)):
    print(shape_label, frame.shape)

# -------------------------
# 2) Quick Data Visualisation (light)
# -------------------------
# How many rows fall into each target class?
plt.figure(figsize=(6,3))
sns.countplot(x=train['Status'])
plt.title("Target distribution")
plt.show()

# Numeric columns (int64 / float64 dtypes only) drive the remaining plots.
num_cols = train.select_dtypes(include=['int64','float64']).columns.tolist()
# Per-column plots are capped at the first four numeric columns to keep output short.
preview_cols = num_cols[:4]

if num_cols:
    # pairwise correlation between all numeric columns
    plt.figure(figsize=(8,6))
    sns.heatmap(train[num_cols].corr(), cmap="coolwarm", annot=False)
    plt.title("Numeric feature correlation (small)")
    plt.show()

# distribution of each previewed column (NaNs dropped before plotting);
# the loop is a no-op when there are no numeric columns
for col in preview_cols:
    plt.figure(figsize=(6,2.5))
    sns.histplot(train[col].dropna(), kde=True)
    plt.title(f"Hist: {col}")
    plt.show()

# visual outlier check: one boxplot per previewed column
for col in preview_cols:
    plt.figure(figsize=(6,2.5))
    sns.boxplot(x=train[col].dropna())
    plt.title(f"Outlier check (visual): {col}")
    plt.show()

# -------------------------
# 3) MISSING VALUES (impute; fill values are learned from train only)
# -------------------------
# categorical columns except target
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
if 'Status' in cat_cols:
    cat_cols.remove('Status')

# numeric columns recalculated
num_cols = train.select_dtypes(include=['int64','float64']).columns.tolist()

# Categorical -> train mode, numeric -> train mean.
# The SAME train-derived value fills both frames: imputing test with its own
# mode/mean would give train and test different fill values for one feature
# and would let test-set statistics shape the preprocessing.
for c in cat_cols:
    fill_value = train[c].mode()[0]          # most frequent category in train
    train[c] = train[c].fillna(fill_value)
    if c in test.columns:
        test[c] = test[c].fillna(fill_value)

for c in num_cols:
    fill_value = train[c].mean()             # mean() skips NaN by default
    train[c] = train[c].fillna(fill_value)
    if c in test.columns:                    # e.g. a numeric column absent from test
        test[c] = test[c].fillna(fill_value)

# -------------------------
# 4) ENCODING target + safe label-encoding for categorical features
# -------------------------
# Target -> integer class ids (needed by the metrics); target_le.classes_
# keeps the original class names for later use.
target_le = LabelEncoder()
y = target_le.fit_transform(train['Status'])

# Kept from the original cell; the code in this section uses LabelEncoder only.
from sklearn.preprocessing import OrdinalEncoder

# Each categorical feature gets its own LabelEncoder, fitted on the string form
# of the train and test values together, so every category that appears in test
# has a code and the mapping is identical for both frames.
for c in cat_cols:
    # cast to str first to avoid issues with mixed types
    train_as_str = train[c].astype(str)
    test_as_str = test[c].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_as_str, test_as_str], axis=0))
    train[c] = encoder.transform(train_as_str)
    test[c] = encoder.transform(test_as_str)

# -------------------------
# 5) FEATURES & SPLIT
# -------------------------
# Feature matrix = train minus the target column. The 'id' column, when
# present, is removed from both the train features and the test features;
# drop() returns a new frame, so `train` and `test` themselves are untouched.
X = train.drop(columns=['Status']).drop(columns=['id'], errors='ignore')
test_features = test.drop(columns=['id'], errors='ignore')

# 80/20 hold-out on the encoded target, stratified by class, fixed seed
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# -------------------------
# 6) FEATURE SCALING (important for KNN)
# -------------------------
# Fit the scaler on the training split only, then apply that one fitted
# transform to the training split, the validation split and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = (
    scaler.transform(split_part) for split_part in (X_train, X_val)
)
# pick the test columns in X's column order before scaling
test_scaled = scaler.transform(test_features.loc[:, X.columns])

# -------------------------
# 7) BASE KNN model + SMALL hyperparameter tuning (light & safe)
#    Candidates are ranked by negative log loss, i.e. by the quality of the
#    predicted probabilities rather than by hard-label accuracy.
# -------------------------
knn = KNeighborsClassifier()

# search space: neighbourhood size, vote weighting, Minkowski power
param_dist = {
    "n_neighbors": [3, 5, 7, 10, 15, 20, 30],
    "weights": ["uniform", "distance"],
    "p": [1, 2],   # 1 = manhattan, 2 = euclidean
}

search_settings = dict(
    n_iter=8,                 # sample only 8 parameter combinations so it finishes quickly
    cv=3,
    scoring='neg_log_loss',   # tune probabilities
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
rs = RandomizedSearchCV(knn, param_dist, **search_settings).fit(X_train_scaled, y_train)

best_knn = rs.best_estimator_
print("Best KNN params:", rs.best_params_)

# -------------------------
# 8) EVALUATION on validation set (all requested metrics)
# -------------------------
y_val_pred = best_knn.predict(X_val_scaled)
y_val_proba = best_knn.predict_proba(X_val_scaled)   # one column per class in best_knn.classes_

acc = accuracy_score(y_val, y_val_pred)
# weighted averaging: per-class scores weighted by class support
prec = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
rec  = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)
f1   = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)

# ROC-AUC needs probability estimates. roc_auc_score accepts the full 2-D
# probability matrix only for multiclass targets; for a two-class target it
# expects the positive-class scores as a 1-D array and raises otherwise.
if y_val_proba.shape[1] == 2:
    roc_auc = roc_auc_score(y_val, y_val_proba[:, 1])
else:
    roc_auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')

# Name the probability columns explicitly so log loss stays computable even if
# the validation split happens to miss one of the classes seen in training.
ll = log_loss(y_val, y_val_proba, labels=best_knn.classes_)

print("\nValidation metrics (KNN tuned):")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)
print("ROC-AUC  :", roc_auc)
print("Log Loss :", ll)

# confusion matrix (visual): fix the row/column order to the full set of encoded
# classes and label the axes with the original class names instead of 0..k-1
class_ids = np.arange(len(target_le.classes_))
class_names = [str(name) for name in target_le.classes_]
cm = confusion_matrix(y_val, y_val_pred, labels=class_ids)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion matrix (validation)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# -------------------------
# 9) FINAL PREDICTIONS for submission (probabilities)
# -------------------------
# Class probabilities for every test row from the tuned model.
final_proba = best_knn.predict_proba(test_scaled)

# One "Status_<class>" column per class, in target_le.classes_ order.
cols = [f"Status_{cls}" for cls in target_le.classes_]
submission = pd.DataFrame(final_proba, columns=cols)

# Leading id column: the test file's own ids when it has them,
# otherwise a 0-based row counter.
id_values = test['id'] if 'id' in test.columns else np.arange(len(test))
submission.insert(0, 'id', id_values)

submission.to_csv("/kaggle/working/Answer_with_metrics.csv", index=False)
print("\nSaved submission to /kaggle/working/Answer_with_metrics.csv")
# last expression of the cell: the notebook displays the first rows
submission.head()