<a href="https://colab.research.google.com/github/seshadrite/aimlexercises/blob/main/exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Loan Default Prediction using KNN (Python / scikit-learn)
# --------------------------------------------------------
# This script:
# 1) Loads the given data
# 2) Encodes EmploymentType
# 3) Splits train/test
# 4) Scales features (critical for KNN)
# 5) Trains KNN
# 6) Evaluates model
# 7) Tries multiple K values to pick the best
# 8) Predicts the outcome for a new example customer

from io import StringIO

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import LeaveOneOut, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ---- 1) Create DataFrame from your dataset ----
# The data is embedded as CSV text so the script runs without an external file.
# Columns: five numeric features, the categorical EmploymentType, and the
# binary target column "loan".
raw = """Age,AnnualIncome(lakhs),CreditScore(300-900),LoanAmount(lakhs),LoanTerm(years),EmploymentType,loan
28,6.5,720,5,5,Salaried,0
45,12,680,10,10,Self-Employed,1
35,8,750,6,7,Salaried,0
50,15,640,12,15,Self-Employed,1
30,7,710,5,5,Salaried,0
42,10,660,9,10,Salaried,1
26,5.5,730,4,4,Salaried,0
48,14,650,11,12,Self-Employed,1
38,9,700,7,8,Salaried,0
55,16,620,13,15,Self-Employed,1
"""
df = pd.read_csv(StringIO(raw))

# ---- 2) Encode EmploymentType (Salaried=0, Self-Employed=1) ----
employment_codes = {"Salaried": 0, "Self-Employed": 1}

# Series.map() silently converts any label that is missing from the mapping
# (typo, stray whitespace, empty cell) into NaN, which would only surface
# later as an unrelated-looking error during scaling/KNN. Fail here instead,
# naming the offending labels.
unknown_labels = set(df["EmploymentType"].unique()) - set(employment_codes)
if unknown_labels:
    raise ValueError(
        "Unexpected EmploymentType value(s): "
        f"{sorted(unknown_labels, key=str)}; expected one of {list(employment_codes)}"
    )

df["EmploymentType"] = df["EmploymentType"].map(employment_codes)

# ---- 3) Split features/target ----
# "loan" is the label to predict; every remaining column is a feature.
y = df["loan"]
X = df.drop(columns=["loan"])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class balance similar in both subsets (important for tiny datasets), and
# the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---- 4) Build Pipeline: Scaling + KNN ----
# The features use very different units (credit score vs loan amount), so they
# are standardised first; otherwise the large-valued columns would dominate
# the distance computation.
scaling_step = ("scaler", StandardScaler())
knn_step = ("knn", KNeighborsClassifier(n_neighbors=3, metric="euclidean"))
knn_pipeline = Pipeline([scaling_step, knn_step])

# ---- 5) Train ----
# Fits the scaler and the classifier on the training rows only.
knn_pipeline.fit(X_train, y_train)

# ---- 6) Evaluate ----
y_pred = knn_pipeline.predict(X_test)

# Each metric is computed from the same (y_test, y_pred) pair and printed
# under its own heading, in this order.
for heading, metric_fn in (
    ("KNN (k=3) Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

# ---- 7) Try different K values (simple tuning) ----
# K is selected with leave-one-out cross-validation on the TRAINING rows only.
# Picking K by its score on the held-out test rows would leak the test set
# into model selection, and with so few test rows the accuracy can take only
# a few distinct values, so the "best" K would mostly be decided by ties.


def make_knn_pipeline(n_neighbors):
    """Return an unfitted scaler + KNN pipeline using ``n_neighbors`` neighbours."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)),
    ])


# Every leave-one-out fold trains on len(X_train) - 1 rows, and KNN cannot use
# more neighbours than it has fitted samples, so cap the largest candidate K.
max_k = min(7, len(X_train) - 1)

results = []
for k in range(1, max_k + 1, 2):  # odd K: 1,3,5,7 (fewer if the training set is smaller)
    model = make_knn_pipeline(k)
    # The whole pipeline is cross-validated, so the scaler is re-fitted inside
    # each fold and never sees the row being predicted.
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=LeaveOneOut(), scoring="accuracy"
    )
    results.append((k, float(fold_scores.mean())))

print("\nCross-validated accuracy by K:")
for k, acc in results:
    print(f"  k={k}: {acc:.3f}")

# max() returns the first maximum, so ties are resolved in favour of the
# smallest K.
best_k, best_acc = max(results, key=lambda x: x[1])
print(f"\nBest K by cross-validation: k={best_k} (accuracy={best_acc:.3f})")

# The test rows are used exactly once: to score the K that was already chosen.
best_model = make_knn_pipeline(best_k).fit(X_train, y_train)
best_test_acc = accuracy_score(y_test, best_model.predict(X_test))
print(f"Held-out test accuracy for k={best_k}: {best_test_acc:.3f}")

# ---- 8) Predict for a new customer (example) ----
# Single-row frame; the column names and their order mirror the training
# features, with EmploymentType already encoded as a number.
new_customer = pd.DataFrame({
    "Age": [40],
    "AnnualIncome(lakhs)": [11],
    "CreditScore(300-900)": [675],
    "LoanAmount(lakhs)": [9],
    "LoanTerm(years)": [10],
    "EmploymentType": [1],  # Self-Employed
})

# Both calls return one entry per input row; unpack the single row.
(prediction,) = knn_pipeline.predict(new_customer)
(proba,) = knn_pipeline.predict_proba(new_customer)  # [P(no default), P(default)]

print("\nNew customer prediction:")
print("  Predicted loan(default=1):", prediction)
print("  Probabilities [P(0), P(1)]:", proba)
