<a href="https://colab.research.google.com/github/seshadrite/aimlexercises/blob/main/exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Loan Default Prediction using KNN (Python / scikit-learn)
# --------------------------------------------------------
# This script:
# 1) Loads the given data
# 2) Encodes EmploymentType
# 3) Splits train/test
# 4) Scales features (critical for KNN)
# 5) Trains KNN
# 6) Evaluates model
# 7) Tries multiple K values to pick the best

import pandas as pd
from io import StringIO

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import requests
import io
github_csv_url = "https://raw.githubusercontent.com/seshadrite/aimlexercises/main/exercise2.csv"
try:
    response = requests.get(github_csv_url)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    # Read the content into a pandas DataFrame
    df = pd.read_csv(io.StringIO(response.text))
    print("CSV loaded successfully!")
    print(df.head())
    # Perform NULL Checks
    df.info()
    df.isnull().sum()


except requests.exceptions.RequestException as e:
    print(f"Error fetching the CSV file: {e}")
    print("Please ensure the URL is correct and accessible.")
except pd.errors.EmptyDataError:
    print("Error: No data to parse. The CSV file might be empty.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



# ---- 2) Encode EmploymentType (Salaried=0, Self-Employed=1) ----
df["EmploymentType"] = df["EmploymentType"].map({"Salaried": 0, "Self-Employed": 1})

# ---- 3) Split features/target ----
X = df.drop(columns=["loan"])
y = df["loan"]

# Stratify keeps class balance similar in train/test (important for tiny datasets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---- 4) Build Pipeline: Scaling + KNN ----
# Scaling is essential: different units (credit score vs loan amount) otherwise distort distances
knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=3, metric="euclidean"))
])

# ---- 5) Train ----
knn_pipeline.fit(X_train, y_train)

# ---- 6) Evaluate ----
y_pred = knn_pipeline.predict(X_test)

print("KNN (k=3) Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---- 7) Try different K values (simple tuning) ----
# With 10 rows, we keep K small. In real projects, use cross-validation.
results = []
for k in range(1, 8, 2):  # odd K: 1,3,5,7
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=k))
    ])
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    results.append((k, acc))

print("\nAccuracy by K:")
for k, acc in results:
    print(f"  k={k}: {acc:.3f}")

best_k, best_acc = max(results, key=lambda x: x[1])
print(f"\nBest K on this split: k={best_k} (accuracy={best_acc:.3f})")

# ---- 8) Predict for a new customer (example) ----
# Format must match training columns exactly:
new_customer = pd.DataFrame([{
    "Age": 40,
    "AnnualIncome(lakhs)": 11,
    "CreditScore(300-900)": 675,
    "LoanAmount(lakhs)": 9,
    "LoanTerm(years)": 10,
    "EmploymentType": 1  # Self-Employed
}])

prediction = knn_pipeline.predict(new_customer)[0]
proba = knn_pipeline.predict_proba(new_customer)[0]  # [P(no default), P(default)]

print("\nNew customer prediction:")
print("  Predicted loan(default=1):", prediction)
print("  Probabilities [P(0), P(1)]:", proba)


CSV loaded successfully!
   Experience (yrs)  Training Hours  Working Hours  Projects  \
0                 2              40             38         3   
1                 5              60             42         6   
2                 1              20             35         2   
3                 8              80             45         8   
4                 4              50             40         5   

   Productivity Score  
0                  62  
1                  78  
2                  55  
3                  88  
4                  72  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Experience (yrs)    10 non-null     int64
 1   Training Hours      10 non-null     int64
 2   Working Hours       10 non-null     int64
 3   Projects            10 non-null     int64
 4   Productivity Score  10 non-null     int64
dtypes: int64(5)

KeyError: 'EmploymentType'