In [6]:
import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score,
    balanced_accuracy_score,
    confusion_matrix,
    classification_report,
    average_precision_score
)

from merlin.algorithms.kernels import FidelityKernel

# Seed both the torch and NumPy global RNGs so repeated runs start from the same state.
torch.manual_seed(0)
np.random.seed(0)

# Column names that save_cleaned_data() assigns to the scaled feature matrix.
# NOTE(review): the preprocessing helpers select features by position
# (df.iloc[:, 1:-1]), not by these names -- this list presumably mirrors the
# CSV's feature columns in order; confirm against the input files.
FEATURE_COLS = [
    'income',
    'credit_utilization',
    'payment_history',
    'num_open_accounts',
    'debt_to_income',
    'loan_amount',
]

# -----------------------------
# 1. IO helpers
# -----------------------------
def load_data(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, announcing the path on stdout first.

    Args:
        file_path: Location of the CSV file to parse.

    Returns:
        The parsed contents as returned by ``pd.read_csv`` with default options.
    """
    print(f"Loading data from {file_path}")
    frame = pd.read_csv(file_path)
    return frame

def save_cleaned_data(X: np.ndarray, output_path: str) -> None:
    """Write a feature matrix to CSV, labelling its columns with FEATURE_COLS.

    Args:
        X: 2-D feature array; its column count must match ``len(FEATURE_COLS)``,
            otherwise the DataFrame constructor raises.
        output_path: Destination CSV path. The row index is not written.
    """
    print(f"\nSaving cleaned data to {output_path}")
    pd.DataFrame(X, columns=FEATURE_COLS).to_csv(output_path, index=False)
    print("Done.")

# -----------------------------
# 2. Preprocessing
# -----------------------------
def preprocess_train(df: pd.DataFrame):
    """Split the training frame into scaled features and labels.

    Columns are selected by position. The layout is assumed to be:
    column 0 = ID, columns 1..-2 = features, last column = label.

    Args:
        df: Raw training data in the layout described above.

    Returns:
        A ``(X_train, y_train, scaler)`` tuple: the min-max scaled feature
        matrix, the label array, and the ``MinMaxScaler`` fitted on these
        training features (to be reused for the test split).
    """
    feature_block = df.iloc[:, 1:-1]
    label_column = df.iloc[:, -1]

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(feature_block.values)
    y_train = label_column.values

    print("Train preprocessing complete.")
    return X_train, y_train, scaler

def preprocess_test(df: pd.DataFrame, scaler: MinMaxScaler):
    """Split the test frame into features and labels, scaling with a fitted scaler.

    Uses the same positional layout as the training data: column 0 is skipped,
    columns 1..-2 are features, and the last column is the label.

    Args:
        df: Raw test data.
        scaler: Scaler already fitted on the training features; it is only
            applied here, never refitted.

    Returns:
        A ``(X_test, y_test)`` tuple of the transformed features and raw labels.
    """
    feature_block = df.iloc[:, 1:-1]
    label_column = df.iloc[:, -1]

    X_test = scaler.transform(feature_block.values)
    y_test = label_column.values

    print("Test preprocessing complete.")
    return X_test, y_test

# -----------------------------
# 3. Torch conversion
# -----------------------------
def convert_dataset_to_tensor(x_train, x_test, y_train, y_test):
    """Copy the four dataset splits into torch tensors.

    Feature splits become ``torch.float32`` tensors and label splits become
    ``torch.long`` tensors.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as tensors, in that order.
    """
    feature_tensors = [
        torch.tensor(split, dtype=torch.float32) for split in (x_train, x_test)
    ]
    label_tensors = [
        torch.tensor(split, dtype=torch.long) for split in (y_train, y_test)
    ]
    return feature_tensors[0], feature_tensors[1], label_tensors[0], label_tensors[1]

# -----------------------------
# 4. Load + preprocess
# -----------------------------
# Input CSVs, given as paths relative to the notebook's working directory.
train_path = "credit_train.csv"
test_path  = "credit_test.csv"

df_train = load_data(train_path)
df_test = load_data(test_path)

# The scaler is fitted on the training features only and then reused for the
# test features, so no test-set statistics leak into the scaling.
X_train_np, y_train_np, scaler = preprocess_train(df_train)
X_test_np,  y_test_np  = preprocess_test(df_test, scaler)

# (optional) save cleaned CSVs
# Only the scaled feature columns are written; the ID and label columns are
# not part of these files.
save_cleaned_data(X_train_np, "credit_train_cleaned.csv")
save_cleaned_data(X_test_np,  "credit_test_cleaned.csv")

# -----------------------------
# 5. To tensors
# -----------------------------
# Features become float32 tensors and labels become int64 (long) tensors.
x_train, x_test, y_train, y_test = convert_dataset_to_tensor(
    X_train_np, X_test_np, y_train_np, y_test_np
)



# Build DataLoader
#train_loader = convert_tensor_to_loader(x_train, y_train, batch_size=6)


Loading data from credit_train.csv
Loading data from credit_test.csv
Train preprocessing complete.
Test preprocessing complete.

Saving cleaned data to credit_train_cleaned.csv
Done.

Saving cleaned data to credit_test_cleaned.csv
Done.


In [12]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Capitalised aliases for the tensor splits produced by the preprocessing cell.
# They stay bound to the original tensors so other cells see unchanged objects.
X_train, X_test, y_train, y_test = x_train, x_test, y_train, y_test


def _to_numpy(data):
    """Return `data` as a NumPy array, detaching/moving torch tensors to CPU first."""
    if isinstance(data, torch.Tensor):
        return data.detach().cpu().numpy()
    return np.asarray(data)


# scikit-learn estimators and metrics operate on NumPy arrays. Convert once,
# explicitly, instead of relying on implicit tensor -> array coercion inside
# every fit/predict/metric call (which fails for CUDA or grad-tracking tensors).
X_train_arr, X_test_arr = _to_numpy(X_train), _to_numpy(X_test)
y_train_arr, y_test_arr = _to_numpy(y_train), _to_numpy(y_test)

# How many correctly classified test instances to print per model.
N_CORRECT_TO_SHOW = 20

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=200, random_state=42),
    "Support Vector Machine (SVM)": SVC(kernel='linear', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(n_neighbors=5),
    "Naïve Bayes": GaussianNB()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_arr, y_train_arr)
    y_pred = model.predict(X_test_arr)

    # ROC-AUC needs a continuous score: use the class-1 probability when the
    # model exposes one, otherwise fall back to the signed decision margin
    # (e.g. SVC without probability estimates).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test_arr)[:, 1]  # prob of class 1
    else:
        y_score = model.decision_function(X_test_arr)

    accuracy = accuracy_score(y_test_arr, y_pred)
    conf_matrix = confusion_matrix(y_test_arr, y_pred)
    class_report = classification_report(y_test_arr, y_pred)
    AUC = roc_auc_score(y_test_arr, y_score)

    # Print results for each model
    print(f"\nModel: {name}")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"AUC: {AUC * 100:.2f}%")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", class_report)

    # Identify correctly classified instances with one vectorised comparison;
    # .tolist() yields plain Python ints for indexing and printing.
    correct_indices = np.flatnonzero(y_test_arr == y_pred).tolist()
    print("Correctly classified instances:")
    # Display the first N_CORRECT_TO_SHOW correctly classified instances.
    for i in correct_indices[:N_CORRECT_TO_SHOW]:
        print(f"Instance {i}: {X_test[i]}, Label: {y_test[i]}")



Model: Logistic Regression
Accuracy: 93.75%
AUC: 97.39%
Confusion Matrix:
 [[185   2]
 [ 13  40]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96       187
           1       0.95      0.75      0.84        53

    accuracy                           0.94       240
   macro avg       0.94      0.87      0.90       240
weighted avg       0.94      0.94      0.93       240

Correctly classified instances:
Instance 0: tensor([0.6223, 0.4816, 0.4555, 0.5625, 0.4190, 0.6638]), Label: 0
Instance 1: tensor([0.6727, 0.1717, 0.5933, 0.5000, 0.1810, 0.4886]), Label: 0
Instance 2: tensor([0.6749, 0.4827, 0.7928, 0.7500, 0.3380, 0.3860]), Label: 0
Instance 3: tensor([0.4979, 0.4082, 0.6302, 0.8750, 0.3680, 0.7179]), Label: 0
Instance 4: tensor([0.6813, 0.5572, 0.7798, 0.0000, 0.3680, 0.4160]), Label: 0
Instance 5: tensor([0.7618, 0.2657, 0.8796, 0.5000, 0.2310, 0.1467]), Label: 0
Instance 6: tensor([0.6384, 0.5140, 0.6692, 