In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV


# Pick the compute device once; models and tensors below are moved onto it with .to(device).
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
unit_name = torch.cuda.get_device_name(0) if has_cuda else 'CPU'
print(f"‚ö° Processing Unit: {unit_name}")
print(f"   Device Tensor: {device}")

## 0. Loading the dataset

In [None]:
# Labelled training data and the unlabelled data used for the submission files.
train_path = 'finance/train_updated.csv'
test_path = 'finance/test_updated.csv'

try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError:
    # Print a hint, then let the original error propagate so the run stops here.
    print("‚ùå Files not found. Ensure 'finance/train_updated.csv' exists.")
    raise

# Column holding the label and column holding the row identifier.
target = 'RiskFlag'
id_col = 'ProfileID'

# Features / labels for training; the id column is not a feature.
y = train_df[target]
X = train_df.drop(columns=[target, id_col])

# Submission features plus the ids that are written next to each prediction.
test_ids = test_df[id_col]
X_test_submission = test_df.drop(columns=[id_col])

X.head()

In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns

# Only integer/float columns are plotted.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Grid geometry: a fixed number of columns and as many rows as needed for all features.
n = len(num_cols)
cols = 3
rows = math.ceil(n / cols)

fig = plt.figure(figsize=(6 * cols, 4 * rows))

# One boxplot per numeric feature; subplot positions are 1-based.
for position, feature in enumerate(num_cols, start=1):
    ax = fig.add_subplot(rows, cols, position)
    sns.boxplot(y=X[feature], ax=ax)
    ax.set_title(feature, fontsize=12)

fig.tight_layout()
# y=0.02 puts the figure title at the bottom edge, caption-style.
fig.suptitle("Figure 1: Boxplots for numeric features", fontsize=14, y=0.02)
plt.show()


In [None]:
# Reuses num_cols / cols / rows computed for the boxplot grid.
fig = plt.figure(figsize=(6 * cols, 4 * rows))

# One 30-bin histogram per numeric feature; subplot positions are 1-based.
for position, feature in enumerate(num_cols, start=1):
    ax = fig.add_subplot(rows, cols, position)
    ax.hist(X[feature], bins=30)
    ax.set_title(feature, fontsize=12)

fig.tight_layout()
# y=0.02 puts the figure title at the bottom edge, caption-style.
fig.suptitle("Figure 2: Histograms showing feature distributions", fontsize=14, y=0.02)
plt.show()


In [None]:
# Display pandas' default summary statistics for the feature frame.
X.describe()

In [None]:
import pandas as pd

# Transposed summary: one row per feature, one column per statistic.
desc = X.describe().T
cols_per_table = 2  # features (rows of `desc`) shown per table; e.g. 3 for fewer, longer tables

# Cut the summary into consecutive slices of `cols_per_table` rows.
tables = [
    desc.iloc[start:start + cols_per_table]
    for start in range(0, len(desc), cols_per_table)
]

# Show each slice under a numbered heading.
for table_no, table in enumerate(tables, start=1):
    print(f"\n--- Table {table_no} ---\n")
    display(table)


## 1. Preprocessing and EDA

In [None]:
# Start from the raw frames so this cell is safe to re-run: every step below either
# re-encodes columns or filters rows, and would corrupt frames that had already been
# transformed (e.g. mapping Yes/No a second time turns every value into NaN).
X = train_df.drop([target, id_col], axis=1)
y = train_df[target]
X_test_submission = test_df.drop([id_col], axis=1)

# --- Binary Yes/No columns -> 1/0 (any other value becomes NaN) ---
binary_cols = ['OwnsProperty', 'FamilyObligation', 'JointApplicant']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    X[col] = X[col].map(yes_no_codes)
    X_test_submission[col] = X_test_submission[col].map(yes_no_codes)

# --- Ordered categoricals -> integer ranks following the explicit orderings below ---
qualification_order = ["High School", "Bachelor's", "Master's", "PhD"]
workcategory_order = ["Unemployed", "Part-time", "Full-time", "Self-employed"]

ordinal_cols = ['QualificationLevel', 'WorkCategory']
# Categories outside the declared orderings are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(categories=[qualification_order, workcategory_order],
                                 handle_unknown='use_encoded_value', unknown_value=-1)

# The encoder is fitted on the training frame and only applied to the submission frame.
X[ordinal_cols] = ordinal_encoder.fit_transform(X[ordinal_cols])
X_test_submission[ordinal_cols] = ordinal_encoder.transform(X_test_submission[ordinal_cols])

nominal_cols = ['RelationshipStatus', 'FundUseCase']
# Every column not listed as ordinal, nominal or binary is treated as numeric.
numerical_cols = [c for c in X.columns if c not in ordinal_cols + nominal_cols + binary_cols]

# --- Outlier removal (training rows only) ---
# A row is dropped when ANY numeric feature falls outside the 1.5 * IQR fences.
print(" Identifying Outliers ")
Q1 = X[numerical_cols].quantile(0.25)
Q3 = X[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = ((X[numerical_cols] < (Q1 - 1.5 * IQR)) | (X[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# X and y share the same index, so the same boolean mask keeps them aligned.
X = X[~outlier_condition]
y = y[~outlier_condition]
print(f"‚úÖ Removed {outlier_condition.sum()} outliers. Remaining: {len(X)} samples.")

# --- Final feature matrix: scaled numerics, one-hot nominals, pass-through ordinal/binary ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('nom', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_cols),
        ('ord', 'passthrough', ordinal_cols),
        ('bin', 'passthrough', binary_cols)
    ]
)

# NOTE(review): the preprocessor is fitted on all labelled rows BEFORE the train/validation
# split, so scaling statistics include the validation rows. Kept as-is because X_processed
# is consumed elsewhere in the notebook; consider fitting on the training split only.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test_submission)

# Stratified 80/20 split of the processed rows.
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, stratify=y, random_state=42)

print(f"‚úÖ Data Processed. Input Dimensions: {X_train.shape[1]} features.")


## 20% dataset (stratified subsample of the processed training data)

In [None]:
# Stratified 20% subsample of the processed rows; train_test_split returns
# (X_kept, X_rest, y_kept, y_rest) and only the "kept" halves are used.
mini_split = train_test_split(
    X_processed,
    y,
    test_size=0.8,
    stratify=y,
    random_state=42,
)
X_mini_processed, y_mini = mini_split[0], mini_split[2]
del mini_split  # release the unused 80% remainder

X_mini_processed.shape

In [None]:
from ydata_profiling import ProfileReport

print("‚è≥ Generating EDA Profile Report... ")

# Profile the raw training frame (labels and ids included) and write it out as HTML.
profile_options = dict(
    title="Loan Risk EDA Report",
    minimal=True,
    explorative=True,
)
profile = ProfileReport(train_df, **profile_options)
profile.to_file("finance_eda_report.html")

print("‚úÖ Report generated: 'finance_eda_report.html'")

## 2. SVM

In [None]:
print("‚è≥ Training SVM... ")

# Nystroem feature map with an RBF kernel, followed by a class-weighted linear SVM.
feature_map = Nystroem(kernel='rbf', gamma=0.1, n_components=2500, random_state=42)
svm_clf = LinearSVC(dual=False, C=1.0, class_weight='balanced', max_iter=2000, random_state=42)

# The linear SVM is wrapped in a 3-fold CalibratedClassifierCV inside the pipeline.
svm_pipeline = Pipeline([
    ('feature_map', feature_map),
    ('svm_calibrated', CalibratedClassifierCV(svm_clf, cv=3))
])

# Fit on the training split only. X_processed also contains the X_val rows, so fitting
# on it would make any later score on X_val a training-set score rather than a held-out
# one. This also matches the Naive Bayes model, which is fitted on X_train.
# NOTE(review): refit on X_processed separately if a submission model trained on all
# labelled rows is wanted.
svm_pipeline.fit(X_train, y_train)

print("‚úÖ SVM Training Complete")

## 3. Logistic Regression & Bayesian Classifier


In [None]:
print("‚è≥ Training Logistic Regression...")
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
# Fit on the training split only: X_processed includes the X_val rows, so fitting on it
# would turn every later X_val score into a training-set score. This also keeps the model
# consistent with the Naive Bayes model below.
lr_model.fit(X_train, y_train)
print("‚úÖ Logistic Regression Complete")


print("‚è≥ Training Gaussian Naive Bayes...")
# Default-configured Gaussian Naive Bayes on the same training split.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print("‚úÖ Naive Bayes Complete")

## 4. Neural Networks

In [None]:

class LoanDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional labels.

    Args:
        features: Array-like of numeric features, one row per sample; converted
            to a float32 tensor.
        labels: Optional labels, one per row. A pandas Series, NumPy array or
            plain sequence is accepted. When omitted, items are feature rows only.

    Items are ``(features_row, label)`` pairs when labels were given, otherwise
    just ``features_row``.
    """

    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is None:
            self.labels = None
        else:
            # np.asarray accepts Series, ndarray and list alike; relying on
            # `.values` would only work for pandas objects.
            label_array = np.asarray(labels, dtype=np.float32)
            # Shape (n, 1), matching the single-unit output of the network head.
            self.labels = torch.tensor(label_array, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]


class ResBlock(nn.Module):
    """Residual block that keeps the feature width unchanged.

    The input passes through Linear -> BatchNorm -> GELU -> Dropout -> Linear ->
    BatchNorm, is added back to the original input, and the sum goes through GELU.

    Args:
        n_features: Width of both the input and the output.
        dropout_rate: Dropout probability used inside the block.
    """

    def __init__(self, n_features, dropout_rate=0.3):
        super().__init__()
        # Layer order is kept fixed so state_dict keys (block.0, block.1, ...) stay stable.
        layers = [
            nn.Linear(n_features, n_features),
            nn.BatchNorm1d(n_features),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(n_features, n_features),
            nn.BatchNorm1d(n_features),
        ]
        self.block = nn.Sequential(*layers)
        self.activation = nn.GELU()

    def forward(self, x):
        # Skip connection: transformed input plus the untouched input.
        residual = self.block(x)
        return self.activation(residual + x)

class DeepRiskNet(nn.Module):
    """Residual MLP that outputs one probability per input row.

    Layout: a 512-wide input layer, two 512-wide residual blocks, a projection
    to 256, two 256-wide residual blocks, then a 256 -> 64 -> 1 head followed by
    a sigmoid.

    Args:
        input_dim: Number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Input layer: raw features -> 512-wide representation.
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
        )

        # Residual trunk; heavier dropout (0.4) at width 512, lighter (0.3) at width 256.
        trunk = [
            ResBlock(512, dropout_rate=0.4),
            ResBlock(512, dropout_rate=0.4),
            nn.Linear(512, 256),
            nn.GELU(),
            ResBlock(256, dropout_rate=0.3),
            ResBlock(256, dropout_rate=0.3),
        ]
        self.res_blocks = nn.Sequential(*trunk)

        # Head: 256 -> 64 -> 1 raw score.
        self.output_head = nn.Sequential(
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

        # The sigmoid maps the raw score into (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.res_blocks(self.input_layer(x))
        return self.sigmoid(self.output_head(hidden))


# Training hyper-parameters.
BATCH_SIZE = 1024
EPOCHS = 50
LEARNING_RATE = 0.01

# Pair the training features with the training labels. The full label vector `y`
# also contains the validation rows and is not in X_train's shuffled order, so using
# it here would attach the wrong label to each row.
train_dataset = LoanDataset(X_train, y_train)
val_dataset = LoanDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)


model = DeepRiskNet(X_train.shape[1]).to(device)
# The network already ends in a sigmoid, so plain BCELoss on probabilities is used.
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# The scheduler is stepped once per batch, hence steps_per_epoch=len(train_loader).
scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, steps_per_epoch=len(train_loader), epochs=EPOCHS)


print("üöÄ Starting Training...")
best_val_auc = 0
patience = 10           # epochs without a validation-AUC improvement before stopping
patience_counter = 0

for epoch in range(EPOCHS):
    # ---- one pass over the training split ----
    model.train()
    train_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

    # ---- validation ROC-AUC on the held-out split ----
    model.eval()
    all_val_preds = []
    all_val_labels = []
    with torch.no_grad():
        for X_val_b, y_val_b in val_loader:
            X_val_b = X_val_b.to(device)
            preds = model(X_val_b)
            all_val_preds.extend(preds.cpu().numpy())
            all_val_labels.extend(y_val_b.numpy())

    val_auc = roc_auc_score(all_val_labels, all_val_preds)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Val ROC-AUC: {val_auc:.4f}")

    # ---- checkpoint on improvement, otherwise count towards early stopping ----
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        torch.save(model.state_dict(), 'best_model.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("‚èπÔ∏è Early stopping triggered.")
            break


# Restore the weights of the best-scoring epoch.
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
print(f"üèÜ Best Val AUC: {best_val_auc:.4f}")

In [None]:
# NOTE(review): the split roles look intentionally swapped here — train on the 20% split
# (X_val) and validate on the 80% split (X_train), like the other "20" models that are
# fitted on X_val. Confirm this is the intended experiment.
val_dataset = LoanDataset(X_train, y_train)
train_dataset = LoanDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)


model20 = DeepRiskNet(X_train.shape[1]).to(device)
criterion = nn.BCELoss()
# The optimizer must own model20's parameters; binding it to `model` would leave model20
# untrained and keep fine-tuning the main network on its own validation rows.
optimizer = optim.AdamW(model20.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# The scheduler is stepped once per batch, hence steps_per_epoch=len(train_loader).
scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, steps_per_epoch=len(train_loader), epochs=EPOCHS)


print("üöÄ Starting Training...")
best_val_auc = 0
patience = 10           # epochs without a validation-AUC improvement before stopping
patience_counter = 0

for epoch in range(EPOCHS):
    # ---- one pass over the (20%) training data ----
    model20.train()
    train_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model20(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()

    # ---- validation ROC-AUC on the (80%) held-out data ----
    model20.eval()
    all_val_preds = []
    all_val_labels = []
    with torch.no_grad():
        for X_val_b, y_val_b in val_loader:
            X_val_b = X_val_b.to(device)
            preds = model20(X_val_b)
            all_val_preds.extend(preds.cpu().numpy())
            all_val_labels.extend(y_val_b.numpy())

    val_auc = roc_auc_score(all_val_labels, all_val_preds)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss/len(train_loader):.4f} | Val ROC-AUC: {val_auc:.4f}")

    # ---- checkpoint on improvement, otherwise count towards early stopping ----
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        # Separate file so the main network's 'best_model.pth' checkpoint is not overwritten.
        torch.save(model20.state_dict(), 'best_model_20.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("‚èπÔ∏è Early stopping triggered.")
            break


# Restore the weights of the best-scoring epoch, as the main training loop does.
model20.load_state_dict(torch.load('best_model_20.pth', weights_only=True))
print(f"üèÜ Best Val AUC: {best_val_auc:.4f}")

In [None]:
print("‚è≥ Training Logistic Regression...")
# Same hyper-parameters as `lr_model`, but fitted on the 20% split (X_val, y_val).
lr_model_2 = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
)
lr_model_2.fit(X_val, y_val)

In [None]:
print("‚è≥ Training Gaussian Naive Bayes...")
# Default-configured Gaussian Naive Bayes fitted on the 20% split (X_val, y_val).
nb_model_20 = GaussianNB()
nb_model_20 = nb_model_20.fit(X_val, y_val)  # fit() returns the estimator itself
print("‚úÖ Naive Bayes 20 Complete")

## 5. Generate Separate Submission Files

In [None]:

def create_submission(ids, predictions, filename, method_name):
    """Write a two-column submission CSV and print which file was written.

    Args:
        ids: Profile identifiers, one per prediction (written as ``ProfileID``).
        predictions: Predicted labels aligned with ``ids`` (written as ``RiskFlag``).
        filename: Destination CSV path; missing parent directories are created.
        method_name: Model name, used only in the printed confirmation.
    """
    import os  # local import keeps this function usable without touching the import cell

    # to_csv does not create folders, so make sure the target directory exists first.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    sub_df = pd.DataFrame({'ProfileID': ids, 'RiskFlag': predictions})
    sub_df.to_csv(filename, index=False)
    print(f"‚úÖ Generated '{filename}' ({method_name})")


# --- Neural network: sigmoid probabilities thresholded at 0.5 ---
model.eval()
test_tensor = torch.tensor(X_test_processed, dtype=torch.float32).to(device)
with torch.no_grad():
    nn_probs = model(test_tensor).cpu().numpy().flatten()
nn_preds = (nn_probs > 0.5).astype(int)
create_submission(test_ids, nn_preds, 'finance_sub/submission_nn.csv', 'Neural Network')

# --- Classical models: class labels straight from predict(), one file per model ---
classical_models = (
    (svm_pipeline, 'finance_sub/submission_svm.csv', 'SVM'),
    (lr_model, 'finance_sub/submission_lr.csv', 'Logistic Regression'),
    (nb_model, 'finance_sub/submission_nb.csv', 'Naive Bayes'),
)
classical_preds = []
for estimator, out_path, label in classical_models:
    # Predict, then write immediately, so each file is produced as soon as it is ready.
    classical_preds.append(estimator.predict(X_test_processed))
    create_submission(test_ids, classical_preds[-1], out_path, label)

svm_preds, lr_preds, nb_preds = classical_preds

In [None]:
print("‚è≥ Training SVM... ")

# Same recipe as the main SVM pipeline (Nystroem RBF features, class-weighted
# LinearSVC, 3-fold calibration wrapper), fitted on the 20% split (X_val, y_val).
feature_map = Nystroem(n_components=2500, kernel='rbf', gamma=0.1, random_state=42)
svm_clf_20 = LinearSVC(C=1.0, dual=False, class_weight='balanced', max_iter=2000, random_state=42)
calibrated_svm_20 = CalibratedClassifierCV(svm_clf_20, cv=3)

svm_pipeline_1 = Pipeline(steps=[
    ('feature_map', feature_map),
    ('svm_calibrated', calibrated_svm_20),
])

svm_pipeline_1.fit(X_val, y_val)

## 6. Held-out F1 Comparison

In [None]:
from sklearn.metrics import f1_score
from tabulate import tabulate

# Rows of [model name, weighted F1 formatted to 4 decimals] for the summary table.
# Models without a "20" suffix are scored on X_val; the "20" models were fitted on
# X_val, so they are scored on X_train, which is held-out data for them.
results = []

# --- Neural network (scored on X_val) ---
val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
model.eval()

with torch.no_grad():
    nn_probs = model(val_tensor).cpu().numpy().flatten()
    nn_preds = (nn_probs > 0.5).astype(int)  # hard labels at a 0.5 threshold

nn_f1 = f1_score(y_val, nn_preds, average="weighted")
results.append(["Neural Network", f"{nn_f1:.4f}"])

# --- Neural network "20" (scored on X_train) ---
val_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
model20.eval()

with torch.no_grad():
    nn_probs = model20(val_tensor).cpu().numpy().flatten()
    nn_preds = (nn_probs > 0.5).astype(int)

nn_f1_20 = f1_score(y_train, nn_preds, average="weighted")
results.append(["Neural Network 20", f"{nn_f1_20:.4f}"])

# --- SVM ---
svm_preds = svm_pipeline.predict(X_val)
svm_f1 = f1_score(y_val, svm_preds, average="weighted")
results.append(["SVM", f"{svm_f1:.4f}"])

svm_preds2 = svm_pipeline_1.predict(X_train)
svm_f2 = f1_score(y_train, svm_preds2, average="weighted")
results.append(["SVM20", f"{svm_f2:.4f}"])

# --- Logistic regression ---
lr_preds = lr_model.predict(X_val)
lr_f1 = f1_score(y_val, lr_preds, average="weighted")
results.append(["Logistic Regression", f"{lr_f1:.4f}"])

lr_preds = lr_model_2.predict(X_train)
lr_f1 = f1_score(y_train, lr_preds, average="weighted")
results.append(["Logistic Regression 20", f"{lr_f1:.4f}"])

# --- Naive Bayes ---
nb_preds = nb_model.predict(X_val)
nb_f1 = f1_score(y_val, nb_preds, average="weighted")
results.append(["Naive Bayes", f"{nb_f1:.4f}"])

# nb_model_20 was fitted on X_val, so it is scored on X_train like the other "20"
# models; scoring it on X_val would report performance on its own training data.
nb_preds = nb_model_20.predict(X_train)
nb_f1_20 = f1_score(y_train, nb_preds, average="weighted")
results.append(["Naive Bayes 20", f"{nb_f1_20:.4f}"])


print(tabulate(results, headers=["Model", "F1 Score"], tablefmt="fancy_grid"))

## 20% SVM

In [None]:
# 80/20 split of the 20% subsample. A fixed seed and stratification on the label keep
# the split reproducible and the class ratio stable, as in the other splits in this notebook.
X_mini_train, X_mini_test, y_mini_train, y_mini_test = train_test_split(
    X_mini_processed,
    y_mini,
    test_size=0.2,
    stratify=y_mini,
    random_state=42,
)

In [None]:
print("‚è≥ Training SVM... ")

# Nystroem RBF features, class-weighted LinearSVC and a 3-fold calibration wrapper,
# fitted on the training part of the mini subsample.
feature_map = Nystroem(n_components=2500, kernel='rbf', gamma=0.1, random_state=42)
svm_clf20 = LinearSVC(C=1.0, dual=False, class_weight='balanced', max_iter=2000, random_state=42)
calibrated_svm20 = CalibratedClassifierCV(svm_clf20, cv=3)

# NOTE(review): this rebinds `svm_pipeline`, replacing the pipeline of the same name
# trained earlier in the notebook; cells re-run after this one will see the mini model.
svm_pipeline = Pipeline(steps=[
    ('feature_map', feature_map),
    ('svm_calibrated', calibrated_svm20),
])

svm_pipeline.fit(X_mini_train, y_mini_train)


print("‚úÖ SVM Training Complete")

## 20% Neural Network

In [None]:

class LoanDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional labels.

    Re-declares the class of the same name defined earlier in the notebook; this
    later definition is the one in effect for the cells that follow.

    Args:
        features: Array-like of numeric features, one row per sample; converted
            to a float32 tensor.
        labels: Optional labels, one per row. A pandas Series, NumPy array or
            plain sequence is accepted. When omitted, items are feature rows only.

    Items are ``(features_row, label)`` pairs when labels were given, otherwise
    just ``features_row``.
    """

    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is None:
            self.labels = None
        else:
            # np.asarray accepts Series, ndarray and list alike; relying on
            # `.values` would only work for pandas objects.
            label_array = np.asarray(labels, dtype=np.float32)
            # Shape (n, 1), matching the single-unit output of the network head.
            self.labels = torch.tensor(label_array, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]


class ResBlock(nn.Module):
    """Residual block that keeps the feature width unchanged.

    Re-declares the class of the same name defined earlier in the notebook.
    The input passes through Linear -> BatchNorm -> GELU -> Dropout -> Linear ->
    BatchNorm, is added back to the original input, and the sum goes through GELU.

    Args:
        n_features: Width of both the input and the output.
        dropout_rate: Dropout probability used inside the block.
    """

    def __init__(self, n_features, dropout_rate=0.3):
        super().__init__()
        # Layer order is kept fixed so state_dict keys (block.0, block.1, ...) stay stable.
        layers = [
            nn.Linear(n_features, n_features),
            nn.BatchNorm1d(n_features),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(n_features, n_features),
            nn.BatchNorm1d(n_features),
        ]
        self.block = nn.Sequential(*layers)
        self.activation = nn.GELU()

    def forward(self, x):
        # Skip connection: transformed input plus the untouched input.
        residual = self.block(x)
        return self.activation(residual + x)

class DeepRiskNet(nn.Module):
    """Residual MLP that outputs one probability per input row.

    Re-declares the class of the same name defined earlier in the notebook.
    Layout: a 512-wide input layer, two 512-wide residual blocks, a projection
    to 256, two 256-wide residual blocks, then a 256 -> 64 -> 1 head followed by
    a sigmoid.

    Args:
        input_dim: Number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Input layer: raw features -> 512-wide representation.
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
        )

        # Residual trunk; heavier dropout (0.4) at width 512, lighter (0.3) at width 256.
        trunk = [
            ResBlock(512, dropout_rate=0.4),
            ResBlock(512, dropout_rate=0.4),
            nn.Linear(512, 256),
            nn.GELU(),
            ResBlock(256, dropout_rate=0.3),
            ResBlock(256, dropout_rate=0.3),
        ]
        self.res_blocks = nn.Sequential(*trunk)

        # Head: 256 -> 64 -> 1 raw score.
        self.output_head = nn.Sequential(
            nn.Linear(256, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )

        # The sigmoid maps the raw score into (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.res_blocks(self.input_layer(x))
        return self.sigmoid(self.output_head(hidden))


# Training hyper-parameters (same values as set before the main network's training loop):
# samples per batch, maximum number of epochs, and the learning rate.
BATCH_SIZE, EPOCHS, LEARNING_RATE = 1024, 50, 0.01

In [None]:
# Data: only the training part of the mini subsample is used; there is no validation
# loop or early stopping in this run.
mini_train_dataset = LoanDataset(X_mini_train, y_mini_train)
mini_train_loader = DataLoader(mini_train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# Fresh network, loss, optimizer and per-batch one-cycle learning-rate schedule.
model20 = DeepRiskNet(X_mini_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.AdamW(model20.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, steps_per_epoch=len(mini_train_loader), epochs=EPOCHS)

print("üöÄ Starting Training on X_mini ONLY...")

for epoch in range(EPOCHS):
    model20.train()
    train_loss = 0

    for X_batch, y_batch in mini_train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model20(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()

        # Parameter update first, then advance the learning-rate schedule by one batch.
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    # Mean of the per-batch losses over this epoch.
    mean_loss = train_loss / len(mini_train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {mean_loss:.4f}")

print("üèÅ Training complete on mini dataset.")

In [None]:
from sklearn.metrics import f1_score
from tabulate import tabulate

# Rows of [model name, weighted F1 formatted to 4 decimals] for the mini-split table.
results = []

# --- Neural network trained on the mini split, scored on the mini test part ---
model20.eval()
X_test_tensor = torch.tensor(X_mini_test, dtype=torch.float32).to(device)
with torch.no_grad():
    nn_probs = model20(X_test_tensor).cpu().numpy().flatten()
nn_preds = (nn_probs > 0.5).astype(int)  # hard labels at a 0.5 threshold
nn_f1 = f1_score(y_mini_test, nn_preds, average="weighted")

# --- SVM pipeline, scored on the same mini test part ---
svm_preds = svm_pipeline.predict(X_mini_test)
svm_f1 = f1_score(y_mini_test, svm_preds, average="weighted")

results.extend([
    ["Neural Network (mini)", f"{nn_f1:.4f}"],
    ["SVM (mini)", f"{svm_f1:.4f}"],
])

print(tabulate(results, headers=["Model", "Test F1 Score"], tablefmt="fancy_grid"))
