In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [2]:
# Load the loan book; customer_id becomes the row index so it is never
# mistaken for a model feature further down.
df = pd.read_csv(
    "Task 3 and 4_Loan_Data.csv",
    index_col="customer_id",
)
df.head()

Unnamed: 0_level_0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [3]:
import pandas as pd
import cupy as cp

def normalize_fico_df_gpu(df, fico_column="FICO"):
    """Add a 1-5 integer rating column obtained by min-max scaling a score column.

    The scores are linearly mapped so that the smallest observed value lands
    on 1 and the largest on 5, then rounded to the nearest integer bucket.
    The result is stored in a new column named ``<fico_column>_normalized``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    fico_column : str, default "FICO"
        Name of the numeric column to normalise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra integer rating column.

    Notes
    -----
    * The arithmetic runs through CuPy when it can be imported and otherwise
      through NumPy; the calls used here exist in both libraries.
    * If every score is identical the min-max range is zero; instead of
      dividing by zero, every row is assigned the middle bucket (3).
    * An empty frame yields an empty integer column.
    """
    # Function-scope import so the helper still works on CPU-only hosts.
    try:
        import cupy as xp
        to_host = xp.asnumpy
    except ImportError:
        import numpy as xp
        to_host = xp.asarray

    # Work on a copy so the caller's frame is left untouched.
    df_out = df.copy()
    target_column = fico_column + "_normalized"

    scores = xp.asarray(df_out[fico_column].to_numpy(), dtype=xp.float32)

    # min()/max() raise on empty input, so short-circuit that case.
    if scores.size == 0:
        df_out[target_column] = to_host(scores).astype(int)
        return df_out

    lowest = scores.min()
    spread = scores.max() - lowest

    if float(spread) == 0.0:
        # Degenerate range: all scores equal, no ordering information.
        scaled = xp.full_like(scores, 3.0)
    else:
        # Normalise to the range 1-5.
        scaled = 1 + 4 * (scores - lowest) / spread

    # Round to the nearest integer bucket and clip to guard against
    # floating-point overshoot at the ends.
    buckets = xp.clip(xp.rint(scaled), 1, 5)

    # Bring the result back to host memory before storing it in pandas.
    df_out[target_column] = to_host(buckets).astype(int)

    return df_out



In [4]:
import cupy as cp
from sklearn.cluster import KMeans

def fico_to_rating_kmeans_gpu(df, fico_col="fico_score", buckets=5):
    """Bucket a score column into ordered ratings with 1-D k-means.

    Despite the name, the clustering runs on the CPU through scikit-learn;
    no GPU work is performed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the scores. It is modified in place: a
        ``rating_kmeans`` column is added (or overwritten).
    fico_col : str, default "fico_score"
        Name of the numeric column to cluster.
    buckets : int, default 5
        Number of clusters / rating levels.

    Returns
    -------
    df : pandas.DataFrame
        The same object that was passed in, now carrying ``rating_kmeans``.
    centers : numpy.ndarray
        Cluster centres sorted ascending; ``centers[r - 1]`` is the centre
        of rating ``r``.
    mse : float
        Mean squared distance between each score and its cluster centre.

    Notes
    -----
    KMeans numbers its clusters arbitrarily, so the raw label ids carry no
    ordering. The clusters are therefore re-indexed by ascending centre so
    that rating 1 is the lowest-score bucket and rating ``buckets`` the
    highest. The quantisation error is unaffected by this relabelling.
    """
    # sklearn expects a 2-D (n_samples, n_features) array.
    scores = df[fico_col].to_numpy().reshape(-1, 1)

    model = KMeans(n_clusters=buckets, random_state=42)
    raw_labels = model.fit_predict(scores)
    raw_centers = model.cluster_centers_.flatten()

    # order[i] is the raw label of the i-th smallest centre; invert that
    # permutation to map each raw label to its rank.
    order = np.argsort(raw_centers)
    centers = raw_centers[order]
    rank_of_label = np.empty_like(order)
    rank_of_label[order] = np.arange(order.size)
    labels = rank_of_label[raw_labels]

    # Ratings are 1-based.
    df["rating_kmeans"] = labels + 1

    mse = np.mean((scores.ravel() - centers[labels]) ** 2)

    return df, centers, mse
import torch

def mse_gpu(fico, centers, labels):
    """Return the mean squared error between scores and their assigned centres.

    ``labels[i]`` selects the entry of ``centers`` that ``fico[i]`` is
    compared against. The computation runs in float32 on CUDA when
    available and on the CPU otherwise; the result is a Python float.
    """
    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda"

    scores = torch.tensor(fico, dtype=torch.float32, device=target)
    centroids = torch.tensor(centers, dtype=torch.float32, device=target)
    assignment = torch.tensor(labels, dtype=torch.long, device=target)

    # Gather each sample's centre, then average the squared residuals.
    residual = scores - centroids[assignment]
    return torch.mean(residual ** 2).item()



In [5]:
import torch
import numpy as np

def loglik_gpu(k, n):
    """Maximised Bernoulli log-likelihood of ``k`` defaults in ``n`` loans.

    For each bucket the default probability is estimated as ``p = k / n``
    and the function returns ``k*log(p) + (n - k)*log(1 - p)``.

    Parameters
    ----------
    k : scalar or array-like
        Number of defaults per bucket.
    n : scalar or array-like
        Number of observations per bucket (broadcast against ``k``).

    Returns
    -------
    float or numpy.ndarray
        A Python float for scalar input, otherwise a float64 array with one
        log-likelihood per bucket.

    Notes
    -----
    * Runs on CUDA when available and on the CPU otherwise.
    * Uses float64. In float32, ``1 - 1e-8`` rounds to exactly 1.0, so an
      epsilon clamp cannot keep ``p`` away from 1 and ``k == n`` would
      evaluate ``0 * log(0)`` and return NaN.
    * ``torch.xlogy`` defines ``0 * log(0)`` as 0, so buckets with no
      defaults or only defaults score exactly 0 without an epsilon.
    * Buckets with ``n == 0`` contribute 0 instead of NaN.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    k_t = torch.tensor(k, dtype=torch.float64, device=device)
    n_t = torch.tensor(n, dtype=torch.float64, device=device)

    # Empty buckets would give 0/0; substitute p = 0 so both xlogy terms
    # below evaluate to 0 for them.
    p = torch.where(n_t > 0, k_t / n_t, torch.zeros_like(k_t))

    # xlogy(a, b) = a * log(b), with the convention that it is 0 when a == 0.
    ll = torch.xlogy(k_t, p) + torch.xlogy(n_t - k_t, 1 - p)

    ll_cpu = ll.detach().cpu().numpy()

    # Scalar input -> plain Python float.
    if ll_cpu.ndim == 0:
        return float(ll_cpu)

    return ll_cpu




In [8]:
# Min-max rating. normalize_fico_df_gpu copies its input, so df stays intact.
df1 = normalize_fico_df_gpu(df, fico_column="fico_score")

# fico_to_rating_kmeans_gpu writes "rating_kmeans" into the frame it is given
# and returns that same object. Passing a copy keeps df1 as the pure
# min-max result, so the two printouts below actually differ.
df2, centers_kmeans, mse_kmeans = fico_to_rating_kmeans_gpu(
    df1.copy(), fico_col="fico_score", buckets=5
)

# Log-likelihood of the whole book treated as a single bucket.
loglik_global = loglik_gpu(df2["default"].sum(), len(df2))

# Defaults (k) and observations (n) per k-means rating, in one groupby pass.
bucket_stats = df2.groupby("rating_kmeans")["default"].agg(["sum", "count"])
k = bucket_stats["sum"].values
n = bucket_stats["count"].values

loglik_bucket = loglik_gpu(k, n)

print("=== Simple Rating ===")
print(df1.head())

print("\n=== KMeans GPU Rating ===")
print(df2.head())
print("Cluster centers:", centers_kmeans)
print("MSE:", mse_kmeans)

print("\n=== DP Log-Likelihood Buckets ===")
print("Log-Likelihood Per Bucket", loglik_bucket)
print("Global Log-Likelihood",loglik_global)



=== Simple Rating ===
             credit_lines_outstanding  loan_amt_outstanding  \
customer_id                                                   
8153374                             0           5221.545193   
7442532                             5           1958.928726   
2256073                             0           3363.009259   
4885975                             0           4766.648001   
4700614                             1           1345.827718   

             total_debt_outstanding       income  years_employed  fico_score  \
customer_id                                                                    
8153374                 3915.471226  78039.38546               5         605   
7442532                 8228.752520  26648.43525               2         572   
2256073                 2027.830850  65866.71246               4         602   
4885975                 2501.730397  74356.88347               5         612   
4700614                 1768.826187  23448.32631        