In [None]:
# Install the third-party packages this notebook needs.
# `%pip` is used instead of `!pip` so the packages land in the environment of the
# kernel that is running this notebook.
# NOTE(review): versions are unpinned — pin them once a known-good set is identified.
%pip install -q huggingface_hub xgboost torch torchvision torchaudio lightgbm datasets

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np
import random as rnd
import datetime

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold 

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib.colors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

from datasets import load_dataset
import warnings

# Suppress all warnings
# NOTE(review): no category or module is given, so this filter hides every warning
# raised by later cells as well — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Statement-level training data, fetched from the Hugging Face Hub dataset repo
# and read into a DataFrame.
REPO_ID, FILENAME = "AdithyaM-16/DMdataset", "train_data.csv"

train_csv_path = hf_hub_download(repo_type="dataset", repo_id=REPO_ID, filename=FILENAME)
train_df_orig = pd.read_csv(train_csv_path)

In [None]:
# Label file from the same Hugging Face Hub dataset repo.
REPO_ID, FILENAME = "AdithyaM-16/DMdataset", "train_labels.csv"

labels_csv_path = hf_hub_download(repo_type="dataset", repo_id=REPO_ID, filename=FILENAME)
train_labels = pd.read_csv(labels_csv_path)

In [None]:
# Attach the label to every statement row; the left join keeps all statement rows.
train_df_read = pd.merge(train_df_orig, train_labels, how='left', on='customer_ID')

In [None]:
# Donut chart of the class balance (percentage of rows per target value).
pal = ['#016CC9', '#DEB078']     # slice outline colours
color = ['#8DBAE2', '#EDD3B3']   # slice fill colours

target = (
    train_df_read['target']
    .value_counts(normalize=True)
    .rename(index={1: 'Default', 0: 'Paid'})
)

fig = go.Figure(
    data=[
        go.Pie(
            labels=target.index,
            values=target * 100,
            hole=.35,
            showlegend=True,
            sort=False,
            marker=dict(colors=color, line=dict(color=pal, width=2.5)),
        )
    ]
)
fig.show()

In [None]:
# Distribution of the number of statements each customer has, largest count first.
statements_per_customer = train_df_read['customer_ID'].value_counts()
train_sc = (
    statements_per_customer
    .value_counts()
    .sort_index(ascending=False)
    .rename('Train statements per customer')
)

fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(8, 5))
ax1.pie(train_sc, labels=train_sc.index)
ax1.set_title(train_sc.name)
plt.show()

In [None]:
# Number of distinct customers with a statement on each statement date.
plot_df = (
    train_df_read
    .reset_index()
    .groupby('S_2')['customer_ID']
    .nunique()
    .reset_index()
)

fig = go.Figure(
    data=[
        go.Scatter(
            x=plot_df['S_2'],
            y=plot_df['customer_ID'],
            mode='lines',
            line=dict(color='skyblue', width=3),
        )
    ]
)
fig.update_layout(
    title="Frequency of Customer Statements",
    width=700,
    height=450,
    xaxis_title='Statement Date',
    yaxis_title='Number of Statements Issued',
)
fig.show()

In [None]:
# Group the columns by name prefix and chart how many columns fall in each group.
all_columns = list(train_df_read.columns)

delinquency_features = [name for name in all_columns if name.startswith('D_')]
spend_features = [name for name in all_columns if name.startswith('S_')]
payment_features = [name for name in all_columns if name.startswith('P_')]
balance_features = [name for name in all_columns if name.startswith('B_')]
risk_features = [name for name in all_columns if name.startswith('R_')]

labels = ['Delinquency', 'Spend', 'Payment', 'Balance', 'Risk']
values = [
    len(group)
    for group in (
        delinquency_features,
        spend_features,
        payment_features,
        balance_features,
        risk_features,
    )
]

# Donut chart, one slice per feature group
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.2)])
fig.update_traces(marker=dict(colors=['#a43725', '#e0d5bd', '#beb29e', '#E6b6a4', '#c07156']))

layout = dict(title='Feature Distribution', showlegend=True)
fig["layout"].update(layout)

fig.show()

In [None]:
#Distribution of categorical variable:
# The two class colours are defined locally instead of being read from the global
# `pal`: the correlation cell further down rebinds `pal` to a long sequential
# palette, so re-running this cell afterwards would silently use the wrong colours.
class_pal = ['#016CC9', '#DEB078']  # outline colours: index 0 = Paid, index 1 = Default
rgb = ['rgba' + str(matplotlib.colors.to_rgba(i, 0.7)) for i in class_pal]  # translucent fills

cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'target']
plot_features = cat_cols[:-1]  # everything except the target itself
n_grid_cols = 3

fig = make_subplots(rows=4, cols=n_grid_cols, subplot_titles=plot_features, vertical_spacing=0.1)

# Explicit copy: the columns are re-typed below, and assigning into an un-copied
# selection is what triggers pandas' chained-assignment warning.
plot_df = train_df_read[cat_cols].copy()

for i, col in enumerate(plot_features):
    # Fill the subplot grid row by row (plotly rows/cols are 1-based).
    row, grid_col = i // n_grid_cols + 1, i % n_grid_cols + 1

    plot_df[col] = plot_df[col].astype(object)
    df = plot_df.groupby(col)['target'].value_counts().rename('count').reset_index().replace('', np.nan)

    # One bar series per class; only the first subplot contributes legend entries.
    for target_value, label in ((1, 'Default'), (0, 'Paid')):
        class_counts = df[df.target == target_value]
        fig.add_trace(
            go.Bar(
                x=class_counts[col],
                y=class_counts['count'],
                marker_color=rgb[target_value],
                marker_line=dict(color=class_pal[target_value], width=2),
                name=label,
                showlegend=(i == 0),
            ),
            row=row,
            col=grid_col,
        )

    # Label the y axis once per row, on the left-most subplot.
    if grid_col == 1:
        fig.update_yaxes(title='Frequency', row=row, col=grid_col)

fig.update_layout(title="Distribution of Categorical Variables",
                  legend=dict(orientation="h", yanchor="bottom", y=1.03, xanchor="right", x=0.2),
                  barmode='group', height=1500, width=900)
fig.show()

In [None]:
#Test correlation with target:

temp=dict(layout=go.Layout(height=500, width=1000))

# Pairwise Pearson correlation over every column except the customer ID, the
# statement date and D_63 / D_64 (presumably non-numeric — verify).
# The frame gets its own name: a later cell builds the modelling frame under
# `train_df_2`, and re-running this cell must not overwrite it.
corr_input_df = train_df_read.drop(['customer_ID', 'S_2', 'D_63', 'D_64'], axis = 1)
corr = corr_input_df.corr()
# Keep only the correlations with the target, remove the target's own entry by
# label (rather than assuming it sorts first), and order from most positive to
# most negative.
corr = corr['target'].drop('target').sort_values(ascending=False)

positive_corr = corr[corr >= 0]
negative_corr = corr[corr < 0]

# Reversed red palette with 135 shades, as hex strings.
pal=sns.color_palette("Reds_r",135).as_hex()
# Same colours as rgba strings with alpha 0.7, used as the bar fill.
rgb=['rgba'+str(matplotlib.colors.to_rgba(i,0.7)) for i in pal]

fig = go.Figure()

# Features with a non-negative correlation: red horizontal bars, hidden from the legend.
fig.add_trace(
    go.Bar(
        x=positive_corr,
        y=positive_corr.index,
        marker_color=rgb,
        orientation='h',
        marker_line=dict(color=pal,width=2),
        name='',
        showlegend=False
    )
)

pal=sns.color_palette("Blues",100).as_hex()
rgb=['rgba'+str(matplotlib.colors.to_rgba(i,0.7)) for i in pal]

# Features with a negative correlation: blue horizontal bars, hidden from the legend.
# The first 25 shades of the palette are skipped.
fig.add_trace(
    go.Bar(
        x=negative_corr,
        y=negative_corr.index,
        marker_color=rgb[25:],
        orientation='h',
        marker_line=dict(color=pal[25:],width=2),
        name='',
        showlegend=False
    )
)

fig.update_layout(
    template=temp,
    title="Feature Correlations with Target",
    xaxis_title="Correlation",
    margin=dict(l=150),
    height=3000,
    width=700,
    hovermode='closest' # show the hover label of the data point nearest to the cursor
)
fig.show()

In [None]:
# Keep one row per customer: order the rows by statement date, take the last row
# of each customer, then index the result by customer_ID.
train_df_read.sort_values(by='S_2', inplace=True)
latest_statements = train_df_read.groupby('customer_ID').tail(1)
train_df = latest_statements.set_index('customer_ID', drop=True).sort_index()

In [None]:
# Ordinal-encode the statement date and the categorical columns of train_df.
categorical_cols = ['S_2', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Restrict to the columns that actually exist in the frame.
cat_cols = [name for name in categorical_cols if name in train_df.columns]

enc = OrdinalEncoder()
encoded_values = enc.fit_transform(train_df[cat_cols])
train_df[cat_cols] = encoded_values

In [None]:
#train_df_ini = train_df.drop(['customer_ID'], axis = 1)

In [None]:
# Fill every missing value with the median of its column.
# NOTE(review): the medians are computed on the whole frame, i.e. before the
# train/test split — confirm this is acceptable for the reported scores.
median_values_train = train_df.median()
train_df_ini = train_df.fillna(value=median_values_train)

In [None]:
# Feature matrix = every column except the label; target vector = the label.
feature_list = [name for name in train_df_ini.columns if name != 'target']

y = train_df_ini['target']
X = train_df_ini[feature_list]

print("X shape ", X.shape)
print("y shape ", y.shape)

In [None]:


# 70/30 hold-out split, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print("X_train shape is = ", x_train.shape)
print("Y_train shape is = ", y_train.shape)
print("X_test shape is = ", x_test.shape)
print("Y_test shape is = ", y_test.shape)

In [None]:
# Baseline: logistic regression on the median-imputed feature set.
logreg_model = LogisticRegression(random_state=42)

# Wall-clock time for fit + predict
start_time = datetime.datetime.now()
y_pred = logreg_model.fit(x_train, y_train).predict(x_test)
end_time = datetime.datetime.now()

time_taken = end_time - start_time
print("Time taken(ms):", time_taken.total_seconds()*1000)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1score = [
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1score}')


In [None]:
# Baseline: 100-tree random forest on the median-imputed feature set.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Wall-clock time for fit + predict
start_time = datetime.datetime.now()
y_pred = rf_classifier.fit(x_train, y_train).predict(x_test)
end_time = datetime.datetime.now()

time_taken = end_time - start_time
print("Time taken(ms):", time_taken.total_seconds()*1000)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1score = [
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1score}')

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_curve, auc

# Baseline: a small XGBoost classifier on the median-imputed feature set.
xgb_settings = dict(
    objective='binary:logistic',  # binary classification objective
    max_depth=3,                  # maximum depth of each tree
    learning_rate=0.1,            # learning rate
    n_estimators=10,              # number of boosting rounds
)
model = xgb.XGBClassifier(**xgb_settings)

model.fit(x_train, y_train)
y_pred_xg = model.predict(x_test)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix_xg = confusion_matrix(y_test, y_pred_xg)
accuracy_xg, precision_xg, recall_xg, f1score_xg = [
    score_fn(y_test, y_pred_xg)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy_xg}')
print(f'Precision: {precision_xg}')
print(f'Recall: {recall_xg}')
print(f'F1-score: {f1score_xg}')


In [None]:
# Per-column missing-value summary for the one-row-per-customer frame.
missing_counts = train_df.isnull().sum()
NaN_Val = missing_counts.to_numpy()
NaN_prec = (missing_counts * 100 / len(train_df)).round(2).to_numpy()

# The dtypes of the raw frame are looked up *by column name* so they stay aligned
# with `Features`. train_df_read still has customer_ID as a column (it is the index
# of train_df), so the two frames do not have the same columns; pairing them by
# position misaligns the dtypes and appends a spurious NaN row.
raw_dtypes = train_df_read.dtypes.loc[train_df.columns]

NaN_Col = pd.DataFrame({
    'Features': train_df.columns,
    'Num of Missing values': NaN_Val,
    'Percentage': NaN_prec,
    'DataType': raw_dtypes.astype(str).to_numpy(),
})

pd.set_option('display.max_rows', None)  # Show all rows
NaN_Col

In [None]:
# Drop the columns whose share of missing values exceeds the threshold.
null_threshold = 80  # percent of rows missing

too_sparse = NaN_Col['Percentage'] > null_threshold
nan_cols = NaN_Col.loc[too_sparse, 'Features'].to_list()

train_df_2 = train_df.drop(columns=nan_cols)

In [None]:
# (rows, columns) of the frame after the sparse columns were dropped.
train_df_2.shape

In [None]:
# Impute the remaining gaps with each column's median.
median_values_train = train_df_2.median()
train_df_2 = train_df_2.fillna(value=median_values_train)

In [None]:
# Sanity check: per-column count of values that are still missing.
remaining_nulls = train_df_2.isna().sum()
print(remaining_nulls.to_string())

In [None]:
# Feature-to-feature correlation matrix (label column excluded).
train_without_target_df = train_df_2.drop(columns=['target'])
corr_matrix = train_without_target_df.corr()

In [None]:
# Flag a column when its correlation with any column positioned before it in the
# matrix exceeds the limit.
# NOTE(review): the comparison is signed, so strongly negative correlations are
# never flagged — confirm that is intended.
corr_limit = 0.9

col_high_corr = {
    corr_matrix.columns[pos]
    for pos in range(len(corr_matrix.columns))
    if (corr_matrix.iloc[pos, :pos] > corr_limit).any()
}

col_high_corr

In [None]:
# Remove the columns flagged as highly correlated with an earlier column.
train_df_2 = train_df_2.drop(columns=list(col_high_corr))

In [None]:
# (rows, columns) of the frame after the highly correlated columns were dropped.
train_df_2.shape

In [None]:
# Feature matrix / target vector from the pruned frame.
feature_list = [name for name in train_df_2.columns if name != 'target']

y = train_df_2['target']
X = train_df_2[feature_list]

print("X shape ", X.shape)
print("y shape ", y.shape)

In [None]:

# 70/30 hold-out split of the pruned features, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print("X_train shape is = ", x_train.shape)
print("Y_train shape is = ", y_train.shape)
print("X_test shape is = ", x_test.shape)
print("Y_test shape is = ", y_test.shape)

In [None]:
# Logistic regression on the pruned feature set: timing, hold-out metrics,
# confusion-matrix heatmap and ROC curve.
logreg_model = LogisticRegression(random_state=42)

# Wall-clock time for fit + predict
start_time = datetime.datetime.now()
y_pred_log = logreg_model.fit(x_train, y_train).predict(x_test)
end_time = datetime.datetime.now()

time_taken = end_time - start_time
print("Time taken(ms):", time_taken.total_seconds()*1000)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix_log = confusion_matrix(y_test, y_pred_log)
accuracy_log, precision_log, recall_log, f1score_log = [
    score_fn(y_test, y_pred_log)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy_log}')
print(f'Precision: {precision_log}')
print(f'Recall: {recall_log}')
print(f'F1-score: {f1score_log}')

# Confusion matrix as an annotated heatmap
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    conf_matrix_log,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
cm_ax.set_title('Confusion Matrix')
plt.show()

# ROC curve built from the predicted probability of class 1
y_pred_proba = logreg_model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(4, 3))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Random forest on the pruned feature set: timing, hold-out metrics,
# confusion-matrix heatmap and ROC curve.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Wall-clock time for fit + predict
start_time = datetime.datetime.now()
y_pred_rf = rf_classifier.fit(x_train, y_train).predict(x_test)
end_time = datetime.datetime.now()

time_taken = end_time - start_time
print("Time taken(ms):", time_taken.total_seconds()*1000)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
accuracy_rf, precision_rf, recall_rf, f1score_rf = [
    score_fn(y_test, y_pred_rf)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy_rf}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1-score: {f1score_rf}')

# Confusion matrix as an annotated heatmap
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    conf_matrix_rf,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=cm_ax,
)
cm_ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion Matrix')
plt.show()

# ROC curve built from the predicted probability of class 1
y_pred_proba_rf = rf_classifier.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_rf)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(4, 3))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_curve, auc

# XGBoost on the pruned feature set: hold-out metrics, confusion-matrix heatmap
# and ROC curve.
xgb_settings = dict(
    objective='binary:logistic',  # binary classification objective
    max_depth=3,                  # maximum depth of each tree
    learning_rate=0.1,            # learning rate
    n_estimators=10,              # number of boosting rounds
)
model = xgb.XGBClassifier(**xgb_settings)

model.fit(x_train, y_train)
y_pred_xg = model.predict(x_test)

# Hold-out scores (precision / recall / F1 use sklearn's default averaging)
conf_matrix_xg = confusion_matrix(y_test, y_pred_xg)
accuracy_xg, precision_xg, recall_xg, f1score_xg = [
    score_fn(y_test, y_pred_xg)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

print(f'Accuracy: {accuracy_xg}')
print(f'Precision: {precision_xg}')
print(f'Recall: {recall_xg}')
print(f'F1-score: {f1score_xg}')

# Confusion matrix as an annotated heatmap
cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
sns.heatmap(
    conf_matrix_xg,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=cm_ax,
)
cm_ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion Matrix')
plt.show()

# ROC curve built from the predicted probability of the positive class
y_pred_proba_xg = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_xg)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(4, 3))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
roc_ax.legend(loc="lower right")
plt.show()


In [None]:
# Feed-forward network evaluated on the hold-out split.

# Seed PyTorch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

# Convert data to PyTorch tensors
x_train_tensor = torch.tensor(x_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
x_test_tensor = torch.tensor(x_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)


# Create datasets
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

# Create data loaders (only the training loader shuffles)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define the neural decision tree model
class NeuralDecisionTree(nn.Module):
    """Fully connected classifier: input_dim -> 64 -> 32 -> num_classes.

    ``forward`` returns class probabilities; ``logits`` returns the raw scores
    that the training loss needs.
    """

    def __init__(self, input_dim, num_classes):
        super(NeuralDecisionTree, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def logits(self, x):
        """Return the unnormalised class scores for a batch ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def forward(self, x):
        """Return class probabilities (softmax over the logits) for a batch ``x``."""
        return F.softmax(self.logits(x), dim=1)

# Initialize model
input_dim = x_train.shape[1]  # number of feature columns
num_classes = len(torch.unique(y_train_tensor))
model = NeuralDecisionTree(input_dim, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # CrossEntropyLoss applies log-softmax itself, so it must be given the raw
        # logits; passing forward()'s softmax output would apply softmax twice.
        loss = criterion(model.logits(inputs), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")


# Evaluation
model.eval()
prob_batches = []
label_batches = []
with torch.no_grad():
    for inputs, labels in test_loader:
        prob_batches.append(model(inputs).cpu())
        label_batches.append(labels.cpu())

# Concatenate the batches once instead of rebuilding tensors from Python lists.
y_pred_probs = torch.cat(prob_batches)
y_true = torch.cat(label_batches)
y_pred = y_pred_probs.argmax(dim=1)

# sklearn works on numpy arrays
y_true_np = y_true.numpy()
y_pred_np = y_pred.numpy()

# Calculate metrics
# NOTE(review): these use average='weighted', while the other models in this
# notebook report sklearn's default averaging — the numbers are not directly comparable.
accuracy = accuracy_score(y_true_np, y_pred_np)
precision = precision_score(y_true_np, y_pred_np, average='weighted')
recall = recall_score(y_true_np, y_pred_np, average='weighted')
f1 = f1_score(y_true_np, y_pred_np, average='weighted')
# Previously the heatmap below read conf_matrix_ndt without it ever being assigned.
conf_matrix_ndt = confusion_matrix(y_true_np, y_pred_np)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

plt.figure(figsize=(4, 3))
sns.heatmap(conf_matrix_ndt, annot=True, cmap='Blues', fmt='g', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()

# ROC curve from the predicted probability of class 1, paired with the labels
# collected from the same loader so both are guaranteed to be in the same order.
fpr, tpr, thresholds = roc_curve(y_true_np, y_pred_probs[:, 1].numpy())
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# LightGBM on the hold-out split: metrics, confusion-matrix heatmap and ROC curve.
categorical_cols=['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
# Earlier cells drop sparse and highly correlated columns, so only pass the
# categoricals that are still present in the training frame.
categorical_cols = [name for name in categorical_cols if name in x_train.columns]

d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categorical_cols)

params = {'objective': 'binary','n_estimators': 1200,'metric': 'binary_logloss','boosting': 'gbdt','num_leaves': 90,'reg_lambda' : 50,'colsample_bytree': 0.19,'learning_rate': 0.03,'min_child_samples': 2400,'max_bins': 511,'seed': 42,'verbose': -1}

# The number of boosting rounds comes from params['n_estimators'] (1200).
# LightGBM gives an iteration-count alias found in `params` precedence over the
# num_boost_round argument, so the value of 100 that used to be passed here was
# never applied; it is removed to keep code and behaviour in agreement.
model = lgb.train(params, d_train)

y_pred_prob_gm = model.predict(x_test)
# Map probabilities to 0 or 1
y_pred_gbm = (y_pred_prob_gm >= 0.5).astype(int)

accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
precision_gbm = precision_score(y_test, y_pred_gbm)
recall_gbm = recall_score(y_test, y_pred_gbm)
f1score_gbm = f1_score(y_test, y_pred_gbm)
conf_matrix_gbm = confusion_matrix(y_test, y_pred_gbm)

print(f'Accuracy: {accuracy_gbm}')
print(f'Precision: {precision_gbm}')
print(f'Recall: {recall_gbm}')
print(f'F1-score: {f1score_gbm}')

# Confusion matrix as an annotated heatmap
plt.figure(figsize=(4, 3))
sns.heatmap(conf_matrix_gbm, annot=True, cmap='Blues', fmt='g', xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()

# ROC curve from the predicted probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_gm)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# 5-fold cross-validation of the feed-forward network on the full pruned data set.
# NOTE(review): these two lines rebind the hold-out split's x_train / y_train to
# the full data; re-running an earlier hold-out cell afterwards would train on
# rows that are also in x_test.
x_train = X
y_train = y

# Seed PyTorch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

# Convert data to PyTorch tensors
x_train_tensor = torch.tensor(x_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)

# Define the neural decision tree model
class NeuralDecisionTree(nn.Module):
    """Fully connected classifier: input_dim -> 64 -> 32 -> num_classes.

    ``forward`` returns class probabilities; ``logits`` returns the raw scores
    that the training loss needs.
    """

    def __init__(self, input_dim, num_classes):
        super(NeuralDecisionTree, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def logits(self, x):
        """Return the unnormalised class scores for a batch ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def forward(self, x):
        """Return class probabilities (softmax over the logits) for a batch ``x``."""
        return F.softmax(self.logits(x), dim=1)

input_dim = x_train.shape[1]  # number of feature columns
num_classes = len(torch.unique(y_train_tensor))

# Loss function (stateless, so it can be shared by all folds)
criterion = nn.CrossEntropyLoss()

# Define number of folds; the seed keeps the fold assignment reproducible
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
fold_accuracy = []
fold_precision = []
fold_recall = []
fold_f1 = []

# Perform 5-fold cross-validation
for fold, (train_indices, test_indices) in enumerate(kf.split(x_train_tensor)):
    print(f"Fold {fold + 1}/{num_folds}")

    # A fresh model and optimizer per fold. Reusing one model across folds lets it
    # keep the weights learned on earlier folds, whose training data contains the
    # current validation rows, which inflates the scores.
    model = NeuralDecisionTree(input_dim, num_classes)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Split data into train and validation sets
    x_train_fold = x_train_tensor[train_indices]
    y_train_fold = y_train_tensor[train_indices]
    x_val_fold = x_train_tensor[test_indices]
    y_val_fold = y_train_tensor[test_indices]

    # Create datasets
    train_dataset = TensorDataset(x_train_fold, y_train_fold)
    val_dataset = TensorDataset(x_val_fold, y_val_fold)

    # Create data loaders
    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # CrossEntropyLoss applies log-softmax itself, so it must be given the
            # raw logits rather than forward()'s softmax output.
            loss = criterion(model.logits(inputs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

    # Evaluation
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Append metrics to lists
    fold_accuracy.append(accuracy)
    fold_precision.append(precision)
    fold_recall.append(recall)
    fold_f1.append(f1)

    print(f"Accuracy on validation set: {accuracy}")
    print(f"Precision on validation set: {precision}")
    print(f"Recall on validation set: {recall}")
    print(f"F1 score on validation set: {f1}")

# Calculate average metrics over all folds
average_accuracy = sum(fold_accuracy) / num_folds
average_precision = sum(fold_precision) / num_folds
average_recall = sum(fold_recall) / num_folds
average_f1 = sum(fold_f1) / num_folds

print(f"Average accuracy over {num_folds} folds: {average_accuracy}")
print(f"Average precision over {num_folds} folds: {average_precision}")
print(f"Average recall over {num_folds} folds: {average_recall}")
print(f"Average F1 score over {num_folds} folds: {average_f1}")

In [None]:
# 5-fold cross-validation of LightGBM on the full pruned data set.
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']

x_train_lg = X
y_train_lg = y

# Earlier cells drop sparse and highly correlated columns, so only pass the
# categoricals that are still present in the feature frame.
categorical_cols = [name for name in categorical_cols if name in x_train_lg.columns]

# Define parameters ('n_estimators' sets the number of boosting rounds)
params = {
    'objective': 'binary',
    'n_estimators': 1200,
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 90,
    'reg_lambda': 50,
    'colsample_bytree': 0.19,
    'learning_rate': 0.03,
    'min_child_samples': 2400,
    'max_bins': 511,
    'seed': 42,
    'verbose': -1
}

# Perform 5-fold cross-validation; the seed keeps the fold assignment reproducible
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)


fold_accuracy = []
fold_precision = []
fold_recall = []
fold_f1 = []

for fold, (train_index, val_index) in enumerate(kf.split(x_train_lg)):
    print(f"Fold {fold+1}/{num_folds}")
    # Split data into train and validation sets
    x_train_fold, x_val_fold = x_train_lg.iloc[train_index], x_train_lg.iloc[val_index]
    y_train_fold, y_val_fold = y_train_lg.iloc[train_index], y_train_lg.iloc[val_index]

    # Define LightGBM dataset for the fold
    d_train_fold = lgb.Dataset(x_train_fold, label=y_train_fold, categorical_feature=categorical_cols)
    d_val_fold = lgb.Dataset(x_val_fold, label=y_val_fold, reference=d_train_fold)

    # Train a new model for this fold
    model = lgb.train(params, d_train_fold, valid_sets=[d_val_fold])

    # Make predictions on the validation set
    y_pred_proba = model.predict(x_val_fold)

    # Threshold the probabilities at 0.5 to get binary predictions
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate evaluation metrics for the fold
    accuracy = accuracy_score(y_val_fold, y_pred)
    precision = precision_score(y_val_fold, y_pred)
    recall = recall_score(y_val_fold, y_pred)
    f1 = f1_score(y_val_fold, y_pred)

    # Append metrics to lists
    fold_accuracy.append(accuracy)
    fold_precision.append(precision)
    fold_recall.append(recall)
    fold_f1.append(f1)

    print(f"Accuracy on validation set: {accuracy}")
    print(f"Precision on validation set: {precision}")
    print(f"Recall on validation set: {recall}")
    print(f"F1 score on validation set: {f1}")

# Calculate average metrics over all folds
average_accuracy = sum(fold_accuracy) / num_folds
average_precision = sum(fold_precision) / num_folds
average_recall = sum(fold_recall) / num_folds
average_f1 = sum(fold_f1) / num_folds

print(f"Average accuracy over {num_folds} folds: {average_accuracy}")
print(f"Average precision over {num_folds} folds: {average_precision}")
print(f"Average recall over {num_folds} folds: {average_recall}")
print(f"Average F1 score over {num_folds} folds: {average_f1}")
