In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


import warnings
warnings.filterwarnings('ignore') 

# Preprocess

In [7]:
data = pd.read_csv("data.csv").sample(1000)

In [8]:
train_val,test = train_test_split(data,test_size=.3,random_state=51,shuffle=True)
### train daha sonra train-validation olarak ayrılacaktır.

In [9]:
def preprocess(data):
    
    data.drop('customer_id',axis=1,inplace=True)
    data.country.replace({"France":0,"Spain":1,"Germany":2},inplace=True)
    data.gender.replace({"Female":0,"Male":1},inplace=True)
    age_groups = [[18,34],[35,46],[47,56],[57,65],[66,74],[75,200]]

    for i,group in enumerate(age_groups):
        data['age'].loc[(data['age'] >= group[0]) &( (data['age']) <= group[1])] = i 
    
    balance_mean = int(data.balance.unique().mean())
    balance_groups = [[0,10000],[10001,balance_mean],[balance_mean+1,10e+9]]

    for i,group in enumerate(balance_groups):
        data['balance'].loc[(data['balance'] >= group[0]) &( (data['balance']) <= group[1])] = i 

    income_groups = [[0,1045*12],[(1045*12)+1,4125*12],
                     [(4125*12)+1,12745*12],[(12745*12)+1,10e+9]] # low/lower-middle/upper-middle/high income groups
    for i,group in enumerate(income_groups):
        data['estimated_salary'].loc[(data['estimated_salary'] >= group[0]) &( (data['estimated_salary']) <= group[1])] = i 

        
    unique_vals,counts = np.unique(data.credit_score,return_counts = True)
    credit_score_mean = unique_vals.mean()
    data['credit_score'].loc[data["credit_score"] <= credit_score_mean] = 0 
    data['credit_score'].loc[data["credit_score"] > credit_score_mean] = 1
    
    data['balance'] = data['balance'].astype(int)
    data['estimated_salary'] = data['estimated_salary'].astype(int)
    
    return np.array(data)

In [10]:
train_val_dataset = preprocess(train_val)
test_dataset = preprocess(test)

In [11]:
train,val = train_test_split(train_val_dataset,test_size=3/7,random_state=51,shuffle=True)


In [12]:
x_train,y_train = train[:,:-1],train[:,-1]
x_val,y_val = val[:,:-1],val[:,-1]

In [13]:
class Categorizer():
    
    def __init__(self,n_classes):
        self.n_classes = n_classes
        
    def _make_disjoint_categ(self,data):
        
        disjoint = [nn.functional.one_hot(torch.tensor((data[:,idx])),self.n_classes[idx]) for idx in range(data.shape[1])]        
        return disjoint
    
    def __call__(self,data):
        
        disjoint = self._make_disjoint_categ(data)
        return torch.cat(disjoint,dim = 1).to(dtype = torch.float32)
        
        

In [14]:
categorizer = Categorizer(x_train.max(axis=0)+1)
x_train_one_hot = categorizer(x_train)
x_val_one_hot = categorizer(x_val)

# Models

## Naive Bayes

In [15]:
nb_model = MultinomialNB()
nb_model.fit(x_train_one_hot, (y_train))

In [66]:
print(classification_report(y_train,nb_model.predict_proba(x_train_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       315
           1       0.77      0.58      0.66        85

    accuracy                           0.87       400
   macro avg       0.83      0.76      0.79       400
weighted avg       0.87      0.87      0.87       400



In [67]:
print(classification_report(y_val,nb_model.predict_proba(x_val_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       236
           1       0.67      0.38      0.48        64

    accuracy                           0.83       300
   macro avg       0.76      0.66      0.69       300
weighted avg       0.81      0.83      0.81       300



In [17]:
confusion_matrix(y_val,nb_model.predict_proba(x_val_one_hot).argmax(axis=1))

array([[224,  12],
       [ 40,  24]])

## DecisionTree Model 

In [62]:
clf = DecisionTreeClassifier(max_depth =3, random_state = 42)

In [63]:
clf.fit(x_train_one_hot,y_train)

In [64]:
print(classification_report(y_train,clf.predict_proba(x_train_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       315
           1       0.80      0.48      0.60        85

    accuracy                           0.86       400
   macro avg       0.84      0.73      0.76       400
weighted avg       0.86      0.86      0.85       400



In [65]:
print(classification_report(y_val,clf.predict_proba(x_val_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       236
           1       0.86      0.30      0.44        64

    accuracy                           0.84       300
   macro avg       0.85      0.64      0.67       300
weighted avg       0.84      0.84      0.81       300



In [21]:
confusion_matrix(y_val,clf.predict_proba(x_val_one_hot).argmax(axis=1))

array([[233,   3],
       [ 45,  19]])

## DeepLearning Model

In [142]:

class DL_Model(nn.Module):
    
    def __init__(self,in_features):
        
        super().__init__()
        self.classifier = nn.Sequential(
        
            nn.Linear(in_features,in_features*2),
            nn.GELU(),
            nn.Dropout(.25),    
            nn.Linear(in_features*2,in_features*2),
            nn.GELU(),
            nn.Dropout(.25),
            nn.Linear(in_features*2,2),
                    
        )
        
    def forward(self,x):
        return self.classifier(x)
        
    

In [143]:
class Dataset():
    
    def __init__(self,x,y):
        self.x = x
        self.y = y
        
    def __len__(self,):
        return len(self.x)
    
    def __getitem__(self,index):
        return torch.tensor(self.x[index]),torch.tensor(self.y[index])
        

In [144]:
class LabelSmoothingLoss(torch.nn.Module):
    def __init__(self, smoothing: float = 0.1, 
                 reduction="mean", weight=None):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing   = smoothing
        self.reduction = reduction
        self.weight    = weight

    def reduce_loss(self, loss):
        return loss.mean() if self.reduction == 'mean' else loss.sum() \
         if self.reduction == 'sum' else loss

    def linear_combination(self, x, y):
        return self.smoothing * x + (1 - self.smoothing) * y

    def forward(self, preds, target):
        assert 0 <= self.smoothing < 1

        if self.weight is not None:
            self.weight = self.weight.to(preds.device)

        n = preds.size(-1)
        log_preds = F.log_softmax(preds, dim=-1)
        loss = self.reduce_loss(-log_preds.sum(dim=-1))
        nll = F.nll_loss(
            log_preds, target, reduction=self.reduction, weight=self.weight
        )
        return self.linear_combination(loss / n, nll)

In [145]:
dl_model = DL_Model(40)



In [146]:
train_dataset = Dataset(x_train_one_hot,y_train)
a,b = np.unique(y_train,return_counts=True)
weights = compute_class_weight(class_weight='balanced',classes=a,y=y_train)

In [147]:
optim = torch.optim.Adam(dl_model.parameters(),lr = 0.0001)
loss_fn = LabelSmoothingLoss(.18,weight=torch.tensor(weights,dtype=torch.float32))
train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=16,shuffle = True)

In [148]:
for x in dl_model.children():
    for a in x:
        if type(a) == nn.modules.linear.Linear:
            nn.init.xavier_uniform_(a.weight)
            

In [149]:
dl_model.train()
for i in tqdm(range(200)):
    for x,y in train_dataloader:

        optim.zero_grad()

        outs = dl_model(x)
        loss = loss_fn(outs,y)

        loss.backward()
        optim.step()


100%|█████████████████████████████████████████| 200/200 [00:08<00:00, 23.20it/s]


In [150]:
dl_model.eval()
with torch.no_grad():
    preds = dl_model(x_val_one_hot)
print(classification_report(y_val,preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85       240
           1       0.45      0.60      0.51        60

    accuracy                           0.77       300
   macro avg       0.67      0.71      0.68       300
weighted avg       0.80      0.77      0.78       300



In [151]:
confusion_matrix(y_val,preds.argmax(axis=1))

array([[196,  44],
       [ 24,  36]])

## KNN Model

In [69]:
knn = KNeighborsClassifier(n_neighbors=3)
  
knn.fit(x_train_one_hot, y_train)
  


In [70]:
print(classification_report(y_train,knn.predict_proba(x_train_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94       315
           1       0.91      0.56      0.70        85

    accuracy                           0.90       400
   macro avg       0.90      0.77      0.82       400
weighted avg       0.90      0.90      0.89       400



In [71]:
print(classification_report(y_val,knn.predict_proba(x_val_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       236
           1       0.49      0.28      0.36        64

    accuracy                           0.78       300
   macro avg       0.66      0.60      0.61       300
weighted avg       0.75      0.78      0.76       300



In [72]:
confusion_matrix(y_val,knn.predict_proba(x_val_one_hot).argmax(axis=1))

array([[217,  19],
       [ 46,  18]])

# Testing

In [155]:
x_test,y_test = test_dataset[:,:-1],test_dataset[:,-1]

In [156]:
x_test_one_hot = categorizer(x_test,)

## Naive Bayes Results

In [157]:
print(classification_report(y_test,nb_model.predict_proba(x_test_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.89      0.99      0.93       240
           1       0.91      0.50      0.65        60

    accuracy                           0.89       300
   macro avg       0.90      0.74      0.79       300
weighted avg       0.89      0.89      0.88       300



In [158]:
confusion_matrix(y_test,nb_model.predict_proba(x_test_one_hot).argmax(axis=1))

array([[237,   3],
       [ 30,  30]])

## DecisionTree Results

In [159]:
print(classification_report(y_test,clf.predict_proba(x_test_one_hot).argmax(axis=1)))


              precision    recall  f1-score   support

           0       0.84      0.99      0.91       240
           1       0.88      0.23      0.37        60

    accuracy                           0.84       300
   macro avg       0.86      0.61      0.64       300
weighted avg       0.85      0.84      0.80       300



In [160]:
confusion_matrix(y_test,clf.predict_proba(x_test_one_hot).argmax(axis=1))

array([[238,   2],
       [ 46,  14]])

## DeepLearning Results

In [161]:
dl_model.eval()
with torch.no_grad():
    preds = dl_model(x_test_one_hot)
print(classification_report(y_test,preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       240
           1       0.57      0.80      0.67        60

    accuracy                           0.84       300
   macro avg       0.76      0.82      0.78       300
weighted avg       0.87      0.84      0.85       300



In [162]:
confusion_matrix(y_val,preds.argmax(axis=1))

array([[175,  65],
       [ 41,  19]])

## KNN Results

In [163]:
print(classification_report(y_test,knn.predict_proba(x_test_one_hot).argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90       240
           1       0.65      0.25      0.36        60

    accuracy                           0.82       300
   macro avg       0.74      0.61      0.63       300
weighted avg       0.80      0.82      0.79       300



In [164]:
confusion_matrix(y_val,knn.predict_proba(x_val_one_hot).argmax(axis=1))

array([[226,  14],
       [ 51,   9]])