In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **What is "Unbalanced Dataset"**
# 
In simple terms, an unbalanced dataset is one in which the target variable has many more observations in one specific class than in the others. The problem is that models trained on unbalanced datasets often give poor results when they have to generalize (i.e. predict the class of unseen observations).

# Methods to tackle Unbalanced Dataset: 
    1. Undersampling
    2. Oversampling
    3. SMOTE 

# Undersampling:

Undersampling techniques remove examples from the training dataset that belong to the majority class in order to better balance the class distribution, such as reducing the skew from a 1:100 to a 1:10, 1:2, or even a 1:1 class distribution. This is different from oversampling that involves adding examples to the minority class in an effort to reduce the skew in the class distribution.


# Oversampling:

Oversampling consists of enlarging the minority class by adding observations: the training data is supplemented with multiple copies of some of the minority-class samples. Oversampling can be done more than once (2x, 3x, 5x, 10x, etc.). This is one of the earliest proposed methods and has also proven to be robust. Instead of duplicating every sample in the minority class, some of them may be randomly chosen with replacement.


# SMOTE :

SMOTE (Synthetic Minority Over-sampling Technique) is an oversampling technique that generates synthetic samples from the minority class. It is used to obtain a synthetically class-balanced or nearly class-balanced training set, which is then used to train the classifier.

In [None]:
import tensorflow as tf
from tensorflow import keras


# Load the bank customer churn CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    "/kaggle/input/bank-customer-churn-modeling/Churn_Modelling.csv"
)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
## RowNumber and CustomerId are not useful as features, so drop them.
## Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
## idempotent: re-running it after the columns are already gone no longer
## raises a KeyError.

df = df.drop(columns=['RowNumber', 'CustomerId'], errors='ignore')
df.columns

In [None]:
## Count the missing values in each column to see whether any cleaning is needed.

df.isnull().sum()

## Observation recorded from the original run: no null values in the dataset.

# EXITED vs ESTIMATEDSALARY

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of EstimatedSalary for customers who stayed
# (Exited == 0) and customers who left (Exited == 1).
churn_no = df.loc[df['Exited'] == 0, 'EstimatedSalary']
churn_yes = df.loc[df['Exited'] == 1, 'EstimatedSalary']

plt.hist([churn_no, churn_yes], label=['exit_no', 'exit_yes'])
plt.xlabel("Estimated_salary")
plt.ylabel("Number of customers")
plt.legend()

# EXITED vs TENURE

In [None]:

# Side-by-side histograms of Tenure for customers who stayed (Exited == 0)
# and customers who left (Exited == 1).
churn_no = df.loc[df['Exited'] == 0, 'Tenure']
churn_yes = df.loc[df['Exited'] == 1, 'Tenure']

plt.hist([churn_no, churn_yes], label=['churn_no', 'churn_yes'])
plt.xlabel("Tenure")
plt.ylabel("Number of customers")
plt.legend()

In [None]:
## Helper for inspecting the categorical (object-dtype) columns of a frame.

def values_in_columns(df):
    """Print the distinct values and their count for each object-dtype column.

    For every column of ``df`` whose dtype compares equal to ``'object'``, one
    line of the form ``<column> : <unique values> : <number of unique values>``
    is printed. Columns of any other dtype are skipped. Returns ``None``.
    """
    for name in df:
        series = df[name]
        if series.dtype != 'object':
            continue
        print(f'{name} : {series.unique()} : {series.nunique()}')



In [None]:
values_in_columns(df) 

## Observations recorded from the original run's output:
## only 3 columns are of object dtype;
## Surname has 2932 unique entries,
## Geography has 3 unique entries,
## Gender has 2 unique entries.

## Next step: convert the object-dtype columns to int or float.

In [None]:
# Report how many distinct surnames there are, then replace each surname with
# the integer code assigned to it by scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

print(df['Surname'].nunique())

lb = LabelEncoder().fit(df['Surname'])
df['Surname'] = lb.transform(df['Surname'])



In [None]:

print(df['Geography'].unique())

## Geography has only a handful of distinct values (France, Spain and Germany
## in the original run), so one-hot encode it into Geography_<value> columns.
##
## dtype='uint8' pins the indicator columns to integers. pandas >= 2.0 emits
## bool indicators by default, and a frame that mixes bool and float columns
## converts to an object-dtype array, which Keras typically cannot turn into a
## tensor when the frame is passed to model.fit.

dummy_dataset = pd.get_dummies(data=df, columns=["Geography"], dtype='uint8')



In [None]:
df['Gender'].unique()

## Encode Gender numerically:
## Male   --> 1
## Female --> 0
##
## replace() leaves already-encoded values untouched, so re-running this cell
## is harmless. The explicit astype makes the integer dtype independent of
## pandas' implicit object-to-int downcast in replace(), which newer pandas
## versions deprecate; without it the column could stay object-typed.

dummy_dataset['Gender'] = (
    dummy_dataset['Gender']
    .replace({'Male': 1, 'Female': 0})
    .astype('int64')
)

dummy_dataset['Gender'].value_counts()

Since the Surname column has many unique entries,
let's scale that column so that it is easier to train the neural network on scaled values. (Note: the code below ends up dropping the Surname column rather than scaling it.)

In [None]:
# Preview the first rows of the encoded frame.
dummy_dataset.head()

**As we can see, the values in some columns are not normalized.
Features that are not normalized can hinder training,
so let's normalize the dataset.**

In [None]:
## Inspect the unique values of every column, now that all columns are numeric.
## NOTE: this redefines the earlier object-only values_in_columns helper.

def values_in_columns(df):
    """Print the distinct values and their count for every column of ``df``.

    One line of the form
    ``<column> : <unique values> : <number of unique values>`` is printed per
    column, regardless of dtype. Returns ``None``.
    """
    for name, series in df.items():
        print(f'{name} : {series.unique()} : {series.nunique()}')

# Print the unique values and their count for the columns of the encoded frame.
values_in_columns(dummy_dataset)

In [None]:

from sklearn.preprocessing import MinMaxScaler

# Numeric columns that are rescaled to the [0, 1] range.
scaling_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so test-set minima/maxima influence the scaling.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
dummy_dataset[scaling_cols] = scaler.fit_transform(dummy_dataset[scaling_cols])

# Surname has too many unique values to be a useful feature, so drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises a KeyError.
dummy_dataset = dummy_dataset.drop(columns="Surname", errors='ignore')

In [None]:
# Show how many rows fall into each Exited class.
dummy_dataset['Exited'].value_counts()

# SPLITTING THE DATA

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the Exited flag.
x = dummy_dataset.drop(columns="Exited")
y = dummy_dataset['Exited']

# stratify=y keeps the Exited class ratio the same in the train and test
# parts. Without it the share of the minority class in the 20% test split
# depends on the random draw, and the later splits in this notebook already
# stratify.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1, stratify=y
)

In [None]:
x_train.shape, x_test.shape


## The second entry of each shape is the number of feature columns, which is
## the number of inputs the first layer of the network must accept.
## NOTE(review): the original note said 13, but the model below declares input_shape=(12,) — confirm against the shapes printed here.

# Building ANN Model

In [None]:
from tensorflow import keras

# Baseline fully connected binary classifier trained on the imbalanced data.
model = keras.Sequential([

    ## First dense layer. The input width is read from x_train instead of being
    ## hard-coded, so the model keeps matching the feature matrix if the
    ## preprocessing above changes the number of columns.
    keras.layers.Dense(12, input_shape=(x_train.shape[1],), activation='relu'),
    keras.layers.Dropout(0.50),    ## dropout is a regularizer against overfitting

    ## Hidden layers.
    ## NOTE(review): dropout rates of 0.70 around layers this small are very
    ## aggressive and may themselves cause underfitting — consider lowering.
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dropout(0.70),

    keras.layers.Dense(150, activation='relu'),
    keras.layers.Dropout(0.70),

    ## Output layer: a single sigmoid unit giving the probability of Exited == 1.
    keras.layers.Dense(1, activation='sigmoid')

])


model.compile(optimizer='SGD',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the baseline model on the training split for 20 epochs.
model.fit(x_train,y_train,epochs=20)

In [None]:
# Evaluate on the test split; returns the loss followed by the compiled metric (accuracy).
model.evaluate(x_test,y_test)

In [None]:
from sklearn.metrics import classification_report

# Predicted probabilities of Exited == 1 for the test split.
y_pred = model.predict(x_test)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels.
y_predicted = (y_pred.ravel() >= 0.5).astype(int).tolist()

# classification_report expects (y_true, y_pred) in that order. Passing them
# the other way round swaps precision with recall and reports the support of
# the predictions instead of the support of the true classes.
print(classification_report(y_test, y_predicted))

Model accuracy is good, but the F1 score is poor for class 1.

It comes out as zero for class 1, versus 88% for class 0.

In [None]:
# Count the rows in each Exited class to check how balanced the target is.
dummy_dataset['Exited'].value_counts()



## Observation from the original run: the two classes are clearly imbalanced.

***Let's create a common neural-network function which we will use for prediction.***

In [None]:
def ANN(xtrain,ytrain,xtest,ytest,loss,weights):
    """Train a small dense binary classifier and report its test performance.

    Parameters
    ----------
    xtrain, ytrain
        Training features (2-D, one column per feature) and labels, passed to
        ``model.fit``.
    xtest, ytest
        Held-out features and labels used for ``model.evaluate`` and for the
        classification report.
    loss
        Loss passed to ``model.compile`` (e.g. ``'binary_crossentropy'``).
    weights
        ``-1`` to train without class weights; any other value is forwarded
        to ``model.fit`` as ``class_weight``.

    Returns
    -------
    The sigmoid outputs for ``xtest`` rounded with ``np.round``.
    """
    # Imported here so the function does not depend on another notebook cell
    # having imported classification_report first.
    from sklearn.metrics import classification_report

    # Take the input width from the data instead of hard-coding 12, so the
    # function also works when the number of feature columns changes.
    n_features = xtrain.shape[1]

    model=keras.Sequential([
    keras.layers.Dense(12,input_shape=(n_features,),activation='relu'),

    keras.layers.Dense(15,activation='relu'), ## hidden layer
    keras.layers.Dropout(0.5),

    ## single sigmoid unit: the output is a probability between 0 and 1
    keras.layers.Dense(1,activation='sigmoid'),

    ])

    model.compile( optimizer="adam",
    loss=loss, ## supplied by the caller
    metrics=['accuracy'])

    # -1 is the sentinel for "no class weighting".
    if weights==-1:
        model.fit(xtrain,ytrain,epochs=100)
    else:
        model.fit(xtrain,ytrain,epochs=100,class_weight=weights)


    print('#############################################################################')
    print('########### ##################################################################')
    print('Model Accuracy is :')
    # model.evaluate returns the loss followed by the accuracy metric.
    print(model.evaluate(xtest,ytest))

    print('#############################################################################')
    print('#############################################################################')

    print("\n")
    ypred3=model.predict(xtest)
    # Rounding the sigmoid outputs gives hard 0/1 labels.
    ypred_ann = np.round(ypred3)

    print('#############################################################################')
    print('#############################################################################')

    print('#############################################################################')
    print("classification report is \n")
    print(classification_report(ytest,ypred_ann))
    print('#############################################################################')



    return ypred_ann

In [None]:
# Train and evaluate the shared network on the original split, without class weights (-1).
ypred_ann=ANN(x_train,y_train,x_test,y_test,'binary_crossentropy',-1)

# **Scroll down in the epoch output box to see the classification report**

**From the classification report:
class 0 has an F1 score of 0.92
class 1 has an F1 score of 0.53**

Let's try to improve the F1 scores for both classes.

# METHOD 1 : UNDERSAMPLING

In [None]:
## Number of rows in class 0 and in class 1 of Exited.
## The counts are looked up by class label. value_counts() sorts by frequency,
## so unpacking its result positionally would silently swap the two numbers
## if class 1 ever became the more frequent class.
exited_counts = df['Exited'].value_counts()
df_class_0, df_class_1 = int(exited_counts.loc[0]), int(exited_counts.loc[1])

df_class_0, df_class_1

In [None]:
## dummy_class_0 holds the rows with Exited == 0,
## dummy_class_1 holds the rows with Exited == 1.
dummy_class_0, dummy_class_1 = (
    dummy_dataset.loc[dummy_dataset['Exited'] == label] for label in (0, 1)
)


In [None]:
## Draw as many class-0 rows as there are class-1 rows (2037 in the original
## run) so that both classes end up with the same number of samples.
## random_state makes the draw reproducible across "Restart & Run All".

dummy_class_0_under = dummy_class_0.sample(n=df_class_1, random_state=42)

dummy_class_0_under.shape

In [None]:
# Stack the undersampled class-0 rows on top of all class-1 rows.
df_test_under = pd.concat((dummy_class_0_under, dummy_class_1))

df_test_under['Exited'].value_counts()

# so classes 0 and 1 now have the same number of rows

# splitting the data

In [None]:
# Features and target of the undersampled frame.
x, y = df_test_under.drop(columns="Exited"), df_test_under['Exited']

## stratify=y keeps the class proportions of y the same in the train and test parts

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


ytrain.value_counts()

In [None]:
# Train and evaluate the shared network on the undersampled split, without class weights (-1).
ypred=ANN(xtrain,ytrain,xtest,ytest,'binary_crossentropy',-1)

In [None]:
## scroll down in the epoch output box to see the classification report


## results recorded from the original run:
## model accuracy : 79.38 %
## F1 score  : class 0 : 0.80
## F1 score  : class 1 : 0.79

# METHOD 2 : OVERSAMPLING

In [None]:
# Display the two class counts again before oversampling.
df_class_0, df_class_1
## class 0 has more samples

In [None]:
## in this method we oversample the data,
## that is, we duplicate minority-class samples
## until the minority class matches the size of the majority class

In [None]:
## Draw df_class_0 rows from the class-1 frame. replace=True is required
## because class 1 has fewer rows (2037 in the original run) than are being
## drawn, so rows must be allowed to repeat.
## random_state makes the draw reproducible across "Restart & Run All".
dummy_class_1_over = dummy_class_1.sample(n=df_class_0, replace=True, random_state=42)

dummy_class_1_over.shape

In [None]:

# Stack all class-0 rows on top of the oversampled class-1 rows.
df_test_over = pd.concat((dummy_class_0, dummy_class_1_over))

df_test_over.shape

In [None]:
# Check the class counts of the oversampled frame.
df_test_over['Exited'].value_counts()

In [None]:

## Split the oversampled frame and run the shared network on it.
## stratify=y keeps the class proportions of y the same in the train and test parts.
##
## NOTE(review): df_test_over contains repeated class-1 rows (drawn with
## replace=True) and is split only afterwards, so copies of the same row can
## land in both the train and the test part. The reported test scores may
## therefore be optimistic; consider splitting first and oversampling the
## training part only.

x, y = df_test_over.drop(columns="Exited"), df_test_over['Exited']

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

ypred = ANN(xtrain, ytrain, xtest, ytest, 'binary_crossentropy', -1)

In [None]:
## scroll down in the epoch output box to see the classification report


## results recorded from the original run:
## model accuracy : 77.06 %
## F1 score  : class 0 : 0.77
## F1 score  : class 1 : 0.77

# METHOD 3 : SMOTE METHOD

In [None]:
# Features and target of the full (still imbalanced) encoded frame, as input for SMOTE.
x, y = dummy_dataset.drop(columns='Exited'), dummy_dataset['Exited']

In [None]:
pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only the minority class with synthetic samples.
# random_state makes the synthetic samples reproducible between runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)

# fit_resample is the supported API; the older fit_sample alias has been
# removed from recent imbalanced-learn releases and raises AttributeError there.
x_sm, y_sm = smote.fit_resample(x, y)

y_sm.value_counts()

## expected: equal number of samples for classes 0 and 1

In [None]:

## Split the SMOTE-resampled data.
## stratify=y_sm keeps the class proportions the same in the train and test
## parts, so ytrain also has an equal number of samples per class.
##
## NOTE(review): SMOTE was applied to the whole dataset before this split, so
## the test part contains synthetic samples derived from rows that may sit in
## the training part. Consider splitting first and resampling the training
## part only.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_sm,
    y_sm,
    test_size=0.2,
    random_state=42,
    stratify=y_sm,
)

ytrain.value_counts()

In [None]:
## Train and evaluate the shared network on the SMOTE-resampled split, without class weights (-1).

ypred=ANN(xtrain,ytrain,xtest,ytest,'binary_crossentropy',-1)

In [None]:
## scroll down in the epoch output box to see the classification report


## results recorded from the original run:
## model accuracy : 80.04 %
## F1 score  : class 0 : 0.80
## F1 score  : class 1 : 0.80

# So the "SMOTE" method gives good accuracy as well as good F1 scores compared to the other methods

# Please upvote the kernel if you liked my work.