In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

In [None]:
#Modules for EDA
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so prefer the new name and fall back to the
# legacy one only on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

#Modules for Machine Learning
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
%matplotlib inline

In [None]:
# Load the churn dataset and display its (rows, columns) shape.
csv_path = '../input/deep-learning-az-ann/Churn_Modelling.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

**No NaN values**

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Keep only the columns whose dtype is 'object' (the non-numeric columns).
column_types = df.dtypes
column_types[column_types == 'object']

# **EDA**

In [None]:
# Preview the first rows of the raw data.
df.head()

**Dropping the RowNumber, CustomerId and Surname columns**

In [None]:
# Remove identifier columns that carry no predictive signal.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['RowNumber','CustomerId','Surname'], errors='ignore')
df.head()

# **Exited Ratio**

In [None]:
# Pie chart of the target classes, with the total row count as the x label.
exited_counts = df['Exited'].value_counts()
exited_counts.plot(kind='pie', autopct='%.2f')
plt.title("Exited Ratio")
plt.xlabel(f"Total: {exited_counts.sum()}")
plt.show()

**Data is imbalanced :(**

# **NumOfProducts, HasCrCard and IsActiveMember**

In [None]:
# One count plot per categorical feature, split by the Exited target.
category_data = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']
for feature in category_data:
    subset = df[[feature, 'Exited']]
    sns.countplot(data=subset, x=subset[feature], hue='Exited')
    plt.title(f"{feature} data with respect to Exited")
    plt.show()

# **Age and Exited**

In [None]:
# Age distribution, drawn as one polygon per Exited class.
age_exit = df[['Age', 'Exited']]
sns.histplot(data=age_exit, x='Age', hue='Exited', element='poly')
plt.title("Age Group")
plt.show()

# **Credit Score**

In [None]:
# Credit score distribution, drawn as one polygon per Exited class.
score_exit = df[['CreditScore', 'Exited']]
sns.histplot(data=score_exit, x='CreditScore', hue='Exited', element='poly')
plt.title("CreditScore")
plt.show()

# **Country**

In [None]:
# Number of customers per country.
geo_exit = df[['Geography', 'Exited']]
sns.countplot(data=geo_exit, x='Geography')
plt.title("Countries Frequency")
plt.show()

# **Countries Exit Ratio**

In [None]:
# Customers per country, split by the Exited target.
geo_exit = df[['Geography', 'Exited']]
sns.countplot(data=geo_exit, x='Geography', hue='Exited')
plt.title('Countries Exit Ratio')
plt.show()

# **Balance vs EstimatedSalary**

In [None]:
# Balance distribution, drawn as one polygon per Exited class.
balance_exit = df[['Balance', 'Exited']]
sns.histplot(data=balance_exit, x='Balance', hue='Exited', element='poly')
plt.show()

In [None]:
# Estimated salary distribution, drawn as one polygon per Exited class.
salary_exit = df[['EstimatedSalary', 'Exited']]
sns.histplot(data=salary_exit, x='EstimatedSalary', hue='Exited', element='poly')
plt.show()

In [None]:
# Scatter of salary against balance, coloured by the Exited target.
salary_balance = df[['EstimatedSalary', 'Balance', 'Exited']]
sns.relplot(
    data=salary_balance,
    x='Balance',
    y='EstimatedSalary',
    hue='Exited',
    palette='rocket',
)
plt.title("EstimatedSalary vs Balance")
plt.show()

# **Gender Analysis**

In [None]:
# Number of customers per gender.
gender = df['Gender']
sns.countplot(x=gender)
plt.title('Gender Ratio')
plt.show()

# **Which Gender has more balance?**

In [None]:
# Balance distribution, drawn as one polygon per gender.
gender_balance = df[['Gender', 'Balance']]
sns.histplot(data=gender_balance, x='Balance', hue='Gender', element='poly')
plt.show()

# **Gender Exit Ratio**

In [None]:
# Customers per gender, split by the Exited target.
gender_exit = df[['Gender', 'Exited']]
sns.countplot(data=gender_exit, x='Gender', hue='Exited')
plt.show()

### **Females are more likely to exit than males**

# **Gender Ratio in each country**

In [None]:
# Customers per country, split by gender.
geo_gender = df[['Geography', 'Gender']]
sns.countplot(data=geo_gender, x='Geography', hue='Gender')
plt.title("Gender Ratio in each country")
plt.show()

In [None]:
# One count plot per categorical feature, split by gender.
category_data = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']
for feature in category_data:
    subset = df[[feature, 'Gender']]
    sns.countplot(data=subset, x=subset[feature], hue='Gender')
    plt.title(f"{feature} data with respect to Gender")
    plt.show()

# **Data Preprocessing**

### **One Hot Encoding**

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid redundant (perfectly collinear) indicator columns.
cols_to_encode = ['Geography', 'Gender']
# pandas >= 2.0 returns boolean dummy columns by default. Mixed with the float
# feature columns these produce an object-dtype array when converted to NumPy,
# which Keras cannot turn into a tensor. Pin the dtype to the numeric 0/1
# encoding that older pandas versions produced.
dummies = pd.get_dummies(df[cols_to_encode], drop_first=True, dtype='uint8')
dummies.sample(5)

In [None]:
# Drop the raw categorical columns now that their dummy encoding exists.
# Reassigning with errors='ignore' keeps this cell safe to re-run, whereas an
# in-place drop raises KeyError the second time.
df = df.drop(columns=cols_to_encode, errors='ignore')
df.head()

### **Feature Scaling**

In [None]:
# Rescale the numeric features to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split further down, so test-set min/max values influence the scaling.
cols_to_scale = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
scale = MinMaxScaler()
scalled = scale.fit_transform(df[cols_to_scale])

In [None]:
# Write each scaled column back into the frame; column order in `scalled`
# follows the order of `cols_to_scale`.
for position, col in enumerate(cols_to_scale):
    df[col] = scalled[:, position]

**Scaled data**

In [None]:
# Preview the frame after the numeric columns were replaced by scaled values.
df.head()

In [None]:
# Join the scaled features and the dummy columns side by side.
frames = [df, dummies]
new_df = pd.concat(frames, axis=1)
new_df.shape

In [None]:
# Preview the final modelling frame (scaled features plus dummy columns).
new_df.head()

# **Splitting and Training**

In [None]:
# Separate the feature matrix from the Exited target.
x = new_df.drop('Exited', axis=1)
y = new_df['Exited']
x.shape, y.shape

In [None]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the (imbalanced) Exited ratio identical in both splits, and
# a fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
y_train.value_counts()  # class counts in the training split; noted here as imbalanced

# **Model Building and Predictions**

In [None]:
# Baseline network: one hidden ReLU layer (11 units) and a sigmoid output that
# yields the churn probability.
# The input width is read from the training data instead of being hard-coded,
# so the model keeps working if the feature set changes.
n_features = x_train.shape[1]
model = keras.Sequential([
    keras.layers.Dense(11, input_shape=(n_features,),activation='relu'),
    keras.layers.Dense(1,activation='sigmoid')
])

model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

In [None]:
# Train the baseline model on the training split for 25 epochs.
model.fit(x_train, y_train, epochs=25)

In [None]:
# Compute the compiled loss and accuracy on the test split.
model.evaluate(x_test,y_test)

In [None]:
def predict(model, x, threshold=0.5):
    """Return hard 0/1 class labels from a model that outputs probabilities.

    Parameters
    ----------
    model : object exposing ``predict(x)`` that returns an array of probabilities.
    x : features passed straight through to ``model.predict``.
    threshold : float, default 0.5
        Probabilities greater than or equal to this value map to 1, the rest to 0.

    Returns
    -------
    numpy.ndarray
        1-D array of 0/1 values with the same dtype as the model output.
    """
    probs = np.asarray(model.predict(x))
    # Build a new array rather than overwriting the model's output in place.
    return (probs >= threshold).astype(probs.dtype).flatten()
def report(y_true, y_pred, title=None):
    """Plot the confusion matrix as a heatmap and print the classification report.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    title : optional title for the heatmap figure.
    """
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(7, 7))
    sns.heatmap(matrix, annot=True, fmt='g')
    plt.title(title)
    plt.show()

    print("Classification report")
    summary = classification_report(y_true, y_pred)
    print(summary)

# **Training vs Testing**

In [None]:
# Hard 0/1 predictions of the baseline model, test split first, then train.
y_pred_test, y_pred_train = predict(model, x_test), predict(model, x_train)

In [None]:
# Confusion matrix and classification report for the test split.
report(y_test,y_pred_test,'Test Data')

In [None]:
# Confusion matrix and classification report for the training split.
report(y_train,y_pred_train,'Train Data')

# **Overfitting :(**

## **Oversampling the data using SMOTE**

In [None]:
smote = SMOTE(sampling_strategy='minority')
smote_data,target = smote.fit_resample(x,y)
smote_data.shape

In [None]:
target.shape

In [None]:
target.value_counts().plot(kind='pie',autopct="%.2f")
plt.show()

In [None]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(smote_data, target, test_size=0.3)
x_train2.shape, x_test2.shape, y_train2.shape, y_test2.shape

In [None]:
def mymodel(input_dim=11, hidden_units=11, learning_rate=0.01):
    """Build and compile a single-hidden-layer binary classifier.

    Parameters
    ----------
    input_dim : int, default 11
        Number of input features.
    hidden_units : int, default 11
        Width of the hidden ReLU layer.
    learning_rate : float, default 0.01
        Learning rate for the Adam optimizer.

    Returns
    -------
    A compiled ``keras.Sequential`` model with a sigmoid output, trained with
    binary cross-entropy and reporting accuracy. Calling ``mymodel()`` with no
    arguments builds the same network as before.
    """
    model = keras.Sequential([
        keras.layers.Dense(hidden_units, input_shape=(input_dim,),activation='relu'),
        keras.layers.Dense(1,activation='sigmoid')
    ])

    model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
    return model

In [None]:
# Train a fresh model on the second training set: 100 epochs, batches of 64.
model2 = mymodel()
fit_options = dict(epochs=100, batch_size=64)
model2.fit(x_train2, y_train2, **fit_options)

In [None]:
# Compute the compiled loss and accuracy of model2 on x_test2 / y_test2.
model2.evaluate(x_test2,y_test2)

In [None]:
# Confusion matrix and classification report of model2 on the second test set.
y_pred_test2 = predict(model2, x_test2)
report(y_test2, y_pred_test2, 'Test 2 Data')

# **Overfitting problem solved, but accuracy has not improved**

In [None]:
# Confusion matrix and classification report of model2 on the second training set.
y_pred_train2 = predict(model2, x_train2)
report(y_train2, y_pred_train2, 'Train 2 Data')