In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw churn dataset from the Kaggle input directory.
CHURN_CSV_PATH = '../input/bank-customer-churn-modeling/Churn_Modelling.csv'
df = pd.read_csv(CHURN_CSV_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Print column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
# Remove the row number, customer id and surname columns from the frame.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [None]:
# Preview the frame after dropping the identifier columns.
df.head(n=5)

In [None]:
# Select the object-dtype columns and display them.
# NOTE: the name `cat_features` is rebound to a list of column names further down.
cat_features = df.select_dtypes(include='object')
cat_features

In [None]:
# Derived feature: credit score divided by age.
df['CreditScoreByAge'] = df['CreditScore'].div(df['Age'])

In [None]:
# Dummy-encode Gender, dropping the first category so a single indicator
# column remains; that column is labelled 'gender_male'.
gender_dummies = pd.get_dummies(df['Gender'], drop_first=True)
gender_dummies.columns = ['gender_male']
# Swap the original Gender column for the indicator (appended as the last column).
df = pd.concat([df.drop(columns='Gender'), gender_dummies], axis=1)

In [None]:
# Binary geography flag: 0 for 'Spain', 1 for every other value.
df['Geography'] = (df['Geography'] != 'Spain').astype(int)

In [None]:
### detecting outlier

def detect_outlier(dataframe, feature, thresh=3):
    """Return the values of ``dataframe[feature]`` whose z-score exceeds ``thresh``.

    Only the upper tail is flagged (z-score strictly greater than ``thresh``),
    using the population standard deviation (``np.std`` default, ddof=0).

    Each call returns a fresh list. Results are no longer accumulated in a
    module-level list, so calling the function again or re-running the cell
    does not leak or duplicate values from earlier calls.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Name of the numeric column.
    thresh : float, default 3
        Z-score above which a value is reported as an outlier.

    Returns
    -------
    list
        Outlier values in their original row order; empty when the column is
        empty or has zero (or undefined) spread.
    """
    values = dataframe[feature]
    mean = np.mean(values)
    std = np.std(values)

    # A zero or NaN standard deviation means no z-score is defined, so there
    # is nothing to flag (this also avoids a divide-by-zero warning).
    if not std > 0:
        return []

    z_scores = (values - mean) / std
    return values[z_scores > thresh].tolist()

In [None]:
# List the Age values flagged as outliers.
detect_outlier(dataframe=df, feature='Age')

In [None]:
# Replace outliers with the border value: ages of 71 and above become 71.
df['Age'] = df['Age'].clip(upper=71)

In [None]:
# Preview the frame after capping Age.
df.head(n=5)

In [None]:
# Derived feature: estimated salary divided by (capped) age.
df['Salary/Age'] = df['EstimatedSalary'].div(df['Age'])

In [None]:
# Multiple-products flag: 1 when the customer holds more than one product, else 0.
df['NumOfProducts'] = (df['NumOfProducts'] > 1).astype(int)

In [None]:
# Names of the binary/categorical indicator columns.
cat_features = [
    'Geography',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'gender_male',
]

In [None]:
# Names of the continuous feature columns.
cont_features = [
    'Age',
    'Balance',
    'EstimatedSalary',
    'CreditScoreByAge',
    'Salary/Age',
]

In [None]:
# Recode 0 as -1 in every indicator column (the author's stated intent is to
# decorrelate them); all other values are kept as they are.
for col in cat_features:
    is_zero = df[col] == 0
    df[col] = np.where(is_zero, -1, df[col])

In [None]:
# Preview the frame after recoding the indicator columns.
df.head(n=5)

In [None]:
# Feature matrix: every column except the target and the raw credit score.
X = df.drop(columns=['Exited', 'CreditScore']).to_numpy()
# Target vector: the churn label.
y = df['Exited'].to_numpy()

In [None]:
# Dimensions of the feature matrix.
np.shape(X)

In [None]:
# Dimensions of the target vector.
np.shape(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

In [None]:
# Encoder: three ReLU dense layers narrowing 11 -> 5 -> 2 units.
encoder = Sequential([
    Dense(11, activation="relu"),
    Dense(5, activation="relu"),
    Dense(2, activation="relu"),
])

In [None]:
# Decoder: expands the encoder's code back to one output per input feature.
# The output layer is linear rather than ReLU: the network is fed
# StandardScaler output, which contains negative values that a ReLU
# (non-negative) output could never reproduce.
n_input_features = scaled_X_train.shape[1]
decoder = Sequential()
decoder.add(Dense(5, activation="relu"))
decoder.add(Dense(n_input_features, activation="linear"))

In [None]:
# Chain encoder and decoder into a single model and configure training.
# NOTE(review): binary cross-entropy and accuracy are classification settings;
# a regression loss would be expected for a model that reconstructs
# standardized inputs -- confirm the intended objective.
autoencoder = Sequential(layers=[encoder, decoder])
autoencoder.compile(
    optimizer="SGD",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# An autoencoder learns to reconstruct its own input, so the training target
# is the scaled feature matrix itself, not the churn labels. The model is
# re-compiled here with mean squared error because the targets are
# standardized real values, for which cross-entropy and accuracy do not apply.
autoencoder.compile(loss="mse", optimizer="SGD")
autoencoder.fit(
    scaled_X_train,
    scaled_X_train,
    epochs=40,
    validation_data=(scaled_X_test, scaled_X_test),
)

In [None]:
# Classification head: a single sigmoid unit.
lower_layer = Sequential([Dense(1, activation="sigmoid")])

In [None]:
# Classifier: the (already trained) encoder followed by the sigmoid head.
final_model = Sequential(layers=[encoder, lower_layer])
final_model.compile(
    optimizer="SGD",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the classifier on the churn labels for 10 epochs.
# NOTE: the test split doubles as validation data here.
final_model.fit(
    x=scaled_X_train,
    y=y_train,
    epochs=10,
    validation_data=(scaled_X_test, y_test),
)

In [None]:
# Metric history dictionary held by the classifier's History object.
training_history = final_model.history
losses = training_history.history

In [None]:
# Turn the history dictionary into a DataFrame.
losses = pd.DataFrame(data=losses)

In [None]:
# Plot training loss against validation loss.
losses.loc[:, ["loss", "val_loss"]].plot()

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5, which yields the same (n_samples, 1) int32 array of 0/1 labels.
predictions = (final_model.predict(scaled_X_test) > 0.5).astype("int32")

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_auc_score,roc_curve,balanced_accuracy_score

In [None]:
# ROC AUC is a ranking metric, so it is computed from the predicted
# probabilities rather than from the thresholded 0/1 labels.
test_probabilities = final_model.predict(scaled_X_test).ravel()
print(roc_auc_score(y_test, test_probabilities))
# scikit-learn metrics take (y_true, y_pred): rows are true classes and
# columns are predicted classes.
print(confusion_matrix(y_test, predictions))

In [None]:
# Share of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

In [None]:
# Balanced accuracy averages recall over the *true* classes, so the ground
# truth must be the first argument (the metric is not symmetric).
print(balanced_accuracy_score(y_test, predictions))

In [None]:
# Ground truth first, predictions second: with the arguments swapped the
# reported precision and recall would be exchanged.
print(classification_report(y_test, predictions))

In [None]:
# roc_curve expects (y_true, y_score). The scores are the predicted
# probabilities, so the curve has one point per distinct threshold instead of
# the single point produced by hard 0/1 labels.
fpr, tpr, threshold = roc_curve(y_test, final_model.predict(scaled_X_test).ravel())

In [None]:
# Class balance of the target. The column is passed by keyword because the
# first positional parameter of countplot is `data` in seaborn >= 0.12.
sns.countplot(x="Exited", data=df)