In [None]:
## This notebook builds on and improves https://www.kaggle.com/duttasd28/churn-modelling-neural-nets-smote-xgboost.

> __Churn Modelling__ : This problem is usually concerned with predicting whether a customer
is going to stay with a company or not. This is very useful financially for companies as it
helps them to target customer groups.
This is typically a binary classification problem. That is you need to look at the data and
predict whether the person will be __exiting(1)__ or the person will __stay(0)__.

In this notebook, we apply several different approaches to the churn modelling problem.

# Initial Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    paths_here = [os.path.join(root, name) for name in files]
    if paths_here:
        print('\n'.join(paths_here))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the churn dataset, using the RowNumber column as the row index
CHURN_CSV = '/kaggle/input/churn-modelling/Churn_Modelling.csv'
data = pd.read_csv(CHURN_CSV, index_col='RowNumber')

# Preview the first rows
data.head()

As we can see, **Exited** is our dependent feature. Other columns are independent features

Let us check how many values of __Exited__ columns are there so that we can figure out if there is class imbalance or not

In [None]:
# Count how many rows carry each distinct value of the target column
data['Exited'].value_counts()

There are about 8000 examples of '0' and 2000 examples of '1'. This is a class imbalance of roughly 4:1.
This means that if we have a simple naive classifier that predicts 0 all the time, we can easily achieve 80% accuracy.

So, we need to generate additional synthetic samples for our dataset so that modelling is effective.

# Data Preprocessing
In this step, we are going to preprocess our data so that we can use it on our models.

Preprocessing involves the following:
* Checking for NaN values that is missing values in the data
* Visualise the data so that we can derive meaningful insights
* Split to training and test datasets
* Fill in NaN Values
* Convert non numeric features to numeric features so that we can do predictions
* Scale the data 

Let us go ahead with the first step, __checking for NaN/missing values__

# Checking for Missing Values(NaN)

In [None]:
# Check for missing values: for each column, report whether at least one
# entry is null (True means the column contains missing data)
data.isnull().any()

Phew! We are lucky: there are no null values. Real datasets usually contain missing values, which have to be imputed or dropped before modelling.

# Data Visualization
Here we are going to plot graphs regarding the data to get a deeper insight.

In [None]:
# Import necessary plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Make figures inline
%matplotlib inline

Let us get a list of columns in the data so that we can predict better. 
We use the .info() method to get the datatypes too

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
data.info()

**Geography, Gender, Surname** are object data-types, while others are either int / float.

In [None]:
# Box plot of CreditScore for each value of the target column.
# NOTE: sns.set() is called after the figure has been created and before the
# plot is drawn; the statement order is kept exactly as written.
plt.figure(figsize=(8, 8))
sns.set()
sns.boxplot(y = 'CreditScore', x = 'Exited', data = data, palette = 'husl');

In [None]:
# Violin plot of the target column for each gender
plt.figure(figsize=(8, 5))
# `kind` is an argument of the figure-level sns.catplot, not of sns.violinplot.
# Passing it here is at best ignored and, in seaborn versions that forward
# extra keyword arguments to matplotlib, raises an error - so it is omitted.
sns.violinplot(y = 'Exited' , x = 'Gender' , data = data, palette = 'hot');

In [None]:
# Bar chart of the number of customers per geography
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='Geography', ax=ax);

Let us plot a heatmap of the correlations of the features with each other. That will help us discard non useful features.
It also gives us some idea as to what features predict dependent column best

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns
plt.figure(figsize=(10, 10))
sns.set(style='white')
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='magma', square=True);

Pairplot - This plots graphs between every two variables. This is useful for visualisation

In [None]:
# Pairplot: scatter plots for every pair of numeric columns, coloured by the target.
# sns.pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(...) call would only emit an extra, empty figure and would
# not change the size of the pair plot. Use pairplot's `height` / `aspect`
# arguments if a different size is wanted.
sns.pairplot(data = data, corner = True, hue = 'Exited');

# Converting non numeric features to numeric features
We convert non numeric features to numeric features.
Also we drop columns which do not seem to contribute anything useful like **CustomerId**, **Surname**.

But first we will split the dataset into train and test dataset.

In [None]:
# Drop the columns that the analysis treats as not useful for prediction.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
data = data.drop(columns=['CustomerId', 'Surname'], errors='ignore')

In [None]:
# The last column is the dependent variable; every column before it is a feature
target_name = data.columns[-1]
X = data.drop(columns=target_name)
y = data[target_name].astype('float')

# Preview the feature frame
X.head()

In [None]:
# Splitting to train / validation datasets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_val, y_train, y_val = split_parts

# Show the size of each split
len(y_train), len(y_val)

In [None]:
# Renumber the rows of every split from 0, discarding the old index, so that
# features and labels of a split share the same fresh positional index
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [None]:
# Names of the non-numeric feature columns that need to be encoded.
# select_dtypes is used instead of comparing each dtype to `object`, so text
# columns stored with a dedicated string or categorical dtype are found as well.
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

In [None]:
# Label encoder object
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Encode every categorical column: the encoder is fitted on the training values
# and that same fitted mapping is then applied to the validation values
train_codes, val_codes = {}, {}
for name in categorical_cols:
    train_codes[name] = label_encoder.fit_transform(X_train[name])
    val_codes[name] = label_encoder.transform(X_val[name])

# Collect the encoded columns into data frames (default 0..n-1 index)
X_train_categorical = pd.DataFrame(train_codes)
X_val_categorical = pd.DataFrame(val_codes)

# Swap the original text columns for their integer-encoded versions
X_train = X_train.drop(columns=categorical_cols).join(X_train_categorical)
X_val = X_val.drop(columns=categorical_cols).join(X_val_categorical)

# Generating new data by oversampling
Since we have an imbalanced dataset, we will increase the number of samples by SMOTE technique

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples (and every score computed from them)
# are reproducible between runs
smk = SMOTE(random_state=1)

# Oversample the training data only.
# fit_resample is the current imbalanced-learn method name; the older
# fit_sample alias has been removed from the library.
X_train, y_train = smk.fit_resample(X_train, y_train)

# The validation split is deliberately NOT resampled: scores must be measured
# on real customers with the real class ratio. Evaluating on synthetic rows
# would make the metrics describe data that never occurs in practice.

# Final check at the dataset before putting in model
Now we take a final look at the dataset

In [None]:
# Display the (rows, columns) shape of the training and validation feature sets
X_train.shape, X_val.shape

In [None]:
# Show the first five rows of the training features
X_train.head(5)

In [None]:
# Count the training labels per class
y_train.value_counts()

# Scaling
We scale the data so that datapoints are on the same level

### Note: some columns hold label-encoded categories, so we scale only the continuous columns. Otherwise the encoded values would lose their meaning.

In [None]:
columns = ['CreditScore', 'Balance', 'EstimatedSalary', 'Age']  ## Columns to modify

## Subtract the mean, divide by standard deviation.
## The statistics are computed on the training split only and reused for the
## validation split, so no information from the validation data enters the scaling.
for col in columns:
    colMean = X_train[col].mean()
    colStdDev = X_train[col].std()
    # Vectorised column arithmetic: same result as a per-element apply(lambda),
    # without calling a Python function once per row
    X_train[col] = (X_train[col] - colMean) / colStdDev
    X_val[col] = (X_val[col] - colMean) / colStdDev

In [None]:
# Preview the training features after scaling
X_train.head()

# Models
We will be using the following models 
* Logistic Regression
* Decision Tree
* Random Forest Classifier
* Extra Trees Classifier
* XGBClassifier
* ANN

In [None]:
# metric
from sklearn.metrics import f1_score

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the library default to give the lbfgs solver more
# iterations to converge
model = LogisticRegression(solver = 'lbfgs', max_iter = 300)

# fit the data
model.fit(X_train, y_train)

# Get predictions
y_preds = model.predict(X_val)

# Get score. f1_score's signature is (y_true, y_pred): the ground truth goes
# first, matching the other model cells in this notebook.
f1_score(y_val, y_preds)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: tree construction involves randomness, so without it the score
# changes from run to run
model = DecisionTreeClassifier(random_state=1)

# fit the data
model.fit(X_train, y_train)

# Get predictions
y_preds = model.predict(X_val)

# Get score. f1_score's signature is (y_true, y_pred): the ground truth goes
# first, matching the other model cells in this notebook.
f1_score(y_val, y_preds)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: the forest is built from random bootstrap samples and random
# feature subsets, so without it the score changes from run to run
model = RandomForestClassifier(random_state=1)

# fit the data
model.fit(X_train, y_train)

# Get predictions
y_preds = model.predict(X_val)

# Get score (ground truth first, predictions second)
f1_score(y_val, y_preds)

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed: split thresholds are drawn at random, so without it the score
# changes from run to run
model = ExtraTreesClassifier(random_state=1)

# fit the data
model.fit(X_train, y_train)

# Get predictions
y_preds = model.predict(X_val)

# Get score (ground truth first, predictions second)
f1_score(y_val, y_preds)

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Train a gradient-boosted classifier with the library's default settings
# (fit returns the fitted estimator itself)
model = XGBClassifier().fit(X_train, y_train)

# Predict on the validation features and score the predictions with F1
y_preds = model.predict(X_val)
f1_score(y_val, y_preds)


In [None]:
import xgboost as xgb

# Aliases for the validation split used by the native xgboost API below
X_valid = X_val
y_valid = y_val

# Booster settings: binary logistic objective, evaluated with AUC
params = {"objective":"binary:logistic",'colsample_bytree': 0.9,'learning_rate': 0.05,
                'max_depth': 7, 'eval_metric':'auc'}

# Wrap both splits in DMatrix objects; feature names are passed as plain lists of strings
dtrain = xgb.DMatrix(data=X_train.values,feature_names=X_train.columns.tolist(),label=y_train.values)
dvalid = xgb.DMatrix(data=X_valid.values,feature_names=X_valid.columns.tolist(),label=y_valid.values)

# Early stopping monitors the LAST entry of `evals`, so the validation set must
# come last. With the training set last, stopping would be driven by the
# training AUC, which keeps improving and never signals overfitting.
mod = xgb.train(params=params,
                dtrain=dtrain,
                num_boost_round=1000,
                early_stopping_rounds=50,
                evals=[(dtrain,'train'), (dvalid,'valid')],
                verbose_eval=20)


In [None]:
# Predicted scores of the boosted model on the validation set
from numpy import argmax
from sklearn.metrics import roc_auc_score
y_preds = mod.predict(dvalid)

# Sweep candidate decision thresholds and record the F1 score obtained at each
threshold = list(np.arange(0.1, 0.4, 0.005))
f1score = [f1_score(y_val, y_preds > cutoff) for cutoff in threshold]
plt.plot(threshold, f1score)

# Report the threshold with the highest F1, plus the threshold-independent ROC AUC
ix = argmax(f1score)
print('best threshold = %f' % threshold[ix])
print('best f1score = %f' % f1score[ix])
xgb_roc_auc_score = roc_auc_score(y_val, y_preds)
print('best AUCROC = %f' % xgb_roc_auc_score)