# Author: Priyanka Prusty

# Imports

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings("ignore")
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import pickle
from sklearn.externals import joblib
from keras.models import load_model
import os

# Check if training is completed

In [26]:
'''
To run this notebook, training must be completed 
and the following files must be present in the same
directory as that of the test_code.ipynb
1. selected_features.txt
2. kerasModel.h5
3. scaler.pkl
These 3 files are also provides as a 'trained_model_files.zip' file in the 
canvas submission. 
Please extract the files in the same folder as this notebook is present and run this notebook again.

'''
isTrainingOver = False
# We just need to check if there is a saved keras model. If the keras model is 
# saved than the other files will be present
if(os.path.isfile('kerasModel.h5')):
    isTrainingOver = True

   # Load Data
 

In [27]:
df_judge = pd.read_csv('judge.csv')

In [28]:
df_judge

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15605215,Stevenson,767,France,Male,48,9,0.00,2,0,1,175458.21
1,15567855,Chukwufumnanya,623,France,Female,29,1,0.00,2,0,0,39382.06
2,15780256,Palfreyman,630,France,Male,34,9,0.00,2,1,1,114006.35
3,15592229,Mullan,713,France,Female,52,0,185891.54,1,1,1,46369.57
4,15788683,Kang,588,Germany,Female,34,10,129417.82,1,1,0,153727.32
5,15693203,Powell,710,Spain,Female,75,5,0.00,2,1,1,9376.89
6,15635125,Findlay,566,Spain,Male,63,2,120787.18,2,1,1,52198.84
7,15582129,Hsia,517,France,Male,62,1,43772.66,3,1,0,187756.24
8,15703482,Walker,710,Germany,Male,34,9,134260.36,2,1,0,147074.67
9,15670738,Mazzanti,733,Germany,Male,45,2,113939.36,2,1,0,3218.71


# Preprocessing

## Handle missing data

In [29]:
#missing data in the dataset
number_of_missingdata = df_judge.isnull().sum().sort_values(ascending=False)
percent_of_missingdata = (df_judge.isnull().sum()/df_judge.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([number_of_missingdata, percent_of_missingdata], axis=1, 
                         keys=['Number_of_missingdata', 'Percent_of_missingdata'])
print(missing_data.head(20))

print("*"*50)

#total number of missing data
total=df_judge.isnull().sum().sum()
print('Total missing data=',total)


#I found no missing data in the dataset, hence I am getting 0 missing datas.




                 Number_of_missingdata  Percent_of_missingdata
EstimatedSalary                      0                     0.0
IsActiveMember                       0                     0.0
HasCrCard                            0                     0.0
NumOfProducts                        0                     0.0
Balance                              0                     0.0
Tenure                               0                     0.0
Age                                  0                     0.0
Gender                               0                     0.0
Geography                            0                     0.0
CreditScore                          0                     0.0
Surname                              0                     0.0
CustomerId                           0                     0.0
**************************************************
Total missing data= 0


## Drop irrelevant columns

In [30]:
##Save customer id for later use to create final judge-pred.csv file
df_customerId = df_judge['CustomerId']
#Drop the irrelevant columns
df_judge= df_judge.drop(['Surname','CustomerId'],axis=1)

In [31]:
## Handling catagorical variables

In [32]:
#number of catagorical variables
categorical_variables = df_judge.dtypes[df_judge.dtypes == "object"].index
catg_list=df_judge[categorical_variables]
numerical_variables = df_judge.dtypes[df_judge.dtypes != "object"].index

print("Number of Categorical features: ", len(categorical_variables))
print("*"*50)
print(catg_list.head(3))
print("*"*50)   


#introduce dummy variables    
df_judge = pd.get_dummies(df_judge,categorical_variables)

Number of Categorical features:  2
**************************************************
  Geography  Gender
0    France    Male
1    France  Female
2    France    Male
**************************************************


In [33]:
df_judge

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,767,48,9,0.00,2,0,1,175458.21,1,0,0,0,1
1,623,29,1,0.00,2,0,0,39382.06,1,0,0,1,0
2,630,34,9,0.00,2,1,1,114006.35,1,0,0,0,1
3,713,52,0,185891.54,1,1,1,46369.57,1,0,0,1,0
4,588,34,10,129417.82,1,1,0,153727.32,0,1,0,1,0
5,710,75,5,0.00,2,1,1,9376.89,0,0,1,1,0
6,566,63,2,120787.18,2,1,1,52198.84,0,0,1,0,1
7,517,62,1,43772.66,3,1,0,187756.24,1,0,0,0,1
8,710,34,9,134260.36,2,1,0,147074.67,0,1,0,0,1
9,733,45,2,113939.36,2,1,0,3218.71,0,1,0,0,1


## Select same features that was used in the best model 

In [34]:
selected_feats = []

# open selected feature file read the contents into a list
if(isTrainingOver):
    with open('selected_features.txt', 'r') as filehandle:
        for line in filehandle:
            # remove linebreak which is the last character of the string
            feature = line[:-1]
            # add item to the list
            selected_feats.append(feature)
else:
    print('Please run the traing_code.ipynb first or extract the trained_model_files.zip files in the same directory')

In [35]:
selected_feats

['Exited',
 'Age',
 'Geography_Germany',
 'IsActiveMember',
 'Balance',
 'Geography_France',
 'Gender_Female',
 'Gender_Male',
 'Geography_Spain',
 'NumOfProducts',
 'CreditScore',
 'Tenure']

In [36]:
df_judge_final=df_judge.filter(items=selected_feats)

In [38]:
df_judge_final

Unnamed: 0,Age,Geography_Germany,IsActiveMember,Balance,Geography_France,Gender_Female,Gender_Male,Geography_Spain,NumOfProducts,CreditScore,Tenure
0,48,0,1,0.00,1,0,1,0,2,767,9
1,29,0,0,0.00,1,1,0,0,2,623,1
2,34,0,1,0.00,1,0,1,0,2,630,9
3,52,0,1,185891.54,1,1,0,0,1,713,0
4,34,1,0,129417.82,0,1,0,0,1,588,10
5,75,0,1,0.00,0,1,0,1,2,710,5
6,63,0,1,120787.18,0,0,1,1,2,566,2
7,62,0,0,43772.66,1,0,1,0,3,517,1
8,34,1,0,134260.36,0,0,1,0,2,710,9
9,45,1,0,113939.36,0,0,1,0,2,733,2


# Standardization

In [39]:
##Scale the data from saved scaler that was fit while constructing the best model
if(isTrainingOver):
    scaler = joblib.load('scaler.pkl')
    X_judge = scaler.transform(df_judge_final)
else:
    print('Please run the traing_code.ipynb first or extract the trained_model_files.zip files in the same directory')

In [41]:
X_judge[0]

array([-6.23002387e+00, -3.68961466e+00, -1.39776873e+00, -1.22607103e+00,
       -9.03802018e-01, -1.02703190e+00,  9.90049504e-01, -5.79060940e-01,
        4.08943371e+00,  1.53973390e+03,  1.69804617e+01])

## Load the saved model from test_code.ipynb

In [11]:

##Read the model from model file
if(isTrainingOver):
    best_ann_model = load_model('kerasModel.h5')
else:
    print('Please run the traing_code.ipynb first or extract the trained_model_files.zip files in the same directory')








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


## Predict

In [12]:
if(isTrainingOver):
    y_judge = best_ann_model.predict_classes(X_judge)
else:
    print('Please run the traing_code.ipynb first or extract the trained_model_files.zip files in the same directory')

In [13]:
#Save Judge prediction in a file
if(isTrainingOver):
    customerId_list = df_customerId.values.tolist()
    file_handler = open('judge-pred3.csv','w')
    file_handler.write('CustomerID,Exited\n')
    for i in range (0, len(y_judge)):
        file_handler.write('%d,%d\n' %(customerId_list[i], y_judge[i]))
    file_handler.close()
else:
    print('Please run the traing_code.ipynb first or extract the trained_model_files.zip files in the same directory')