In [25]:
import numpy as np
import matplotlib.pyplot as plt
import time
import utils
import pandas as pd
import gc

from mlrose_hiive.algorithms.decay import ExpDecay
from mlrose_hiive.neural import NeuralNetwork

from sklearn.metrics import log_loss, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

class color:
    """ANSI escape sequences used to style text printed by this notebook.

    Prefix a string with one or more of these codes and finish it with
    ``END`` (or its alias ``ENDC``) to reset the styling afterwards.
    """

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Semantic aliases: same escape codes as the colours they point to
    WARNING = YELLOW
    FAIL = RED

    # Reset sequence, available under two names
    END = '\033[0m'
    ENDC = END

In [26]:
#stratified sampling in sklearn
def stratified_sampling(vegito, target, size = 0.4, seed = 903949505):
    """Split a DataFrame into stratified train/test parts and report on the result.

    The class distribution of the full, train and test frames is displayed,
    followed by the shape and first rows of each returned frame.

    Parameters
    ----------
    vegito : pandas.DataFrame
        Full dataset, including the ``target`` column.
    target : str
        Name of the label column. The split is stratified on it and it is
        separated from the feature columns in the returned frames.
    size : float, default 0.4
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int, default 903949505
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_X, train_y, test_X, test_y)``; the ``*_y`` frames hold only
        the ``target`` column.
    """

    def _heading(text, shade):
        # Bold + underlined heading in the given colour, reset at the end.
        return color.BOLD + color.UNDERLINE + shade + text + color.END

    def _show_distribution(frame):
        # Absolute and percentage class counts of `target`, side by side.
        counts = frame[target].value_counts(normalize=False)
        percents = frame[target].value_counts(normalize=True) * 100
        display(pd.concat([counts, percents], axis=1, keys=['counts', '%']))

    def _show_shape(title, frame):
        # DataFrame.shape is (rows, columns); report the two in that order.
        n_rows, n_cols = frame.shape
        print("\n" + "\n" + _heading(title, color.BLUE))
        print("Number of rows are " + color.BOLD + str(n_rows) + color.END
              + " and number of columns are " + color.BOLD + str(n_cols) + color.END)
        display(frame.head())

    gc.collect()
    train, test = train_test_split(vegito, test_size=size, stratify=vegito[target], random_state=seed)

    print(_heading("Original distribution", color.DARKCYAN))
    _show_distribution(vegito)
    print("\n" + "\n" + _heading("Train distribution", color.WARNING))
    _show_distribution(train)
    print("\n" + "\n" + _heading("Test distribution", color.RED))
    _show_distribution(test)

    # Separate the features (X) from the label column (y) for both parts.
    train_X = train.drop(target, axis=1)
    test_X = test.drop(target, axis=1)
    train_y = pd.DataFrame(train[target])
    test_y = pd.DataFrame(test[target])

    _show_shape("Train_x shape", train_X)
    _show_shape("Train_y shape", train_y)
    _show_shape("Test_x shape", test_X)
    _show_shape("Test_y shape", test_y)

    return train_X, train_y, test_X, test_y

In [40]:
def plot_nn_perf(trainX, testX, trainY, testY, random_seeds, **kwargs):
    """Train and score a neural network with four different weight optimizers.

    For every algorithm (random hill climbing, simulated annealing, genetic
    algorithm, gradient descent), every random seed and every iteration
    budget, a one-hidden-layer network is fitted on a stratified 80% part of
    the training data and scored on that part, on the held-out 20%
    validation part and on the test data. Despite its name, this function
    draws no figures; it prints progress, displays the running results table
    after each seed, and returns that table.

    Parameters
    ----------
    trainX, trainY : pandas.DataFrame
        Training features and labels; split again internally into train and
        validation parts.
    testX, testY : pandas.DataFrame
        Test features and labels; never used for fitting.
    random_seeds : iterable of int
        Seeds used both for the train/validation split and for the network.
    **kwargs
        Required keys: ``rhc_max_iters``, ``sa_max_iters``, ``ga_max_iters``,
        ``gd_max_iters`` (iterables of iteration budgets, one per algorithm),
        ``init_temp``, ``exp_decay_rate``, ``min_temp`` (simulated-annealing
        decay schedule), ``pop_size`` and ``mutation_prob``.

    Returns
    -------
    pandas.DataFrame
        One row per (algorithm, seed, iteration budget) with the columns
        listed in ``columns`` below.

    Raises
    ------
    KeyError
        If any required keyword argument is missing.
    """
    gc.collect()

    # Optimizers to compare and the kwargs key holding each one's iteration budgets
    algorithms = ['random_hill_climb', 'simulated_annealing', 'genetic_alg', 'gradient_descent']
    max_iters = ['rhc_max_iters', 'sa_max_iters', 'ga_max_iters', 'gd_max_iters']

    # Fail before any training starts instead of part-way through the sweep.
    required = max_iters + ['init_temp', 'exp_decay_rate', 'min_temp', 'pop_size', 'mutation_prob']
    missing = [key for key in required if key not in kwargs]
    if missing:
        raise KeyError('missing required keyword argument(s): ' + ', '.join(missing))

    # The spellings 'Itterations' and 'val_acccuracy' are kept on purpose:
    # they are the column names of the returned and exported table.
    columns = ['i', 'algorithm', 'random_seed', 'Itterations', 'train_accuracy',
               'val_acccuracy', 'train_times', 'test_accuracy', 'test_F1']

    # Exponential decay schedule handed to every network as `schedule`
    exp_decay = ExpDecay(init_temp=kwargs['init_temp'],
                         exp_const=kwargs['exp_decay_rate'],
                         min_temp=kwargs['min_temp'])

    # Rows are collected in a list and turned into a DataFrame on demand;
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []

    for i, algorithm in enumerate(algorithms):
        gc.collect()
        print('\nAlgorithm = {}'.format(algorithm))

        for random_seed in random_seeds:
            gc.collect()

            # Single stratified 80/20 train/validation split for this seed
            x_train_fold, x_val_fold, y_train_fold, y_val_fold = train_test_split(
                trainX, trainY, test_size=0.2, shuffle=True,
                random_state=random_seed, stratify=trainY)

            for max_iter in kwargs[max_iters[i]]:

                # max_attempts equals max_iters, so the attempt limit cannot
                # end the optimization before the iteration budget is used up.
                nn = NeuralNetwork(hidden_nodes=[32,], activation='relu',
                                   algorithm=algorithm, max_iters=int(max_iter),
                                   bias=True, is_classifier=True, learning_rate=0.001,
                                   early_stopping=False, clip_max=1e10, schedule=exp_decay,
                                   pop_size=kwargs['pop_size'], mutation_prob=kwargs['mutation_prob'],
                                   max_attempts=int(max_iter), random_state=random_seed, curve=True)

                # Wall-clock fitting time in seconds
                start_time = time.perf_counter()
                nn.fit(x_train_fold, y_train_fold)
                train_times = time.perf_counter() - start_time

                # Predict once per split; the test prediction feeds both test metrics.
                train_pred = pd.DataFrame(nn.predict(x_train_fold), index=y_train_fold.index)
                val_pred = pd.DataFrame(nn.predict(x_val_fold), index=y_val_fold.index)
                test_pred = pd.DataFrame(nn.predict(testX), index=testY.index)

                train_accuracy = accuracy_score(y_train_fold, train_pred)
                val_acccuracy = accuracy_score(y_val_fold, val_pred)
                test_accuracy = accuracy_score(testY, test_pred)
                test_f1_score = f1_score(testY, test_pred)

                # These values are accuracies, so they are labelled as such.
                print('{} - train acc = {:.3f}, val acc = {:.3f}'.format(max_iter, train_accuracy, val_acccuracy))
                records.append({'i': i, 'algorithm': algorithm, 'random_seed': random_seed,
                                'Itterations': max_iter, 'train_accuracy': train_accuracy,
                                'val_acccuracy': val_acccuracy, 'train_times': train_times,
                                'test_accuracy': test_accuracy, 'test_F1': test_f1_score})

            # Show the cumulative results after each seed finishes.
            display(pd.DataFrame(records, columns=columns))

    return pd.DataFrame(records, columns=columns)
        

In [42]:
def neural_network(trainX, testX, trainY, testY, iterations=None, random_seeds=None):
    """Run the optimizer comparison with this notebook's default settings.

    Thin wrapper around ``plot_nn_perf`` that uses the same iteration budgets
    for all four algorithms and fixed simulated-annealing / genetic-algorithm
    hyperparameters.

    Parameters
    ----------
    trainX, testX, trainY, testY
        Forwarded unchanged to ``plot_nn_perf``.
    iterations : array-like of int, optional
        Iteration budgets to evaluate. Defaults to
        ``np.arange(1000, 10001, 5000)`` (i.e. 1000 and 6000). A finer sweep
        such as ``np.arange(1000, 10001, 1000)`` can be passed instead.
    random_seeds : array-like of int, optional
        Seeds to evaluate. Defaults to ``np.array([7, 123])``; a larger set
        such as ``[7, 123, 10000, 99999]`` can be passed instead.

    Returns
    -------
    The value returned by ``plot_nn_perf`` (its results table), so callers can
    inspect or export it.
    """
    # None sentinels avoid sharing one default array between calls.
    if iterations is None:
        iterations = np.arange(1000, 10001, 5000)
    if random_seeds is None:
        random_seeds = np.array([7, 123])

    # Return the results: callers assign this value and export it.
    return plot_nn_perf(trainX, testX, trainY, testY,
                        random_seeds=random_seeds,
                        rhc_max_iters=iterations, sa_max_iters=iterations,
                        ga_max_iters=iterations, gd_max_iters=iterations,
                        init_temp=100, exp_decay_rate=0.1, min_temp=0.001,
                        pop_size=100, mutation_prob=0.2)

In [29]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# --- Load and clean the bank-churn data --------------------------------------
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data = pd.read_csv("C:/Users/sagni/Documents/Personal Files/CS7641/CS7641/Data/Bank_churn/train.csv", index_col=0)
data = data.drop(['Surname','CustomerId'], axis=1)
data = data.astype({'HasCrCard': int, 'Age': int, 'IsActiveMember': int})

# --- One-hot encode the categorical columns ----------------------------------
transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),['Geography','Gender']))
transformed = transformer.fit_transform(data.drop(['Exited'], axis=1))
print(transformed.dtype)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old method otherwise.
# The generated column labels differ between the two (e.g. "x0_France" versus
# "Geography_France"), but the columns are not referenced by name later in this cell.
if hasattr(transformer, 'get_feature_names_out'):
    encoded_columns = transformer.get_feature_names_out()
else:
    encoded_columns = transformer.get_feature_names()

# The encoder returns a plain array, so the original row index is re-attached here.
transformed_df = pd.DataFrame(transformed, columns=encoded_columns, index=data.index)

# Append the encoded columns and drop the categorical originals.
data = pd.concat([data, transformed_df], axis=1)
data = data.drop(['Geography','Gender'], axis=1)
# A bare `data.head()` in the middle of a cell renders nothing; display it explicitly.
display(data.head())

# --- Train / test split --------------------------------------------------------
# test_size=0.9 keeps 10% of the rows for training.
trainX, trainY, testX, testY = stratified_sampling(data, 'Exited', 0.9, 903949505)

# Re-join the held-out features and labels, then keep a stratified half of them
# as the test set; the other half is discarded.
data = pd.concat([testX, testY], axis=1)
testX, testY, ignoreX, ignoreY = stratified_sampling(data, 'Exited', 0.5, 903949505)
del ignoreX, ignoreY, data

# --- Scale features to [0, 1] --------------------------------------------------
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(trainX)
trainX = pd.DataFrame(scaler.transform(trainX), columns=trainX.columns, index=trainX.index)
testX = pd.DataFrame(scaler.transform(testX), columns=testX.columns, index=testX.index)
display(trainX.head())
display(testX.head())

float64
[1m[4m[36mOriginal distribution[0m


Unnamed: 0,counts,%
0,130113,78.840118
1,34921,21.159882




[1m[4m[93mTrain distribution[0m


Unnamed: 0,counts,%
0,13011,78.840211
1,3492,21.159789




[1m[4m[91mTest distribution[0m


Unnamed: 0,counts,%
0,117102,78.840107
1,31429,21.159893




[1m[4m[94mTrain_x shape[0m
Number of columns are [1m16503[0m and number of rows are [1m13[0m


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3638,669,32,7,0.0,2,1,0,93982.02,1.0,0.0,0.0,0.0,1.0
159233,672,29,9,0.0,2,1,1,134794.02,1.0,0.0,0.0,1.0,0.0
38435,644,32,2,0.0,2,1,0,103932.38,1.0,0.0,0.0,0.0,1.0
144063,724,24,7,142755.25,1,0,0,34231.48,0.0,1.0,0.0,0.0,1.0
52235,572,39,8,89047.74,1,1,1,178373.87,1.0,0.0,0.0,0.0,1.0




[1m[4m[94mTrain_y shape[0m
Number of columns are [1m16503[0m and number of rows are [1m1[0m


Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
3638,0
159233,0
38435,0
144063,0
52235,0




[1m[4m[94mTrain_x shape[0m
Number of columns are [1m148531[0m and number of rows are [1m13[0m


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
65509,632,31,7,111388.18,1,0,1,173498.45,0.0,1.0,0.0,1.0,0.0
20875,641,34,8,0.0,2,0,0,124615.59,1.0,0.0,0.0,0.0,1.0
53648,726,63,6,123948.85,1,0,0,145560.38,1.0,0.0,0.0,1.0,0.0
40310,671,29,3,105229.53,1,1,1,131804.86,0.0,0.0,1.0,0.0,1.0
60496,711,40,7,0.0,2,1,0,180829.87,0.0,0.0,1.0,0.0,1.0




[1m[4m[94mTrain_y shape[0m
Number of columns are [1m148531[0m and number of rows are [1m1[0m


Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
65509,0
20875,0
53648,1
40310,0
60496,0


[1m[4m[36mOriginal distribution[0m


Unnamed: 0,counts,%
0,117102,78.840107
1,31429,21.159893




[1m[4m[93mTrain distribution[0m


Unnamed: 0,counts,%
0,58551,78.840638
1,15714,21.159362




[1m[4m[91mTest distribution[0m


Unnamed: 0,counts,%
0,58551,78.839577
1,15715,21.160423




[1m[4m[94mTrain_x shape[0m
Number of columns are [1m74265[0m and number of rows are [1m13[0m


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
121133,675,29,6,121063.57,2,0,1,102076.92,0.0,1.0,0.0,0.0,1.0
39290,569,33,3,153058.1,1,1,1,102625.08,0.0,1.0,0.0,1.0,0.0
156352,629,32,7,137781.65,1,0,1,153921.32,1.0,0.0,0.0,1.0,0.0
77927,721,31,3,0.0,2,1,0,121151.1,1.0,0.0,0.0,1.0,0.0
93794,642,33,2,0.0,2,1,0,131736.23,1.0,0.0,0.0,1.0,0.0




[1m[4m[94mTrain_y shape[0m
Number of columns are [1m74265[0m and number of rows are [1m1[0m


Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
121133,0
39290,1
156352,0
77927,0
93794,0




[1m[4m[94mTrain_x shape[0m
Number of columns are [1m74266[0m and number of rows are [1m13[0m


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
136760,706,37,9,0.0,1,0,1,159919.15,1.0,0.0,0.0,1.0,0.0
24160,745,36,2,114370.43,1,1,0,76582.95,0.0,1.0,0.0,0.0,1.0
52087,815,35,4,137455.99,1,1,1,184178.29,0.0,1.0,0.0,0.0,1.0
97925,802,29,6,0.0,2,0,0,166957.82,1.0,0.0,0.0,0.0,1.0
17721,646,41,4,126273.95,1,1,1,70400.86,0.0,1.0,0.0,1.0,0.0




[1m[4m[94mTrain_y shape[0m
Number of columns are [1m74266[0m and number of rows are [1m1[0m


Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
136760,1
24160,1
52087,1
97925,0
17721,1


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3638,0.613248,0.222222,0.7,0.0,0.333333,1.0,0.0,0.469689,1.0,0.0,0.0,0.0,1.0
159233,0.619658,0.174603,0.9,0.0,0.333333,1.0,1.0,0.673849,1.0,0.0,0.0,1.0,0.0
38435,0.559829,0.222222,0.2,0.0,0.333333,1.0,0.0,0.519465,1.0,0.0,0.0,0.0,1.0
144063,0.730769,0.095238,0.7,0.598837,0.0,0.0,0.0,0.17079,0.0,1.0,0.0,0.0,1.0
52235,0.405983,0.333333,0.8,0.373542,0.0,1.0,1.0,0.891854,1.0,0.0,0.0,0.0,1.0


Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,onehotencoder__x0_France,onehotencoder__x0_Germany,onehotencoder__x0_Spain,onehotencoder__x1_Female,onehotencoder__x1_Male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
121133,0.626068,0.174603,0.6,0.507843,0.333333,0.0,1.0,0.510183,0.0,1.0,0.0,0.0,1.0
39290,0.399573,0.238095,0.3,0.642056,0.0,1.0,1.0,0.512925,0.0,1.0,0.0,1.0,0.0
156352,0.527778,0.222222,0.7,0.577973,0.0,0.0,1.0,0.769532,1.0,0.0,0.0,1.0,0.0
77927,0.724359,0.206349,0.3,0.0,0.333333,1.0,0.0,0.605601,1.0,0.0,0.0,1.0,0.0
93794,0.555556,0.238095,0.2,0.0,0.333333,1.0,0.0,0.658552,1.0,0.0,0.0,1.0,0.0


In [None]:
# Run the optimizer comparison on the current (bank-churn) train/test split
# and write the resulting table to an Excel workbook, without the row index.
output_path = r'C:\Users\sagni\Documents\Personal Files\CS7641\Assignment2\NN_Images\Output_BankChurn.xlsx'
Output = neural_network(trainX=trainX, testX=testX, trainY=trainY, testY=testY)
Output.to_excel(output_path, index=False)


Algorithm = random_hill_climb
1000 - train loss = 0.788, val loss = 0.788
6000 - train loss = 0.773, val loss = 0.775


Unnamed: 0,i,algorithm,random_seed,Itterations,train_accuracy,val_acccuracy,train_times,test_accuracy,test_F1
0,0,random_hill_climb,7,1000,0.78829,0.788246,14.508155,0.788393,0.000254
1,0,random_hill_climb,7,6000,0.773216,0.774614,105.532701,0.771992,0.010056


1000 - train loss = 0.788, val loss = 0.788
6000 - train loss = 0.212, val loss = 0.211


Unnamed: 0,i,algorithm,random_seed,Itterations,train_accuracy,val_acccuracy,train_times,test_accuracy,test_F1
0,0,random_hill_climb,7,1000,0.78829,0.788246,14.508155,0.788393,0.000254
1,0,random_hill_climb,7,6000,0.773216,0.774614,105.532701,0.771992,0.010056
2,0,random_hill_climb,123,1000,0.787532,0.787943,17.496232,0.788218,0.005815
3,0,random_hill_climb,123,6000,0.211635,0.211451,94.878971,0.211594,0.349281



Algorithm = simulated_annealing
1000 - train loss = 0.425, val loss = 0.428
6000 - train loss = 0.788, val loss = 0.787


Unnamed: 0,i,algorithm,random_seed,Itterations,train_accuracy,val_acccuracy,train_times,test_accuracy,test_F1
0,0,random_hill_climb,7,1000,0.78829,0.788246,14.508155,0.788393,0.000254
1,0,random_hill_climb,7,6000,0.773216,0.774614,105.532701,0.771992,0.010056
2,0,random_hill_climb,123,1000,0.787532,0.787943,17.496232,0.788218,0.005815
3,0,random_hill_climb,123,6000,0.211635,0.211451,94.878971,0.211594,0.349281
4,1,simulated_annealing,7,1000,0.424633,0.427749,23.401599,0.430526,0.377711
5,1,simulated_annealing,7,6000,0.788062,0.787337,139.08611,0.788258,0.001651


1000 - train loss = 0.788, val loss = 0.789
6000 - train loss = 0.426, val loss = 0.435


Unnamed: 0,i,algorithm,random_seed,Itterations,train_accuracy,val_acccuracy,train_times,test_accuracy,test_F1
0,0,random_hill_climb,7,1000,0.78829,0.788246,14.508155,0.788393,0.000254
1,0,random_hill_climb,7,6000,0.773216,0.774614,105.532701,0.771992,0.010056
2,0,random_hill_climb,123,1000,0.787532,0.787943,17.496232,0.788218,0.005815
3,0,random_hill_climb,123,6000,0.211635,0.211451,94.878971,0.211594,0.349281
4,1,simulated_annealing,7,1000,0.424633,0.427749,23.401599,0.430526,0.377711
5,1,simulated_annealing,7,6000,0.788062,0.787337,139.08611,0.788258,0.001651
6,1,simulated_annealing,123,1000,0.788365,0.788549,21.573902,0.788406,0.0
7,1,simulated_annealing,123,6000,0.425769,0.435323,137.159869,0.431886,0.281011



Algorithm = genetic_alg
1000 - train loss = 0.816, val loss = 0.818


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Load the self-generated dataset and hold out 40% of it, stratified on 'target'.
data = pd.read_csv("C:/Users/sagni/Documents/Personal Files/CS7641/CS7641/Data/Self_generated/Generated_data.csv")
trainX, trainY, testX, testY = stratified_sampling(data, 'target', 0.4, 903949505)

# Fit the min-max scaler on the training features only, then rescale both
# splits with it while keeping their column labels and row index.
scaler = MinMaxScaler().fit(trainX)
trainX, testX = (
    pd.DataFrame(scaler.transform(split), columns=split.columns, index=split.index)
    for split in (trainX, testX)
)
display(trainX.head(), testX.head())

In [None]:
# Run the optimizer comparison on the current (self-generated) train/test split
# and write the resulting table to an Excel workbook, without the row index.
output_path = r'C:\Users\sagni\Documents\Personal Files\CS7641\Assignment2\NN_Images\Output_SelfGenerated.xlsx'
Output = neural_network(trainX=trainX, testX=testX, trainY=trainY, testY=testY)
Output.to_excel(output_path, index=False)