# Machine learning and deep learning methods applied to predicting customer status

Purpose: Predict active and inactive clients using both the proposed data structure and the naive data structure. Here, we implement Support Vector Machines (SVM), Deep Neural Networks (DNN), Random Forest (RF), K-Nearest Neighbours (KNN) and Lasso.

Authors: Gabriel Rodrigues Palma and Rafael de Andrade Moral

# Packages used in the project

In [159]:
# visualisation modules
import matplotlib.pyplot as plt

# Data manipulation modules
import numpy as np
import pandas as pd

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Deep learning modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Machine learning packages
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import collections
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelBinarizer

# Additional packages
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Conv2DTranspose, UpSampling2D, Flatten, Reshape
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import L1
from keras import metrics
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
import tensorflow as tf

# Testing GPU from MacOs: print how many physical GPU devices TensorFlow lists
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


# Functions used in the project

In [157]:
def create_dataset(path):
    """Read a CSV file and prepare it for the ML and DL methods.

    The file must contain an ``'Unnamed: 0'`` column, which is dropped, and
    a ``'status'`` column, which is used as the response. Every remaining
    column is treated as an explanatory variable.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(explanatory_variables, hot_encode_response_variable,
        response_variables)`` where

        * ``explanatory_variables`` are the features rescaled with
          ``MinMaxScaler``;
        * ``hot_encode_response_variable`` is the dense one-hot encoding of
          the binarized response;
        * ``response_variables`` is the response after ``LabelBinarizer``.
    """
    data = pd.read_csv(path)
    data = data.drop(columns='Unnamed: 0')
    explanatory_variables = data.drop(columns='status')
    response_variables = LabelBinarizer().fit_transform(data['status'])

    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and later
    # releases removed the old keyword, so try the new spelling first and
    # fall back for older installations.
    try:
        onehot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        onehot_encoder = OneHotEncoder(sparse=False)
    hot_encode_response_variable = onehot_encoder.fit_transform(
        np.array(response_variables).reshape(-1, 1)
    )

    # Scaling the variables
    scaler = MinMaxScaler()
    explanatory_variables = scaler.fit_transform(explanatory_variables)

    return (explanatory_variables, hot_encode_response_variable, response_variables)

def check_zero_division_and_get_rates(cm):
    """Compute the true and false positive rates from a confusion matrix.

    For each rate the entry in column ``1`` is divided by the total of its
    row: row ``1`` gives the true positive rate and row ``0`` the false
    positive rate. If any row total is zero, a tiny constant is added to the
    denominators so that the division is always defined.

    Returns ``(tpr, fpr)``; a NaN true positive rate is reported as ``0``.
    """
    row_totals = np.sum(cm, axis=1)
    positive_column = cm[:, 1]

    if any(row_totals == 0):
        # At least one class has no samples: guard the denominators.
        epsilon = 1e-16
        tpr = positive_column[1] / (row_totals[1] + epsilon)
        fpr = positive_column[0] / (row_totals[0] + epsilon)
    else:
        rates = positive_column / row_totals
        tpr, fpr = rates[1], rates[0]

    tpr = 0 if np.isnan(tpr) else tpr

    return (tpr, fpr)

def check_and_compute_rates(predictions, classes, cm):
    """Return ``(tpr, fpr)``, short-circuiting the degenerate cases.

    Two situations are answered directly, without looking at ``cm``:

    * no positive prediction and no positive class -> ``(0, 0)``;
    * only positive predictions and only positive classes -> ``(1, 0)``.

    Every other case is delegated to ``check_zero_division_and_get_rates``
    with the confusion matrix ``cm``.
    """
    nothing_positive = sum(predictions) == 0 and sum(classes) == 0
    if nothing_positive:
        return (0, 0)

    everything_positive = (sum(predictions) == len(predictions)
                           and sum(classes) == len(classes))
    if everything_positive:
        return (1, 0)

    true_positive_rate, false_positive_rate = check_zero_division_and_get_rates(cm)
    return (true_positive_rate, false_positive_rate)
    
def get_rates(y_pred, y_true):
    """Return the true and false positive rates of a set of predictions.

    The confusion matrix of ``y_true`` against ``y_pred`` is built with
    scikit-learn and handed, together with both label vectors, to
    ``check_and_compute_rates``. The result is the tuple ``(tpr, fpr)``.
    """
    confusion = confusion_matrix(y_true=y_true, y_pred=y_pred)
    true_positive_rate, false_positive_rate = check_and_compute_rates(
        y_pred, y_true, confusion
    )
    return (true_positive_rate, false_positive_rate)

def get_rates_by_cross_validation(raw_data):
    """Average the true and false positive rates over a 5-fold cross validation.

    NOTE(review): this function reads ``patterns``, ``classes``,
    ``pbp_prediction``, ``clustered_patterns``, ``d_base``, ``alpha`` and
    ``prediction`` from the enclosing scope. None of them is defined in the
    visible part of this file, so they must exist before it is called.
    ``raw_data`` is accepted for interface compatibility but is not used.

    Returns
    -------
    tuple
        ``(mean_tpr, mean_fpr)`` across the folds, ignoring NaN entries.
    """
    # Per-fold accumulators (the original appended to undefined names).
    tpr = []
    fpr = []

    for train_index, test_index in KFold(n_splits=5, shuffle=True).split(patterns):
        # Only the held-out fold is needed: the predictor below is not
        # re-fitted on the training indices.
        x_test = patterns[test_index]
        y_test = classes[test_index]

        pbp_predictions = pbp_prediction(patterns_array=x_test,
                                         clustered_patterns=clustered_patterns,
                                         d_base=d_base, alpha=alpha,
                                         outbreak_p_means=prediction.obtain_p_means_with_distance,
                                         outbreak_prediction=prediction.predict_with_distance)
        # get_rates takes ``y_pred``/``y_true`` and returns (tpr, fpr).
        fold_tpr, fold_fpr = get_rates(y_pred=pbp_predictions, y_true=y_test)
        tpr.append(fold_tpr)
        fpr.append(fold_fpr)

    # ``x != np.nan`` is always True, so it cannot filter NaNs; nanmean
    # skips them explicitly.
    return (np.nanmean(tpr), np.nanmean(fpr))

def get_statistics(y_pred, y_true):
    """Summarise predictions as ``(accuracy, tpr, fpr)``.

    Accuracy comes from scikit-learn's ``accuracy_score``; the true and
    false positive rates come from ``get_rates``.
    """
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    true_positive_rate, false_positive_rate = get_rates(y_pred=y_pred,
                                                        y_true=y_true)
    return (accuracy, true_positive_rate, false_positive_rate)

def get_methods_performance(X_train, X_test,
                            y_train, y_test,
                            raw_data):
    """Fit each selected model and measure its performance on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Explanatory variables of the training and test splits.
    y_train, y_test : array-like
        Response of the training and test splits. ``y_train`` may be a
        column vector; it is flattened before fitting the classifiers.
    raw_data : bool
        Selects which pre-trained network is loaded:
        ``'MainFunctions/DNN_raw.h5'`` when true, otherwise
        ``'MainFunctions/DNN_model_params.h5'``.

    Returns
    -------
    tuple
        Six ``(accuracy, tpr, fpr)`` tuples, in the order polynomial SVM,
        non-linear SVM, random forest, KNN, Lasso and DNN.
    """
    # scikit-learn classifiers expect a 1-D target; passing a column vector
    # makes each ``fit`` emit a DataConversionWarning.
    y_train_1d = np.ravel(y_train)

    # Support Vector Machine -----
    ## Non-linear
    nonlinear_svm = svm.NuSVC(gamma="auto")
    nonlinear_svm.fit(X_train, y_train_1d)
    nonlinear_svm_predictions = nonlinear_svm.predict(X_test)

    ## Polynomial
    polinomial_svm = svm.SVC(kernel='poly', degree=3)
    polinomial_svm.fit(X_train, y_train_1d)
    polinomial_svm_predictions = polinomial_svm.predict(X_test)

    # Random forest -----
    rf = RandomForestClassifier(max_depth=5, random_state=0, n_estimators=1000)
    rf.fit(X_train, y_train_1d)
    random_forest_predictions = rf.predict(X_test)

    # KNN -----
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train, y_train_1d)
    knn_predictions = knn.predict(X_test)

    # Lasso -----
    # Lasso is a regressor, so its continuous output is rounded to obtain a
    # label. NOTE(review): the rounded values are not forced into {0, 1}.
    lasso = Lasso(alpha=0.02)
    lasso.fit(X_train, y_train)
    lasso_predictions = np.round(lasso.predict(X_test))

    # Deep Neural Network -----
    # Only the saved model differs between the two data structures.
    if raw_data:
        dnn_path = 'MainFunctions/DNN_raw.h5'
    else:
        dnn_path = 'MainFunctions/DNN_model_params.h5'
    dnn = keras.models.load_model(dnn_path)
    dnn_predictions = np.round(dnn.predict(X_test))

    # Obtaining statistics -----
    polynomial_svm_performance = get_statistics(y_pred=polinomial_svm_predictions,
                                                y_true=y_test)
    nonlinear_svm_performance = get_statistics(y_pred=nonlinear_svm_predictions,
                                               y_true=y_test)
    random_forest_performance = get_statistics(y_pred=random_forest_predictions,
                                               y_true=y_test)
    knn_performance = get_statistics(y_pred=knn_predictions,
                                     y_true=y_test)
    lasso_performance = get_statistics(y_pred=lasso_predictions,
                                       y_true=y_test)
    dnn_performance = get_statistics(y_pred=dnn_predictions,
                                     y_true=y_test)

    return (polynomial_svm_performance, nonlinear_svm_performance,
            random_forest_performance, knn_performance,
            lasso_performance, dnn_performance)
    
def get_results_data(explanatory_variables,
                     response_variables,
                     test_size, raw_data):
    """Tabulate the performance of the ML and DL methods on client status.

    The data are split once with ``train_test_split`` (``random_state=42``),
    every model is evaluated through ``get_methods_performance``, and the
    results are returned as a DataFrame with one row per statistic
    (accuracy, true positive rate, false positive rate) and one column per
    model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        explanatory_variables,
        response_variables,
        test_size=test_size,
        random_state=42,
    )

    # Obtaining methods performance
    (polynomial_svm, nonlinear_svm, random_forest,
     knn, lasso, dnn) = get_methods_performance(X_train, X_test,
                                                y_train, y_test,
                                                raw_data)

    # Column labels paired with their (accuracy, tpr, fpr) results, in the
    # order the columns appear in the table.
    labelled_performance = (
        ('Polinomial SVM', polynomial_svm),
        ('Non linear SVM', nonlinear_svm),
        ('Random Forest', random_forest),
        ('KNN', knn),
        ('Lasso', lasso),
        ('DNN', dnn),
    )

    table = {'Statistics': ['Accuracy',
                            'True Positive Rate',
                            'False Positive Rate']}
    for label, performance in labelled_performance:
        table[label] = [performance[0], performance[1], performance[2]]

    return pd.DataFrame(table)
    

# Importing Data sets

## Proposed data structure

In [139]:
# Load the proposed-data-structure CSV; create_dataset returns the scaled
# features, the one-hot encoded response and the binarized response.
explanatory_variables, hot_encode_response_variable, response_variables = create_dataset('../input_data/customer_profiles_from_model.csv')

In [134]:
# Display the binarized response flattened to one dimension
response_variables.ravel()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [153]:
# Evaluate every model on the proposed data structure with a 30% test split;
# raw_data = False loads the 'MainFunctions/DNN_model_params.h5' network.
get_results_data(explanatory_variables = explanatory_variables,
                 response_variables = response_variables,
                 test_size = 0.3,
                 raw_data = False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  rf.fit(X_train, y_train)
  return self._fit(X, y)
  return self._fit(X, y)




2022-07-25 13:37:59.262829: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso,DNN
0,Accuracy,0.916667,0.75,0.666667,0.75,0.666667,0.833333
1,True Positive Rate,1.0,0.8,1.0,0.6,0.8,1.0
2,False Positive Rate,0.142857,0.285714,0.571429,0.142857,0.428571,0.285714


In [158]:
# Re-run of the proposed-data-structure evaluation with the same arguments
# (30% test split, raw_data = False).
get_results_data(explanatory_variables = explanatory_variables,
                 response_variables = response_variables,
                 test_size = 0.3,
                 raw_data = False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  rf.fit(X_train, y_train)
  return self._fit(X, y)
2022-07-25 13:45:39.924000: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso,DNN
0,Accuracy,0.916667,0.75,0.666667,0.75,0.666667,0.833333
1,True Positive Rate,1.0,0.8,1.0,0.6,0.8,1.0
2,False Positive Rate,0.142857,0.285714,0.571429,0.142857,0.428571,0.285714


## Naive data structure

In [112]:
# Load the naive-data-structure CSV; create_dataset returns the scaled
# features, the one-hot encoded response and the binarized response.
naive_explanatory_variables, naive_hot_encode_response_variable, naive_response_variables = create_dataset('../input_data/raw_data_naive.csv')

In [154]:
# Evaluate every model on the naive data structure with a 30% test split;
# raw_data = True loads the 'MainFunctions/DNN_raw.h5' network.
get_results_data(explanatory_variables = naive_explanatory_variables,
                 response_variables = naive_response_variables,
                 test_size = 0.3,
                 raw_data = True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  rf.fit(X_train, y_train)
  return self._fit(X, y)
  return self._fit(X, y)




2022-07-25 13:38:04.679819: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Unnamed: 0,Statistics,Polinomial SVM,Non linear SVM,Random Forest,KNN,Lasso,DNN
0,Accuracy,0.416667,0.583333,0.25,0.333333,0.5,0.416667
1,True Positive Rate,1.0,0.4,0.4,0.8,1.0,1.0
2,False Positive Rate,1.0,0.285714,0.857143,1.0,0.857143,1.0
