## Generate predictions on the test data set

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, cohen_kappa_score
%matplotlib inline

# Human-readable class labels; their order matches the integer codes that are
# mapped to strings further down in this notebook (0 -> "A", 1 -> "B", 2 -> "None").
class_names = ["A", "B", "None"]

In [2]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the y axis ("True label") and
        columns along the x axis ("Predicted label").
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        When True every row is divided by its row sum before plotting and the
        cells are annotated with two decimals; otherwise the cells are
        annotated as integers.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The plot is drawn with the pyplot state machine.
    """
    cm = np.asarray(cm)
    if normalize:
        # Rows without any sample would cause a division by zero (NaN cells and
        # a NaN colour threshold); such rows are left at 0 instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so the axis labels are taken into account as well.
    plt.tight_layout()

### Load Scikit Learn Model

In [3]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open("models/adaboost_kappa_0_2836.pickle.dat", "rb") as model_file:
    model = pickle.load(model_file)



### Load Keras Model

In [4]:
# the model must be in the models folder
# model_to_use = "model-004-0.82.h5"
# model_to_use = "NN_kappa_0_2897.h5"

# from keras.models import load_model
# model = load_model('models/'+model_to_use)

In [5]:
# Read the customer file once. `df` is transformed into model features below,
# while `df_copy` keeps the untouched rows so the predictions can be attached
# to the original columns later on.
df = pd.read_csv("original_data/custdatabase.csv")
df_copy = df.copy()
print('Test dataset shape {0}, {1}'.format(df.shape[0], df.shape[1]))

Test dataset shape 4000, 10


In [6]:
# Remove leading/trailing whitespace from every column name (a no-op when the
# header is already clean), then show the resulting names.
df = df.rename(columns=str.strip)
df.columns.to_numpy()

array(['index', 'sex', 'mstatus', 'age', 'children', 'occupation',
       'education', 'income', 'avbal', 'avtrans'], dtype=object)

In [7]:
categorical_features = ["sex", "mstatus", "occupation", "education", "children"]

# The number of children is handled as a category, so turn it into strings
# before encoding it like the other categorical fields.
df["children"] = df["children"].astype(str)

# Label encoding: one integer-coded "<feature>Num" column per categorical feature.
# NOTE(review): the encoders are fitted on the test data itself, so the integer
# codes only agree with the ones seen during training if both files contain the
# same set of category values - TODO confirm, or reuse the training encoders.
label_encoders = {}
label_mappings = {}
for categorical_feature in categorical_features:
    encoder = preprocessing.LabelEncoder()
    df[categorical_feature + "Num"] = encoder.fit_transform(df[categorical_feature])
    label_encoders[categorical_feature] = encoder
    label_mappings[categorical_feature] = encoder.classes_

# One-hot encoding: one 0/1 "<feature>_<value>" column per category value.
for categorical_feature in categorical_features:
    for class_value in label_mappings[categorical_feature]:
        indicator_column = categorical_feature + "_" + class_value
        df[indicator_column] = (df[categorical_feature] == class_value).astype(int)

# drop the categorical values
df = df.drop(categorical_features, axis=1)

df.head()

Unnamed: 0,index,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,1001,44.27,10245.73,53183.04,4069.67,1,0,5,0,2,...,0,1,0,0,0,0,0,1,0,0
1,1002,61.9,1942.57,18100.78,1522.42,1,1,5,2,1,...,0,0,0,1,0,0,1,0,0,0
2,1003,37.3,9896.16,24496.82,2723.35,1,0,0,3,1,...,0,0,0,0,1,0,1,0,0,0
3,1004,25.02,10136.44,26690.01,4143.04,0,0,5,0,1,...,0,1,0,0,0,0,1,0,0,0
4,1005,48.37,2521.11,13439.81,240.26,0,0,4,2,2,...,0,0,0,1,0,0,0,1,0,0


### Remove the "Num" columns only if required

In [8]:
# Switch between the two kinds of model handled by this notebook.
# When True, the label-encoded "*Num" columns are dropped, the whole frame is
# fed to the model and the class is taken as the argmax of the model output.
# When False, only the scaled numeric and "*Num" columns are fed to the model.
isNeuralNetwork = False

In [9]:
if not isNeuralNetwork:
    print("Not dropping the num columns")
else:
    # The label-encoded columns are removed for the neural network.
    num_columns = [
        prefix + "Num"
        for prefix in ("sex", "mstatus", "occupation", "education", "children")
    ]
    df = df.drop(columns=num_columns)
df.head()

Not dropping the num columns


Unnamed: 0,index,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,1001,44.27,10245.73,53183.04,4069.67,1,0,5,0,2,...,0,1,0,0,0,0,0,1,0,0
1,1002,61.9,1942.57,18100.78,1522.42,1,1,5,2,1,...,0,0,0,1,0,0,1,0,0,0
2,1003,37.3,9896.16,24496.82,2723.35,1,0,0,3,1,...,0,0,0,0,1,0,1,0,0,0
3,1004,25.02,10136.44,26690.01,4143.04,0,0,5,0,1,...,0,1,0,0,0,0,1,0,0,0
4,1005,48.37,2521.11,13439.81,240.26,0,0,4,2,2,...,0,0,0,1,0,0,0,1,0,0


## Get the normalizers based on the train data

In [10]:
train_df = pd.read_csv("working_data/trial_promo_training.csv")
features_to_scale = ["age", "income", "avbal", "avtrans"]
min_max_scaler = preprocessing.MinMaxScaler()

# Fit the scaler on the train data only. `fit` returns the scaler itself, so
# its result must not be written back into `train_df` (that would replace the
# training values with the scaler object).
min_max_scaler.fit(train_df[features_to_scale])

# Transform the test data with the ranges learned from the train data.
df[features_to_scale] = min_max_scaler.transform(df[features_to_scale])
df.head()

Unnamed: 0,index,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,1001,0.364876,0.510499,0.675107,0.497236,1,0,5,0,2,...,0,1,0,0,0,0,0,1,0,0
1,1002,0.610214,0.081311,0.222841,0.182472,1,1,5,2,1,...,0,0,0,1,0,0,1,0,0,0
2,1003,0.267882,0.49243,0.305296,0.330871,1,0,0,3,1,...,0,0,0,0,1,0,1,0,0,0
3,1004,0.096994,0.50485,0.33357,0.506303,0,0,5,0,1,...,0,1,0,0,0,0,1,0,0,0
4,1005,0.421932,0.111215,0.162754,0.024034,0,0,4,2,2,...,0,0,0,1,0,0,0,1,0,0


In [11]:
# Keep the customer identifiers aside and remove them from the feature frame.
customer_ids = df.loc[:, "index"]
df = df.drop(columns=["index"])
df.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,sex_F,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,0.364876,0.510499,0.675107,0.497236,1,0,5,0,2,0,...,0,1,0,0,0,0,0,1,0,0
1,0.610214,0.081311,0.222841,0.182472,1,1,5,2,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0.267882,0.49243,0.305296,0.330871,1,0,0,3,1,0,...,0,0,0,0,1,0,1,0,0,0
3,0.096994,0.50485,0.33357,0.506303,0,0,5,0,1,1,...,0,1,0,0,0,0,1,0,0,0
4,0.421932,0.111215,0.162754,0.024034,0,0,4,2,2,1,...,0,0,0,1,0,0,0,1,0,0


In [12]:
# Show the columns left in the feature frame after encoding, scaling and
# removing the customer identifier.
df.columns

Index(['age', 'income', 'avbal', 'avtrans', 'sexNum', 'mstatusNum',
       'occupationNum', 'educationNum', 'childrenNum', 'sex_F', 'sex_M',
       'mstatus_divorced', 'mstatus_married', 'mstatus_single',
       'mstatus_widowed', 'occupation_IT', 'occupation_construct',
       'occupation_education', 'occupation_finance', 'occupation_government',
       'occupation_legal', 'occupation_manuf', 'occupation_medicine',
       'occupation_retired', 'education_postgrad', 'education_professional',
       'education_secondary', 'education_tertiary', 'children_0', 'children_1',
       'children_2', 'children_3', 'children_4'],
      dtype='object')

## Perform the predictions using the model

In [13]:
if isNeuralNetwork:
    # The neural network receives every remaining column.
    X_test = df
else:
    # The scikit-learn model receives the scaled numeric columns and the
    # label-encoded categorical columns.
    feature_columns = ["age", "income", "avbal", "avtrans", "sexNum", "mstatusNum", "occupationNum", "educationNum", "childrenNum"]
    X_test = df.loc[:, feature_columns]

# Only the last top-level expression of a cell is displayed, so the preview
# has to live outside the if/else to actually show up.
X_test.head(5)

In [14]:
model_predictions = model.predict(X_test)
if isNeuralNetwork:
    # The network output has one column per class; keep the position of the largest value.
    model_predictions = np.argmax(model_predictions, axis=1)

# Attach the predicted class code to the untouched copy of the customer data.
df_copy["status"] = model_predictions
df_copy = df_copy.astype({"status": int})
df_copy.loc[:, "status"].head()

0    2
1    0
2    2
3    1
4    2
Name: status, dtype: int32

In [15]:
# Translate the integer class codes into their labels; any other value is left as is.
# NOTE(review): the label "None" may be parsed as a missing value when the
# exported CSV is read back with pandas' default settings - verify downstream.
status_labels = {0: "A", 1: "B", 2: "None"}
df_copy["status"] = df_copy["status"].replace(status_labels)
df_copy.head(50)

Unnamed: 0,index,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,status
0,1001,M,divorced,44.27,2,legal,postgrad,10245.73,53183.04,4069.67,
1,1002,M,married,61.9,1,legal,secondary,1942.57,18100.78,1522.42,A
2,1003,M,divorced,37.3,1,IT,tertiary,9896.16,24496.82,2723.35,
3,1004,F,divorced,25.02,1,legal,postgrad,10136.44,26690.01,4143.04,B
4,1005,F,divorced,48.37,2,government,secondary,2521.11,13439.81,240.26,
5,1006,M,widowed,49.19,3,manuf,secondary,610.07,8027.86,817.78,
6,1007,F,married,66.07,2,retired,tertiary,3153.71,13146.35,1716.52,A
7,1008,F,divorced,65.81,1,retired,postgrad,3656.7,6914.14,2149.3,
8,1009,F,married,44.65,2,IT,tertiary,5825.72,22830.97,1196.25,A
9,1010,F,single,23.16,0,IT,tertiary,2737.99,13859.05,776.99,B


### Merge status predicted by classifier and investment score generated by Fuzzy Rule

In [16]:
# Load the scores and keep only the customer identifier and the score itself.
score_columns = ["index", "score"]
df_investment_score = pd.read_csv("results/predicted_scores.csv")[score_columns]
df_investment_score.head(5)

Unnamed: 0,index,score
0,1001,5.07
1,1002,3.8
2,1003,4.38
3,1004,4.55
4,1005,3.09


In [17]:
# Left-join the scores onto the predictions using the customer identifier as key.
scores_by_customer = df_investment_score.set_index("index")
predictions_by_customer = df_copy.set_index("index")
df_merged = predictions_by_customer.join(scores_by_customer, how="left")
df_merged.head(5)

Unnamed: 0_level_0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,status,score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001,M,divorced,44.27,2,legal,postgrad,10245.73,53183.04,4069.67,,5.07
1002,M,married,61.9,1,legal,secondary,1942.57,18100.78,1522.42,A,3.8
1003,M,divorced,37.3,1,IT,tertiary,9896.16,24496.82,2723.35,,4.38
1004,F,divorced,25.02,1,legal,postgrad,10136.44,26690.01,4143.04,B,4.55
1005,F,divorced,48.37,2,government,secondary,2521.11,13439.81,240.26,,3.09


In [18]:
# Write the merged frame (original customer columns, predicted status and
# score, indexed by the customer identifier) to disk.
df_merged.to_csv("results/cust_actual_predicted_adaboost.csv")