# DATA MINING- Clustering, CART, Random Forest & Artificial Neural Network

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import zscore
from kneed import KneeLocator
from sklearn.cluster import KMeans 
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

### Problem 1: Clustering
### A leading bank wants to develop a customer segmentation to give promotional offers to its customers. They collected a sample that summarizes the activities of users during the past few months. You are given the task to identify the segments based on credit card usage.

In [None]:
mkt = pd.read_csv('bank_marketing_part1_Data-1.csv')

### 1.1  Read the data and do exploratory data analysis. Describe the data briefly.

In [None]:
mkt.head()

In [None]:
mkt.shape 

In [None]:
mkt.info()

#### Data Summary- Measures of Central Tendency & Measures of Dispersion

In [None]:
mkt.describe()

In [None]:
print("data:",mkt.mode())

In [None]:
print(mkt.var())

In [None]:
print(mkt.max() - mkt.min())

In [None]:
mkt.quantile(0.75) - mkt.quantile(0.25)

In [None]:
cv = mkt.std()/mkt.mean()
print(cv)

#### Shape of the Data

In [None]:
mkt.skew() 

#### Five Number Summary

In [None]:
mkt.boxplot(column="spending",figsize=(6,6))

plt.text(x=0.74, y=17.3, s="3rd Quartile")
plt.text(x=0.8, y=14.3, s="Median")
plt.text(x=0.75, y=12.2, s="1st Quartile")
plt.text(x=0.9, y=10.5, s="Min")
plt.text(x=0.9, y=21.1, s="Max")

In [None]:
mkt.boxplot(column="advance_payments",figsize=(6,6))

plt.text(x=0.74, y=15.7, s="3rd Quartile")
plt.text(x=0.8, y=14.3, s="Median")
plt.text(x=0.75, y=13.4, s="1st Quartile")
plt.text(x=0.9, y=12.4, s="Min")
plt.text(x=0.9, y=17.2, s="Max")

In [None]:
mkt.boxplot(column="probability_of_full_payment",figsize=(6,6))

plt.text(x=0.74, y=0.887, s="3rd Quartile")
plt.text(x=0.8, y=0.873, s="Median")
plt.text(x=0.75, y=0.856, s="1st Quartile")
plt.text(x=0.9, y=0.808, s="Min")
plt.text(x=0.9, y=0.918, s="Max")

In [None]:
mkt.boxplot(column="current_balance",figsize=(6,6))

plt.text(x=0.74, y=5.9, s="3rd Quartile")
plt.text(x=0.8, y=5.5, s="Median")
plt.text(x=0.75, y=5.2, s="1st Quartile")
plt.text(x=0.9, y=4.8, s="Min")
plt.text(x=0.9, y=6.6, s="Max")

In [None]:
mkt.boxplot(column="credit_limit",figsize=(6,6))

plt.text(x=0.74, y=3.5, s="3rd Quartile")
plt.text(x=0.8, y=3.2, s="Median")
plt.text(x=0.75, y=2.9, s="1st Quartile")
plt.text(x=0.9, y=2.6, s="Min")
plt.text(x=0.9, y=4.0, s="Max")

In [None]:
mkt.boxplot(column="min_payment_amt",figsize=(6,6))

plt.text(x=0.74, y=4.7, s="3rd Quartile")
plt.text(x=0.8, y=3.5, s="Median")
plt.text(x=0.75, y=2.5, s="1st Quartile")
plt.text(x=0.9, y=0.7, s="Min")
plt.text(x=0.9, y=8.4, s="Max")

In [None]:
mkt.boxplot(column="max_spent_in_single_shopping",figsize=(6,6))

plt.text(x=0.74, y=5.8, s="3rd Quartile")
plt.text(x=0.8, y=5.2, s="Median")
plt.text(x=0.75, y=5.0, s="1st Quartile")
plt.text(x=0.9, y=4.5, s="Min")
plt.text(x=0.9, y=6.5, s="Max")

#### Covariance & Correlation of the Data

In [None]:
mkt.cov()

In [None]:
mkt.corr()

In [None]:
fig,ax = plt.subplots(figsize=(7,5.5))   
sns.heatmap(mkt.corr(), ax=ax, annot=True, linewidths=0.05, fmt= '.2f',cmap="magma") # the color intensity is based on the pairwise correlation value of each cell
plt.show()

### 1.2  Do you think scaling is necessary for clustering in this case? Justify

#### * Yes, scaling is necessary. 
#### * Scaling, in general, is done so that all the variables under consideration are given the same weightage. In the given dataset, spending is in 1000's, advance payments are in 100's and credit limit is in 10000's - this implies that, without scaling, different variables would be given different weightages. 
#### * Clustering techniques use distance measures such as Euclidean, Manhattan etc. to compute distances between clusters; these distances are highly affected by unscaled variables, which makes the models ineffective.
#### * Hence Scaling is very important in case of Clustering.

### 1.3 Apply hierarchical clustering to scaled data. Identify the number of optimum clusters using Dendrogram and briefly describe them

In [None]:
mkt_hc = pd.read_csv('bank_marketing_part1_Data-1.csv')

In [None]:
df_1 = mkt_hc.apply(zscore)
df_1.head()

In [None]:
wardlink = linkage(df_1, method = 'ward')

In [None]:
dend = dendrogram(wardlink)

In [None]:
dend = dendrogram(wardlink,truncate_mode='lastp',p = 6,)

In [None]:
clusters = fcluster(wardlink, 3, criterion='maxclust')
clusters

In [None]:
clusters = fcluster(wardlink, 15, criterion='distance')
clusters

In [None]:
mkt_hc['clusters'] = clusters

In [None]:
mkt_hc.to_csv('hc.csv')

### 1.4 Apply K-Means clustering on scaled data and determine optimum clusters. Apply elbow curve and silhouette score.

In [None]:
mkt_km = pd.read_csv('bank_marketing_part1_Data-1.csv')

In [None]:
df_2 = mkt_km.apply(zscore)
df_2.head()

In [None]:
k_means = KMeans(n_clusters = 2)

In [None]:
k_means.fit(df_2)

In [None]:
k_means.labels_

In [None]:
k_means.inertia_

In [None]:
k_means_1 = KMeans(n_clusters = 1)
k_means_1.fit(df_2)
k_means_1.inertia_

In [None]:
k_means_2 = KMeans(n_clusters = 2)
k_means_2.fit(df_2)
k_means_2.inertia_

In [None]:
k_means_3 = KMeans(n_clusters = 3)
k_means_3.fit(df_2)
k_means_3.inertia_

In [None]:
k_means_4 = KMeans(n_clusters = 4)
k_means_4.fit(df_2)
k_means_4.inertia_

In [None]:
k_means_5 = KMeans(n_clusters = 5)
k_means_5.fit(df_2)
k_means_5.inertia_

In [None]:
k_means_6 = KMeans(n_clusters = 6)
k_means_6.fit(df_2)
k_means_6.inertia_

In [None]:
# Elbow method: collect the within-cluster sum of squares (inertia) for
# k = 1..10 on the scaled data. random_state is pinned so the curve is the
# same on every run (KMeans initialisation is otherwise random).
wss = []
for i in range(1, 11):
    KM = KMeans(n_clusters=i, random_state=1)
    KM.fit(df_2)
    wss.append(KM.inertia_)
print(wss)

In [None]:
plt.plot(range(1,11), wss);

In [None]:
# Final K-Means model with k = 3 on the scaled data.
# random_state is pinned so the cluster label numbering (0/1/2) is stable
# between runs; the cluster-profile notes further down refer to clusters by
# these numbers, so re-check that mapping after the first run with this seed.
k_means_3 = KMeans(n_clusters = 3, random_state = 1)
k_means_3.fit(df_2)
labels = k_means_3.labels_

In [None]:
mkt_km["Clus_kmeans"] = labels
mkt_km.head(5)

In [None]:
silhouette_score(df_2,labels)

In [None]:
# Per-sample silhouette widths must be computed on the same scaled feature
# matrix the model was fitted on (df_2). mkt_km is unscaled and, at this
# point, also contains the 'Clus_kmeans' label column.
sil_width = silhouette_samples(df_2,labels)

In [None]:
mkt_km["sil_width"] = sil_width
mkt_km.head(5)

In [None]:
silhouette_samples(df_2,labels).min()

In [None]:
mkt_km.to_csv('km.csv')

### 1.5 Describe cluster profiles for the clusters defined. Recommend different promotional strategies for different clusters.

In [None]:
mkt_grouped_hc = mkt_hc.groupby(mkt_hc.clusters)

In [None]:
mkt_cluster_1_hc = mkt_grouped_hc.get_group(1)
mkt_cluster_2_hc = mkt_grouped_hc.get_group(2)
mkt_cluster_3_hc = mkt_grouped_hc.get_group(3)

In [None]:
mkt_cluster_1_hc.describe()

In [None]:
mkt_cluster_2_hc.describe()

In [None]:
mkt_cluster_3_hc.describe()

In [None]:
mkt_grouped_km = mkt_km.groupby(mkt_km.Clus_kmeans)

In [None]:
mkt_cluster_1_km = mkt_grouped_km.get_group(0)
mkt_cluster_2_km = mkt_grouped_km.get_group(1)
mkt_cluster_3_km = mkt_grouped_km.get_group(2)

In [None]:
mkt_cluster_1_km.describe()

In [None]:
mkt_cluster_2_km.describe()

In [None]:
mkt_cluster_3_km.describe()

Note:
•	Cluster 1 in Hierarchical Clustering is mapped to Cluster 0 in K-Means.
•	Cluster 2 in Hierarchical Clustering is mapped to Cluster 2 in K-Means.
•	Cluster 3 in Hierarchical Clustering is mapped to Cluster 1 in K-Means.

A.	Promotional Strategies for Cluster 1 in Hierarchical Clustering & Cluster 0 in K-Means: 

This cluster is the richer category, judging by their ‘spending’, ‘advance payments’, ‘credit limit’, ‘minimum payment amount’ and ‘maximum spent in single shopping’, which are the highest among the three clusters.
To convert them into more profitable customers:

i.	Increase their credit limit, so that they use it more frequently, as the chance of them turning into bad customers is very low.

ii.	Since they are the highest spenders of the three clusters, they can be offered a few loyalty rewards to hold them back as our customers. This may include a round-trip to various countries, cashback/reward points to the highest spender(s) of the week.


B.	Promotional Strategies for Cluster 2 in Hierarchical Clustering & Cluster 2 in K-Means: 

This cluster comes under the ‘aspiring spenders’ (middle income) category as almost all the parameters under consideration are closer to the previous category. 

Since their aspiration to spend is high, the chance that they will churn is also high. To retain them as loyal customers, instead of increasing their credit limit (as this can prove to be a risky category of customers as well), the bank should offer a more attractive promotional strategy to this group than to the remaining groups of customers. 


C.	Promotional Strategies for Cluster 3 in Hierarchical Clustering & Cluster 1 in K-Means: 

This is the category consisting of low-income people with lowest spending and advance payments. However, the minimum payment amount is the highest.

The best promotional strategies for this group would be:

i.	Providing an EMI option on purchases (say, up to maximum of 6 months) which might sound attractive to them and they can easily purchase now and pay later.
This is also one of the ways to generate some interest to the banks from this category.

ii.	Instead of providing cashbacks/other promotional offers, the best-suited promotional strategy would be to provide product discounts (say 2% to 5% on particular essential products). This way, the chance that these customers will churn would reduce.

### Problem 2: CART-RF-ANN
### An Insurance firm providing tour insurance is facing higher claim frequency. The management decides to collect data from the past few years. You are assigned the task to make a model which predicts the claim status and provide recommendations to management. Use CART, RF & ANN and compare the models' performances in train and test sets. 


In [None]:
ins = pd.read_csv('insurance_part2_data-2.csv')

### 2.1  Data Ingestion: Read the dataset. Do the descriptive statistics and do null value condition check, write an inference on it.

In [None]:
ins.head()

In [None]:
ins.shape 

In [None]:
ins.info()

#### Data Summary- Measures of Central Tendency & Measures of Dispersion

In [None]:
ins.describe()

In [None]:
print("data:",ins.mode())

In [None]:
# 'ins' still contains object (string) columns at this point; restrict the
# variance to numeric columns, otherwise pandas >= 2.0 raises TypeError.
print(ins.var(numeric_only=True))

In [None]:
# Inter-quartile range per numeric column. numeric_only=True skips the
# object columns, which cannot be ranked into quantiles.
Q1 = ins.quantile(0.25, numeric_only=True)
Q3 = ins.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

In [None]:
# Coefficient of variation (std / mean) for the numeric columns only;
# the object columns have no meaningful std or mean.
cv = ins.std(numeric_only=True)/ins.mean(numeric_only=True)
print(cv)

#### Shape of the Data

In [None]:
# Skewness of the numeric columns (object columns are excluded explicitly).
ins.skew(numeric_only=True)

#### Five Number Summary

In [None]:
ins.boxplot(column="Age",figsize=(6,6))

In [None]:
ins.boxplot(column="Commision",figsize=(6,6))

In [None]:
ins.boxplot(column="Duration",figsize=(6,6))

In [None]:
ins.boxplot(column="Sales",figsize=(6,6))

#### Covariance & Correlation of the Data

In [None]:
# Covariance is only defined for numeric columns; select them explicitly so
# the call works regardless of the pandas version's numeric_only default.
ins.select_dtypes(include="number").cov()

In [None]:
# Correlation is only defined for numeric columns; select them explicitly so
# the call works regardless of the pandas version's numeric_only default.
ins.select_dtypes(include="number").corr()

In [None]:
fig,ax = plt.subplots(figsize=(7,5.5))
# Correlation heatmap of the numeric columns only (the object columns are
# not yet encoded here and would make DataFrame.corr fail on recent pandas).
sns.heatmap(ins.select_dtypes(include="number").corr(), ax=ax, annot=True, linewidths=0.05, fmt= '.2f',cmap="magma")
plt.show()

#### Checking for Missing Values 

In [None]:
ins.isnull().sum()

#### Checking for Duplicate Rows & Dropping them, if any

In [None]:
dups = ins.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
ins[dups].head()

In [None]:
ins = ins.drop_duplicates()

In [None]:
ins.loc[ins['Duration'] > 1000]

In [None]:
# Replace the invalid Duration value 0 with the column median.
# Assign the result back instead of calling replace(..., inplace=True) on
# ins['Duration']: that is a chained in-place update on a temporary Series
# and is not guaranteed to modify 'ins' (it is a no-op under Copy-on-Write).
ins['Duration'] = ins['Duration'].replace(0, ins['Duration'].median())

In [None]:
# Replace the invalid Duration value -1 with the column median.
# Assign the result back instead of calling replace(..., inplace=True) on
# ins['Duration']: that is a chained in-place update on a temporary Series
# and is not guaranteed to modify 'ins' (it is a no-op under Copy-on-Write).
ins['Duration'] = ins['Duration'].replace(-1, ins['Duration'].median())

#### Values Counts for the Categorical Variables

In [None]:
print('Agency_code \n',ins.Agency_Code.value_counts())
print('\n')
print('Type \n',ins.Type.value_counts())
print('\n')
print('Claimed \n',ins.Claimed.value_counts())
print('\n')
print('Channel \n',ins.Channel.value_counts())
print('\n')
print('Product Name \n',ins.Product_Name.value_counts())
print('\n')
print('Destination \n',ins.Destination.value_counts())

#### Converting the Categorical to Numerical to facilitate the Model Building

In [None]:
for feature in ins.columns: 
    if ins[feature].dtype == 'object': 
        print('\n')
        print('feature:',feature)
        print(pd.Categorical(ins[feature].unique()))
        print(pd.Categorical(ins[feature].unique()).codes)
        ins[feature] = pd.Categorical(ins[feature]).codes

#### Treating the Outliers- IQR Method

In [None]:
def remove_outlier(col):
    """Return the IQR-based outlier fences for ``col``.

    Despite its name, this function does not remove or modify anything; it
    only computes the bounds that the caller then uses to cap values.

    Parameters
    ----------
    col : array-like of numbers
        The values of one numeric variable.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where ``lower_range = Q1 - 1.5 * IQR``
        and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # np.percentile does not need sorted input, so no pre-sorting is done.
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# Cap the continuous variables at their IQR fences: values above the upper
# fence are set to the upper fence, values below the lower fence are set to
# the lower fence. The same two steps are applied to every column.
for col_name in ['Age', 'Commision', 'Duration', 'Sales']:
    lower_cap, upper_cap = remove_outlier(ins[col_name])
    ins[col_name] = np.where(ins[col_name] > upper_cap, upper_cap, ins[col_name])
    ins[col_name] = np.where(ins[col_name] < lower_cap, lower_cap, ins[col_name])

In [None]:
ins.boxplot(column="Sales",figsize=(6,6))

In [None]:
ins.boxplot(column="Age",figsize=(6,6))

In [None]:
ins.boxplot(column="Commision",figsize=(6,6))

In [None]:
ins.boxplot(column="Duration",figsize=(6,6))

### 2.2  Data Split: Split the data into test and train, build classification model CART, Random Forest, Artificial Neural Network

#### Defining the Independent & Dependent Variables

In [None]:
# Features: every column except the target.
X = ins.drop("Claimed", axis=1)
# Target: read the column instead of pop()-ing it, so 'ins' is left intact
# and this cell can be re-run without a KeyError on "Claimed".
y = ins["Claimed"]

#### Splitting the data into Train & Test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)

In [None]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('train_labels',y_train.shape)
print('test_labels',y_test.shape)

#### DECISION TREE CLASSIFIER

In [None]:
model_dt = DecisionTreeClassifier(criterion = 'gini')

In [None]:
model_dt.fit(X_train,y_train)

In [None]:
from sklearn import tree

train_char_label = ['No', 'Yes']
# Write the fitted (unpruned) tree as a Graphviz .dot file. The context
# manager closes the file even if the export raises.
# export_graphviz returns None when out_file is given, so dot_data is None.
with open('Insurance_Tree_File.dot','w') as Insurance_Tree_File:
    dot_data = tree.export_graphviz(model_dt,
                                    out_file=Insurance_Tree_File,
                                    feature_names = list(X_train),
                                    class_names = list(train_char_label))

In [None]:
print(pd.DataFrame(model_dt.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values('Imp',ascending=False))

In [None]:
y_predict = model_dt.predict(X_test)

In [None]:
param_grid_dt = {
    'max_depth': [4,6,7],
    'min_samples_leaf': [30,35,40],
    'min_samples_split': [60,80,120]
}

model_dt = DecisionTreeClassifier(random_state=1)

grid_search_dt = GridSearchCV(estimator = model_dt, param_grid = param_grid_dt, cv = 10)

In [None]:
grid_search_dt.fit(X_train, y_train)

In [None]:
grid_search_dt.best_params_

In [None]:
model_dt_reg = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, min_samples_leaf=30, min_samples_split=120, random_state=1)
model_dt_reg.fit(X_train, y_train)

In [None]:
from sklearn import tree

train_char_label = ['No', 'Yes']
# Write the regularised tree as a Graphviz .dot file. The context manager
# closes the file even if the export raises.
# export_graphviz returns None when out_file is given, so dot_data is None.
with open('Insurance_Tree_File_reg.dot','w') as Insurance_Tree_File_reg:
    dot_data = tree.export_graphviz(model_dt_reg,
                                    out_file=Insurance_Tree_File_reg,
                                    feature_names = list(X_train),
                                    class_names = list(train_char_label))

In [None]:
print (pd.DataFrame(model_dt_reg.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values('Imp',ascending=False))

In [None]:
y_train_predict_dt = model_dt_reg.predict(X_train)
y_test_predict_dt = model_dt_reg.predict(X_test)

In [None]:
y_train_predict_dt_prob_a = model_dt_reg.predict_proba(X_train)
y_test_predict_dt_prob_a = model_dt_reg.predict_proba(X_test)

In [None]:
model_dt_reg.score(X_train,y_train)

In [None]:
model_dt_reg.score(X_test,y_test)

#### RANDOM FOREST CLASSIFIER

In [None]:
# Hyper-parameter grid explored for the Random Forest.
param_grid_rf = {
    'max_depth': [6,7,8],
    'max_features': [5,6,7,8],
    'min_samples_leaf': [20,30,40],
    'min_samples_split': [40,45,50],
    'n_estimators': [100,200,300]
}

# Base estimator for the search. It is deliberately NOT fitted here:
# GridSearchCV clones the estimator for every candidate/fold, so a fit on
# the base object would be thrown away and only cost time.
rfcl = RandomForestClassifier(random_state = 1)

grid_search_rf = GridSearchCV(estimator = rfcl, param_grid = param_grid_rf, cv = 10)

In [None]:
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
best_grid_rf = grid_search_rf.best_estimator_

In [None]:
model_rf_reg = RandomForestClassifier(n_estimators = 201,max_depth = 7,max_features = 7,min_samples_leaf = 20 , min_samples_split = 45 ,oob_score = True,random_state = 1)
model_rf_reg.fit(X_train, y_train)

In [None]:
y_train_predict_rf = model_rf_reg.predict(X_train)
y_test_predict_rf = model_rf_reg.predict(X_test)

In [None]:
y_train_predict_rf_prob_a = model_rf_reg.predict_proba(X_train)
y_test_predict_rf_prob_a = model_rf_reg.predict_proba(X_test)

In [None]:
model_rf_reg.score(X_train,y_train)

In [None]:
model_rf_reg.score(X_test,y_test)

In [None]:
model_rf_reg.oob_score_

In [None]:
# Out-of-bag error = 1 - OOB accuracy, taken from the fitted model rather
# than from a pasted number so it stays correct when the model changes.
1 - model_rf_reg.oob_score_

In [None]:
print (pd.DataFrame(model_rf_reg.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values('Imp',ascending=False))

#### MLP CLASSIFIER (ARTIFICIAL NEURAL NETWORK)

#### A target variable with a large spread of values, in turn, may result in large error gradient values causing weight values to change dramatically, making the learning process unstable. Scaling input and output variables is a critical step in using neural network models.

In [None]:
z = StandardScaler() 
X_train_z = z.fit_transform(X_train) 
X_test_z = z.transform (X_test)

In [None]:
param_grid_mlp = {
    'hidden_layer_sizes': [(100,100,100)],
    'activation': ['logistic', 'relu'],
    'solver': ['sgd', 'adam'],
    'tol': [0.1,0.01,0.001],
    'max_iter' : [150,300]
}

nncl = MLPClassifier(random_state=1)

grid_search_mlp = GridSearchCV(estimator = nncl, param_grid = param_grid_mlp, cv = 10)

In [None]:
grid_search_mlp.fit(X_train_z, y_train)

In [None]:
grid_search_mlp.best_params_

In [None]:
best_grid_mlp = grid_search_mlp.best_estimator_
best_grid_mlp

In [None]:
clf = MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation ='relu',max_iter = 150,solver = 'adam',verbose = True,random_state = 1, tol = 0.001 )

In [None]:
clf.fit(X_train_z, y_train)

In [None]:
y_train_predict_mlp = clf.predict(X_train_z)
y_test_predict_mlp = clf.predict(X_test_z)

### 2.3  Performance Metrics: Check the performance of Predictions on Train and Test sets using Accuracy, Confusion Matrix, Plot ROC curve and get ROC_AUC score for each model

#### A .  DECISION TREE CLASSIFIER

#### A.1  Decision Tree Classifier - TRAIN DATA

In [None]:
confusion_matrix(y_train, y_train_predict_dt)

In [None]:
confusion_matrix(y_train, y_train_predict_dt)
sns.heatmap(confusion_matrix(y_train, y_train_predict_dt),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
#Train Data Accuracy
# Score the regularised decision tree (model_dt_reg), not the random forest,
# so the CART figure in the comparison table is really the CART accuracy.
cart_train_acc=model_dt_reg.score(X_train,y_train) 
cart_train_acc

In [None]:
print(classification_report(y_train,y_train_predict_dt))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
cart_metrics=classification_report(y_train, y_train_predict_dt,output_dict=True)
df=pd.DataFrame(cart_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
cart_train_f1=round(df.loc["1", "f1-score"],2)
cart_train_recall=round(df.loc["1", "recall"],2)
cart_train_precision=round(df.loc["1", "precision"],2)
print ('cart_train_precision ',cart_train_precision)
print ('cart_train_recall ',cart_train_recall)
print ('cart_train_f1 ',cart_train_f1)

In [None]:
# predict probabilities
probs = model_dt_reg.predict_proba(X_train)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
cart_train_auc = roc_auc_score(y_train, probs)
print('AUC: %.3f' % cart_train_auc)

# calculate roc curve
cart_train_fpr, cart_train_tpr, cart_trainthresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(cart_train_fpr, cart_train_tpr, marker='.')

# show the plot
plt.show()

#### A.2  Decision Tree Classifier - TEST DATA

In [None]:
confusion_matrix(y_test, y_test_predict_dt)

In [None]:
confusion_matrix(y_test, y_test_predict_dt)
sns.heatmap(confusion_matrix(y_test, y_test_predict_dt),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
#Test Data Accuracy
# Score the regularised decision tree (model_dt_reg), not the random forest,
# so the CART figure in the comparison table is really the CART accuracy.
cart_test_acc=model_dt_reg.score(X_test,y_test)
cart_test_acc

In [None]:
print(classification_report(y_test,y_test_predict_dt))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
cart_metrics=classification_report(y_test,y_test_predict_dt,output_dict=True)
df=pd.DataFrame(cart_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
cart_test_precision=round(df.loc["1", "precision"],2)
cart_test_recall=round(df.loc["1", "recall"],2)
cart_test_f1=round(df.loc["1", "f1-score"],2)
print ('cart_test_precision ',cart_test_precision)
print ('cart_test_recall ',cart_test_recall)
print ('cart_test_f1 ',cart_test_f1)

In [None]:
# predict probabilities
probs = model_dt_reg.predict_proba(X_test)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
cart_test_auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % cart_test_auc)

# calculate roc curve
cart_test_fpr, cart_test_tpr, cart_testthresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(cart_test_fpr, cart_test_tpr, marker='.')

# show the plot
plt.show()

#### B.  RANDOM FOREST CLASSIFIER

#### B.1  Random Forest Classifier - TRAIN DATA

In [None]:
confusion_matrix(y_train, y_train_predict_rf)

In [None]:
confusion_matrix(y_train, y_train_predict_rf)
sns.heatmap(confusion_matrix(y_train, y_train_predict_rf),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
rf_train_acc=model_rf_reg.score(X_train,y_train) 
rf_train_acc

In [None]:
print(classification_report(y_train,y_train_predict_rf))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
rf_metrics=classification_report(y_train,y_train_predict_rf,output_dict=True)
df=pd.DataFrame(rf_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
rf_train_precision=round(df.loc["1", "precision"],2)
rf_train_recall=round(df.loc["1", "recall"],2)
rf_train_f1=round(df.loc["1", "f1-score"],2)
print ('rf_train_precision ',rf_train_precision)
print ('rf_train_recall ',rf_train_recall)
print ('rf_train_f1 ',rf_train_f1)

In [None]:
# predict probabilities
probs = model_rf_reg.predict_proba(X_train)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
rf_train_auc = roc_auc_score(y_train, probs)
print('AUC: %.3f' % rf_train_auc)

# calculate roc curve
rf_train_fpr,rf_train_tpr, thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(rf_train_fpr,rf_train_tpr, marker='.')

# show the plot
plt.show()

#### B.2   Random Forest Classifier - TEST DATA

In [None]:
confusion_matrix(y_test, y_test_predict_rf)

In [None]:
confusion_matrix(y_test, y_test_predict_rf)
sns.heatmap(confusion_matrix(y_test, y_test_predict_rf),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
rf_test_acc=model_rf_reg.score(X_test,y_test)
rf_test_acc

In [None]:
print(classification_report(y_test,y_test_predict_rf))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
rf_metrics=classification_report(y_test,y_test_predict_rf,output_dict=True)
df=pd.DataFrame(rf_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
rf_test_precision=round(df.loc["1", "precision"],2)
rf_test_recall=round(df.loc["1", "recall"],2)
rf_test_f1=round(df.loc["1", "f1-score"],2)
print ('rf_test_precision ',rf_test_precision)
print ('rf_test_recall ',rf_test_recall)
print ('rf_test_f1 ',rf_test_f1)

In [None]:
# predict probabilities
probs = model_rf_reg.predict_proba(X_test)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
rf_test_auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % rf_test_auc)

# calculate roc curve
rf_test_fpr,rf_test_tpr, thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(rf_test_fpr,rf_test_tpr, marker='.')

# show the plot
plt.show()

#### C.  MLP CLASSIFIER (ARTIFICIAL NEURAL NETWORK)

#### C.1  Artificial Neural Network - TRAIN DATA

In [None]:
confusion_matrix(y_train,y_train_predict_mlp)

In [None]:
confusion_matrix(y_train,y_train_predict_mlp)
sns.heatmap(confusion_matrix(y_train,y_train_predict_mlp),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
nn_train_acc=clf.score(X_train_z,y_train) 
nn_train_acc

In [None]:
print(classification_report(y_train,y_train_predict_mlp))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
nn_metrics=classification_report(y_train, y_train_predict_mlp,output_dict=True)
df=pd.DataFrame(nn_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
nn_train_precision=round(df.loc["1", "precision"],2)
nn_train_recall=round(df.loc["1", "recall"],2)
nn_train_f1=round(df.loc["1", "f1-score"],2)
print ('nn_train_precision ',nn_train_precision)
print ('nn_train_recall ',nn_train_recall)
print ('nn_train_f1 ',nn_train_f1)

In [None]:
# predict probabilities
probs = clf.predict_proba(X_train_z)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
nn_train_auc = roc_auc_score(y_train, probs)
print('AUC: %.3f' % nn_train_auc)

# calculate roc curve
nn_train_fpr,nn_train_tpr, thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(nn_train_fpr,nn_train_tpr, marker='.')

# show the plot
plt.show()

#### C.2  Artificial Neural Network - TEST DATA

In [None]:
confusion_matrix(y_test,y_test_predict_mlp)

In [None]:
confusion_matrix(y_test,y_test_predict_mlp)
sns.heatmap(confusion_matrix(y_test,y_test_predict_mlp),annot=True, fmt='d',cbar=False, cmap="mako")
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')

In [None]:
nn_test_acc=clf.score(X_test_z,y_test)
nn_test_acc

In [None]:
print(classification_report(y_test,y_test_predict_mlp))

In [None]:
# Classification report as a DataFrame: one row per class / average,
# columns 'precision', 'recall', 'f1-score', 'support'.
nn_metrics=classification_report(y_test, y_test_predict_mlp,output_dict=True)
df=pd.DataFrame(nn_metrics).transpose()
# Pick the positive class ("1") metrics by column label instead of by
# integer position (positional [] on a label-indexed Series is deprecated).
nn_test_precision=round(df.loc["1", "precision"],2)
nn_test_recall=round(df.loc["1", "recall"],2)
nn_test_f1=round(df.loc["1", "f1-score"],2)
print ('nn_test_precision ',nn_test_precision)
print ('nn_test_recall ',nn_test_recall)
print ('nn_test_f1 ',nn_test_f1)

In [None]:
# predict probabilities
probs = clf.predict_proba(X_test_z)

# keep probabilities for the positive outcome only
probs = probs[:, 1]

# calculate AUC
nn_test_auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % nn_test_auc)

# calculate roc curve
nn_test_fpr,nn_test_tpr, thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')

# plot the roc curve for the model
plt.plot(nn_test_fpr,nn_test_tpr, marker='.')

# show the plot
plt.show()

### 2.4 Final Model: Compare all the model and write an inference which model is best/optimized.

In [None]:
index=['Accuracy', 'AUC', 'Recall','Precision','F1 Score']
data = pd.DataFrame({'CART Train':[cart_train_acc,cart_train_auc,cart_train_recall,cart_train_precision,cart_train_f1],
        'CART Test':[cart_test_acc,cart_test_auc,cart_test_recall,cart_test_precision,cart_test_f1],
       'Random Forest Train':[rf_train_acc,rf_train_auc,rf_train_recall,rf_train_precision,rf_train_f1],
        'Random Forest Test':[rf_test_acc,rf_test_auc,rf_test_recall,rf_test_precision,rf_test_f1],
       'Neural Network Train':[nn_train_acc,nn_train_auc,nn_train_recall,nn_train_precision,nn_train_f1],
        'Neural Network Test':[nn_test_acc,nn_test_auc,nn_test_recall,nn_test_precision,nn_test_f1]},index=index)
round(data,2)

In [None]:
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(cart_train_fpr, cart_train_tpr,color='red',label="CART")
plt.plot(rf_train_fpr,rf_train_tpr,color='green',label="RF")
plt.plot(nn_train_fpr,nn_train_tpr,color='black',label="NN")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Train')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower right')

In [None]:
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(cart_test_fpr, cart_test_tpr,color='red',label="CART")
plt.plot(rf_test_fpr,rf_test_tpr,color='green',label="RF")
plt.plot(nn_test_fpr,nn_test_tpr,color='black',label="NN")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Test')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower right')

•	Comparing the Train and Test Dataset, the accuracy for all the three models under consideration is not showing much of a difference. Also, considering only accuracy i.e., how accurately does the model classify the data points may not give us a full picture of the solution. 

•	Looking at the AUC Score, the obvious preference would be Random Forest.

•	Further, from Precision, which tells us how many of the cases the model predicts as positive are actually positive, Random Forest is again preferred and is closely followed by Neural Network. This is important from the perspective of an insurance company which is facing a higher claim frequency for its tour insurance. 

•	It is difficult to compare two models when one has low recall and high precision and the other has high recall and low precision. For this purpose, we can use the f1-Score to compare the models. The highest f1-Score is observed for the Random Forest model, followed by Decision Tree.

In conclusion, all the models have performed more or less in a similar fashion when compared with respect to all the metrics considered. In this case, out of all, Random Forest has performed slightly better than the rest (with a small difference of +0.05).


### 2.5  Inference: Basis on these predictions, what are the business insights and recommendations?

•	Of the three models, Random Forest has performed slightly better than the other two models: Decision Tree and Neural Network. 

•	It is of prime importance that we build a model which is efficient in predicting the claim status, so that we can provide valuable insights to the business.

•	If the model is a black-box technique wherein setting the hyper-parameters is crucial, it can prove difficult to rely on that particular model to make business decisions.

•	However, in this particular case, all three models have shown a stable performance across all the metrics of comparison.

•	One useful metric from Decision Tree and Random Forest is the variable importance, which is crucial for the business. From both the models, we saw that ‘Agency Code’ was the most important variable, followed by ‘Sales’ and ‘Product Name’. These variables are potential identifiers of whether the insured claimed or not.

•	On the other hand, the variables that are of least importance are: ‘Type’ and ‘Channel’.