# Predicting Purchase Behaviour (PB) based on Attitude (ATTD), Social Norms (SN) and Perceived Behavioural Control (PBC)

# Installing Dependencies

In [14]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Importing libraries

In [15]:
import math
import pandas as pd #dataframe
import numpy as np #mathematical computations
import matplotlib.pyplot as plt #visualization
import matplotlib
import joblib
import seaborn as sns #visualization
import json
import pickle #saving the model
import scikitplot as skplt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split #Splitting the dataset into training and testing
from sklearn.model_selection import ShuffleSplit #Random shuffling
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from statistics import stdev
from warnings import simplefilter

import warnings
warnings.filterwarnings('ignore')


# Importing Dataset

In [16]:
#reading the .xlsx dataset in to the dataframe
df1 = pd.read_excel("Data v1.0.xlsx")

In [17]:
df1.head()

Unnamed: 0,ID,Gender,Age,Ethnic,Occupation,Annual Income,Social Media usage,Experience of purchasing due to social media influence,Reviews and rating impact on purchasing decision,Average time spent on the Internet (weekly),...,Attitude 3 (ATTD3),Attitude 4 (ATTD4),Social Norms 1 (SN1),Social Norms 2 (SN2),Social Norms 3 (SN3),Social Norms 4 (SN4),Perceived Behavioural Control 1 (PBC1),Perceived Behavioural Control 2 (PBC2),Perceived Behavioural Control 3 (PBC3),Perceived Behavioural Control 4 (PBC4)
0,1,Female,29 - 34 Years Old,Chinese,Homemaker,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral
1,2,Male,23 - 28 Years Old,Chinese,Student,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,...,Agree,Neutral,Strongly disagree,Disagree,Agree,Neutral,Neutral,Agree,Disagree,Disagree
2,3,Female,29 - 34 Years Old,Chinese,Homemaker,"Less than RM30,000",Yes,Yes,Yes,5 hours – 10 hours,...,Strongly disagree,Disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree
3,4,Male,35 - 40 Years Old,Malay,Businessman,"More than RM90,001",Yes,Yes,No,1 hour – 4 hours,...,Agree,Neutral,Neutral,Neutral,Agree,Agree,Neutral,Agree,Neutral,Agree
4,5,Female,23 - 28 Years Old,Chinese,Employee,"RM70,001-RM90,000",Yes,Yes,Yes,10 hours – 20 hours,...,Agree,Agree,Neutral,Disagree,Neutral,Neutral,Neutral,Agree,Agree,Agree


In [18]:
df1.shape

(219, 27)

In [19]:
df1.columns

Index(['ID', 'Gender', 'Age', 'Ethnic', 'Occupation', 'Annual Income',
       'Social Media usage',
       'Experience of purchasing due to social media influence',
       'Reviews and rating impact on purchasing decision',
       'Average time spent on the Internet (weekly)',
       'Attention to advertisement on social media',
       'Purchase Behaviour 1 (PB1)', 'Purchase Behaviour 2 (PB2)',
       'Purchase Behaviour 3 (PB3)', 'Purchase Behaviour 4 (PB4)',
       'Attitude 1 (ATTD1)', 'Attitude 2 (ATTD2)', 'Attitude 3 (ATTD3)',
       'Attitude 4 (ATTD4)', 'Social Norms 1 (SN1)', 'Social Norms 2 (SN2)',
       'Social Norms 3 (SN3)', 'Social Norms 4 (SN4)',
       'Perceived Behavioural Control 1 (PBC1)',
       'Perceived Behavioural Control 2 (PBC2)',
       'Perceived Behavioural Control 3 (PBC3)',
       'Perceived Behavioural Control 4 (PBC4)'],
      dtype='object')

In [21]:
df1["Gender"].unique()

array(['Female', 'Male'], dtype=object)

# Exploratory Data Analysis (EDA)

In [22]:
# Remove the ID column (noted as not important for the analysis) and preview the result.
df2 = df1.drop(columns=['ID'])
df2.head()

Unnamed: 0,Gender,Age,Ethnic,Occupation,Annual Income,Social Media usage,Experience of purchasing due to social media influence,Reviews and rating impact on purchasing decision,Average time spent on the Internet (weekly),Attention to advertisement on social media,...,Attitude 3 (ATTD3),Attitude 4 (ATTD4),Social Norms 1 (SN1),Social Norms 2 (SN2),Social Norms 3 (SN3),Social Norms 4 (SN4),Perceived Behavioural Control 1 (PBC1),Perceived Behavioural Control 2 (PBC2),Perceived Behavioural Control 3 (PBC3),Perceived Behavioural Control 4 (PBC4)
0,Female,29 - 34 Years Old,Chinese,Homemaker,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral
1,Male,23 - 28 Years Old,Chinese,Student,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,Agree,Neutral,Strongly disagree,Disagree,Agree,Neutral,Neutral,Agree,Disagree,Disagree
2,Female,29 - 34 Years Old,Chinese,Homemaker,"Less than RM30,000",Yes,Yes,Yes,5 hours – 10 hours,No,...,Strongly disagree,Disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree
3,Male,35 - 40 Years Old,Malay,Businessman,"More than RM90,001",Yes,Yes,No,1 hour – 4 hours,Yes,...,Agree,Neutral,Neutral,Neutral,Agree,Agree,Neutral,Agree,Neutral,Agree
4,Female,23 - 28 Years Old,Chinese,Employee,"RM70,001-RM90,000",Yes,Yes,Yes,10 hours – 20 hours,No,...,Agree,Agree,Neutral,Disagree,Neutral,Neutral,Neutral,Agree,Agree,Agree


In [23]:
df2.isnull().any()

Gender                                                    False
Age                                                       False
Ethnic                                                    False
Occupation                                                False
Annual Income                                             False
Social Media usage                                        False
Experience of purchasing due to social media influence    False
Reviews and rating impact on purchasing decision          False
Average time spent on the Internet (weekly)               False
Attention to advertisement on social media                False
Purchase Behaviour 1 (PB1)                                False
Purchase Behaviour 2 (PB2)                                False
Purchase Behaviour 3 (PB3)                                False
Purchase Behaviour 4 (PB4)                                False
Attitude 1 (ATTD1)                                        False
Attitude 2 (ATTD2)                      

In [None]:
df2.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value, then re-count nulls per column.
df2 = df2.dropna()
df2.isna().sum()

In [None]:
# A cell only renders its last expression, so the shape has to be printed explicitly
# to appear alongside the preview of the first 20 rows.
print(df2.shape)
df2.head(20)

In [None]:
df2.nunique()

In [None]:
# Short column aliases used throughout the rest of the notebook.
# errors='raise' makes the cell fail loudly if any expected source column is missing.
short_names = {
    # respondent profile
    'Annual Income': 'Annual_Income',
    'Social Media usage': 'Social_Media_usage',
    'Experience of purchasing due to social media influence': 'Experience',
    'Reviews and rating impact on purchasing decision': 'Reviews_and_rating',
    'Average time spent on the Internet (weekly)': 'ATI',
    'Attention to advertisement on social media': 'ADSM',
    # purchase behaviour items
    'Purchase Behaviour 1 (PB1)': 'PB1',
    'Purchase Behaviour 2 (PB2)': 'PB2',
    'Purchase Behaviour 3 (PB3)': 'PB3',
    'Purchase Behaviour 4 (PB4)': 'PB4',
    # attitude items
    'Attitude 1 (ATTD1)': 'ATTD1',
    'Attitude 2 (ATTD2)': 'ATTD2',
    'Attitude 3 (ATTD3)': 'ATTD3',
    'Attitude 4 (ATTD4)': 'ATTD4',
    # social norm items
    'Social Norms 1 (SN1)': 'SN1',
    'Social Norms 2 (SN2)': 'SN2',
    'Social Norms 3 (SN3)': 'SN3',
    'Social Norms 4 (SN4)': 'SN4',
    # perceived behavioural control items
    'Perceived Behavioural Control 1 (PBC1)': 'PBC1',
    'Perceived Behavioural Control 2 (PBC2)': 'PBC2',
    'Perceived Behavioural Control 3 (PBC3)': 'PBC3',
    'Perceived Behavioural Control 4 (PBC4)': 'PBC4',
}
df2 = df2.rename(columns=short_names, errors='raise')

In [None]:
df2.head()

In [None]:
df2.Gender.unique()

In [None]:
df2.Age.unique()

In [None]:
df2.Ethnic.unique()

In [None]:
df2.Occupation.unique()

In [None]:
df2.Annual_Income.unique()

In [None]:
df2.Social_Media_usage.unique()

In [None]:
df2.Experience.unique()

In [None]:
df2.Reviews_and_rating.unique()

In [None]:
df2.ATI.unique()

In [None]:
df2.ADSM.unique()

In [None]:
df2.PB1.unique()

In [None]:
df2.PB2.unique()

In [None]:
df2.PB3.unique()

In [None]:
df2.PB4.unique()

In [None]:
# Collect the distinct answers of every attitude item in one mapping so that all four
# are displayed (a cell only renders its final expression).
{item: df2[item].unique() for item in ['ATTD1', 'ATTD2', 'ATTD3', 'ATTD4']}

In [None]:
# Collect the distinct answers of every social-norm item in one mapping so that all four
# are displayed (a cell only renders its final expression).
{item: df2[item].unique() for item in ['SN1', 'SN2', 'SN3', 'SN4']}

In [None]:
# Collect the distinct answers of every perceived-behavioural-control item in one mapping
# so that all four are displayed (a cell only renders its final expression).
{item: df2[item].unique() for item in ['PBC1', 'PBC2', 'PBC3', 'PBC4']}

In [None]:
df2.describe()

### VISUALIZATION

In [None]:
# fig, ax = plt.subplots(1, 4, figsize=(20, 10))
# fig.suptitle('Count Plot', fontsize=16, x=.5)

# columns = ['Gender', 'Age', 'Ethnic','Occupation']
# for i, col in enumerate(columns):
#     graph = sns.countplot(x=df2[col], ax=ax[i])
#     ax[i].set_title(*[col])
#     graph.bar_label(graph.containers[0])

# One count plot per respondent-profile column, each drawn as its own figure.
features = [
    'Gender', 'Age', 'Ethnic', 'Occupation', 'Annual_Income',
    'Social_Media_usage', 'Experience', 'Reviews_and_rating', 'ATI', 'ADSM',
]

for column_name in features:
    sns.countplot(data=df2, x=column_name, palette='Set1')
    plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
    plt.show()

In [None]:
# fig, ax = plt.subplots(1, 3, figsize=(25, 10))
# fig.suptitle('Count Plot', fontsize=16, x=0.92)

# columns = ['Annual_Income', 'Social_Media_usage', 'Experience']
# for i, col in enumerate(columns):
#     graph = sns.countplot(x=df2[col], ax=ax[i])
#     ax[i].set_title(*[col])
#     graph.bar_label(graph.containers[0])

In [None]:
# fig, ax = plt.subplots(1, 3, figsize=(25, 10))
# fig.suptitle('Count Plot', fontsize=16, x=0.92)

# columns = ['Reviews_and_rating', 'ATI', 'ADSM']
# for i, col in enumerate(columns):
#     graph = sns.countplot(x=df2[col], ax=ax[i])
#     ax[i].set_title(*[col])
#     graph.bar_label(graph.containers[0])

In [None]:
# Count plots of the four Attitude (ATTD) items, one figure each.
features = ['ATTD1', 'ATTD2', 'ATTD3', 'ATTD4']

for column_name in features:
    sns.countplot(data=df2, x=column_name, palette='Set2')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Count plots of the four Social Norms (SN) items, one figure each.
features = ['SN1', 'SN2', 'SN3', 'SN4']

for column_name in features:
    sns.countplot(data=df2, x=column_name, palette='Set2')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Count plots of the four Perceived Behavioural Control (PBC) items, one figure each.
features = ['PBC1', 'PBC2', 'PBC3', 'PBC4']

for column_name in features:
    sns.countplot(data=df2, x=column_name, palette='Set2')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Count plots of the four Purchase Behaviour (PB) items, one figure each.
features = ['PB1', 'PB2', 'PB3', 'PB4']

for column_name in features:
    sns.countplot(data=df2, x=column_name, palette='Set2')
    plt.xticks(rotation=45)
    plt.show()

# DATA PREPROCESSING

In [None]:
# Encode the five Likert answers used by the PB, ATTD, SN and PBC items as the integers 5..1.
# The replacement is applied to the whole frame, so any cell holding exactly one of these
# labels is converted, whichever column it is in.
likert_codes = {
    'Strongly agree': 5,
    'Agree': 4,
    'Neutral': 3,
    'Disagree': 2,
    'Strongly disagree': 1,
}
df2 = df2.replace(likert_codes)

In [None]:
# PB is the row-wise total of the four purchase-behaviour items
# (skipna=False keeps the propagate-missing behaviour of plain addition).
df2['PB'] = df2[['PB1', 'PB2', 'PB3', 'PB4']].sum(axis=1, skipna=False)

In [None]:
# Binarise PB into the target column 'PB-inf': totals up to PB_THRESHOLD are 'LOW',
# anything above is 'HIGH'.
# PB is the sum of four items coded 1-5, so it ranges over 4-20 and 12 is the midpoint of that range.
# NOTE(review): the earlier comment described the cut-off as "mean value 10", while the code
# has always used 12 - confirm which value is intended.
PB_THRESHOLD = 12
df2['PB-inf'] = np.where(df2['PB'] <= PB_THRESHOLD, 'LOW', 'HIGH')

In [None]:
df2.head()

In [None]:
# 2 x 2 grid of box plots: distribution of the PB total within each demographic category.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(25, 25))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for panel, category in zip(axes.flat, ['Gender', 'Age', 'Ethnic', 'Occupation']):
    sns.boxplot(data=df2, x='PB', y=category, ax=panel)

In [None]:
# 2 x 2 grid of box plots: PB total against income and social-media related answers.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
usage_columns = ['Annual_Income', 'Social_Media_usage', 'Experience', 'Reviews_and_rating']
for panel, category in zip(axes.flat, usage_columns):
    sns.boxplot(data=df2, x='PB', y=category, ax=panel)

In [None]:
# Two stacked box plots: PB total against weekly internet time (ATI) and attention to ads (ADSM).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))

for panel, category in zip(axes, ['ATI', 'ADSM']):
    sns.boxplot(data=df2, x='PB', y=category, ax=panel)

In [None]:
# Drop PB1..PB4 and the PB total, keeping only the binary target PB-inf.
# The result is bound back to df2 because the following cells build X and y from df2.
# The former `df3 = df2.drop(..., inplace=True)` always left df3 as None, since an in-place
# drop returns nothing, so that misleading assignment is gone.
df2 = df2.drop(columns=['PB1', 'PB2', 'PB3', 'PB4', 'PB'])
print(df2.shape)
df2.head()

In [None]:
# The Theory of Planned Behaviour predicts Purchase Behaviour (PB) from ATTD, SN and PBC only,
# so every demographic / usage column is removed before modelling.
colsToDrop = [
    'Gender', 'Age', 'Ethnic', 'Occupation', 'Annual_Income',
    'Social_Media_usage', 'Experience', 'Reviews_and_rating', 'ATI', 'ADSM',
]
df2 = df2.drop(columns=colsToDrop)

# Target is the HIGH/LOW label; the features are all remaining columns.
X = df2.drop(columns='PB-inf')
y = df2['PB-inf']

# Hold Out Validation

In [None]:
# 80:20 hold-out split, stratified on the target and seeded for repeatability.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=20,
)

# ZeroR Classifier

##### The ZeroR classifier always predicts the majority class; it is used here as a baseline and to check whether the dataset is balanced

In [None]:
# Confusion-matrix heatmap helper
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw ``cm`` as a seaborn heatmap and label the plot.

    Parameters
    ----------
    cm : array-like
        Matrix handed to ``sns.heatmap``. The colour scale is pinned to [0, 1].
        # NOTE(review): presumably a row-normalised confusion matrix - confirm with callers.
    classes : sequence, optional
        Tick labels for both axes. When given, each cell is also annotated with
        its value (font size 50); when omitted, a plain heatmap is drawn.
    title : str
        Title placed above the plot.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ZeroR baseline: always predicts the most frequent class seen during training.
dummy_clf = DummyClassifier(random_state=20, strategy='most_frequent')
dummy_clf.fit(X_train, y_train)

y_pred = dummy_clf.predict(X_test)
baseline_train_pred = dummy_clf.predict(X_train)

print(classification_report(y_test, y_pred))
print("Train Accuracy: ", accuracy_score(y_train, baseline_train_pred))
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
y_pred = dummy_clf.predict(X_test)

# Plotting Confusion Matrix
# labels= pins the row/column order to dummy_clf.classes_, which is also used for the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=dummy_clf.classes_)
# Normalise each row (true class) so it sums to 1.
cm_norm = cm / cm.sum(axis=1, keepdims=True)
plt.figure(figsize = (10,5))
# plot_confusion_matrix fixes its colour scale to [0, 1], so it must receive the normalised
# matrix; the raw counts that were passed before saturate the scale.
plot_confusion_matrix(cm_norm, classes=dummy_clf.classes_, title='Test set confusion')

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K = 5 neighbours, Minkowski distance metric with p = 2
knn_clf = KNeighborsClassifier(metric='minkowski', p=2, n_neighbors=5)
knn_clf.fit(X_train, y_train)

y_pred = knn_clf.predict(X_test)
knn_train_pred = knn_clf.predict(X_train)

# Confusion matrix and per-class report on the held-out test set
skplt.metrics.plot_confusion_matrix(y_test, y_pred);
print(classification_report(y_test, y_pred))
print("Train Accuracy: ", accuracy_score(y_train, knn_train_pred))
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
#plotting ROC curve
y_probas = knn_clf.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for KNN Classifier");

In [None]:
#plotting PRC curve
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for KNN Classifier");

In [None]:
#Plotting learning curve
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(knn_clf, X, y, cv = cv,title = "KNN Classifier");

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Entropy-based tree limited to depth 5.
# random_state is fixed (same seed as the train/test split) so the fitted tree and the
# reported scores are identical on every Restart & Run All.
tree_clf = DecisionTreeClassifier(criterion = 'entropy', max_depth= 5, random_state = 20)
tree_clf.fit(X_train, y_train)
train_prediction = tree_clf.predict(X_train)
test_prediction = tree_clf.predict(X_test)

#plotting confusion matrix for DT
skplt.metrics.plot_confusion_matrix(y_test, test_prediction);
print(classification_report(y_test, test_prediction))
print("Train Accuracy: ", accuracy_score(y_train, train_prediction))
print("Test Accuracy: ", accuracy_score(y_test, test_prediction))

In [None]:
#Plotting ROC curve for DT
y_probas = tree_clf.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for Decision Tree Classifier");

In [None]:
#plotting PRC curve for DT
# Recompute the probabilities here so the curve always belongs to tree_clf, even if another
# classifier's cell overwrote y_probas in between.
y_probas = tree_clf.predict_proba(X_test)
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for Decision Tree Classifier");

In [None]:
#Plotting learning curve for DT
from sklearn.model_selection import ShuffleSplit
# A single shuffled 80/20 split is used as the cross-validation scheme for the curve.
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(tree_clf, X, y, cv = cv,title = "Decision Tree Classifier");

In [None]:
from sklearn import tree
tree.plot_tree(tree_clf)

In [None]:
#Visualization of DT using graphviz
import graphviz
dot_data = tree.export_graphviz(tree_clf, out_file=None, filled = True, rounded = True, special_characters=False)
graph = graphviz.Source(dot_data)
graph

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Logistic regression with C = 0.1, fitted with the newton-cg solver
model = LogisticRegression(solver='newton-cg', C=0.1)
model.fit(X_train, y_train)

pred_test = model.predict(X_test)
pred_train = model.predict(X_train)

# Confusion matrix and per-class report on the held-out test set
skplt.metrics.plot_confusion_matrix(y_test, pred_test);
print(classification_report(y_test, pred_test))
print("Train Accuracy", accuracy_score(y_train, pred_train))
print("Test Accuracy", accuracy_score(y_test, pred_test))

In [None]:
#ploting ROC curve for logistic regression
y_probas = model.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for Logistic Regression");

In [None]:
#ploting PRC for logistic regression
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for Logistic Regression");

In [None]:
#ploting learning curve for logistic regression
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(model, X, y, cv = cv,title = "Logistic Regression");

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Gaussian Naive Bayes with default settings
naive = GaussianNB()
naive.fit(X_train, y_train)

y_pred = naive.predict(X_test)
naive_train_pred = naive.predict(X_train)

# Confusion matrix and per-class report on the held-out test set
skplt.metrics.plot_confusion_matrix(y_test, y_pred);
print(classification_report(y_test, y_pred))
print("Train Accuracy", accuracy_score(y_train, naive_train_pred))
print("Test Accuracy", accuracy_score(y_test, y_pred))

In [None]:
#ploting ROC for Naive Bayes
y_probas = naive.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for Naive Bayes Classifier");

In [None]:
#ploting PRC for Naive Bayes
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for Naive Bayes Classifier");

In [None]:
#ploting learning curve for Naive Bayes
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(naive, X, y, cv = cv,
                      title = "Naive Bayes Classifier");

# Support Vector Machine (SVM) Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Linear-kernel SVM with C = 0.1. probability=True enables predict_proba, which the ROC / PRC
# cells need; its internal calibration is randomised, so random_state is fixed (same seed as
# the train/test split) to make the probabilities repeatable.
svm = SVC(C =0.1, gamma='scale', kernel='linear',probability=True, random_state=20)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

#ploting confusion matrix for SVM
skplt.metrics.plot_confusion_matrix(y_test, y_pred);
print(classification_report(y_test, y_pred))
# accuracy_score takes (y_true, y_pred); the test predictions are reused, not recomputed.
print("Train Accuracy", accuracy_score(y_train, svm.predict(X_train)))
print("Test Accuracy", accuracy_score(y_test, y_pred))

In [None]:
#ploting ROC curve for SVM
y_probas = svm.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for Support Vector Machine (SVM) Classifier");

In [None]:
#ploting PRC curve for SVM
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for Support Vector Machine (SVM) Classifier");

In [None]:
#ploting learning for SVM
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(svm, X, y, cv = cv,
                      title = "Support Vector Machine (SVM) Classifier");

# Ensemble Classifiers

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# 100 gini trees limited to depth 6.
# random_state is fixed (same seed as the train/test split) so the forest and its scores
# are identical on every Restart & Run All.
clf = RandomForestClassifier(criterion='gini', max_depth= 6, n_estimators =100, random_state=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
#ploting confusion matrix for RF
skplt.metrics.plot_confusion_matrix(y_test, y_pred);
print(classification_report(y_test, y_pred))
# accuracy_score takes (y_true, y_pred); the test predictions are reused, not recomputed.
print("Train Accuracy", accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy", accuracy_score(y_test, y_pred))

In [None]:
#ploting ROC curve for RF
y_probas = clf.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for Random Forest Classifier");

In [None]:
#ploting PRC curve for RF
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for Random Forest Classifier");

In [None]:
#ploting Learning curve for RF
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(clf, X, y, cv = cv,title = "Random Forest Classifier");

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# 50 boosting rounds with learning rate 0.1.
# random_state is fixed (same seed as the train/test split) so the ensemble and its scores
# are identical on every Restart & Run All.
Ada_clf = AdaBoostClassifier(learning_rate=0.1, n_estimators=50, random_state=20)
Ada_clf.fit(X_train, y_train)
y_pred = Ada_clf.predict(X_test)

#ploting confusion matrix for AdaBoost
skplt.metrics.plot_confusion_matrix(y_test, y_pred);
print(classification_report(y_test, y_pred))
# accuracy_score takes (y_true, y_pred); the test predictions are reused, not recomputed.
print("Train Accuracy", accuracy_score(y_train, Ada_clf.predict(X_train)))
print("Test Accuracy", accuracy_score(y_test, y_pred))

In [None]:
#plotting ROC curve for AdaBoost
y_probas = Ada_clf.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = True,
                      title = "ROC Curve for AdaBoost Classifier");

In [None]:
#plotting PRC curve for AdaBoost
# Recompute the probabilities here so the curve always belongs to Ada_clf, even if another
# classifier's cell overwrote y_probas in between.
y_probas = Ada_clf.predict_proba(X_test)
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (8,6), plot_micro = False,
                                   title = "Precision-Recall Curve for AdaBoost Classifier");

In [None]:
#plotting learning curve for AdaBoost
from sklearn.model_selection import ShuffleSplit
# A single shuffled 80/20 split is used as the cross-validation scheme for the curve.
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
skplt.estimators.plot_learning_curve(Ada_clf, X, y, cv = cv,title = "AdaBoost Classifier");

# Gradient Boosting Classifier

In [None]:

# NOTE(review): make_hastie_10_2 is not used in this cell; the import is kept in case a
# later cell relies on it.
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# scikit-learn gradient boosting (despite the variable name, this is not XGBoost):
# 200 stages of depth-2 trees with learning rate 1.0.
# random_state is fixed (same seed as the train/test split) so the ensemble and its scores
# are identical on every Restart & Run All.
xgb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=2, random_state=20)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

#plotting confusion matrix for GB
skplt.metrics.plot_confusion_matrix(y_test, y_pred,title = "Gradient Boosting Classifier (Considering TPB Factors: ATTD, SN, PBC)");
print(classification_report(y_test, y_pred))
# accuracy_score takes (y_true, y_pred); the test predictions are reused, not recomputed.
print("Train Accuracy", accuracy_score(y_train, xgb_clf.predict(X_train)))
print("Test Accuracy", accuracy_score(y_test, y_pred))

In [None]:
#plotting ROC curve for GB
y_probas = xgb_clf.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y_probas, figsize = (8,6), plot_micro = False, plot_macro = False,
                      title = "Gradient Boosting Classifier (Considering TPB Factors: ATTD, SN, PBC)");

In [None]:
#plotting PRC curve for GB
skplt.metrics.plot_precision_recall(y_test, y_probas, figsize = (10,8), plot_micro = False,
                                   title = "Gradient Boosting Classifier (Considering TPB Factors: ATTD, SN, PBC)");

In [None]:
#plotting learning curve for GB
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=10)
skplt.estimators.plot_learning_curve(xgb_clf, X, y, cv = cv,title = "Gradient Boosting Classifier (Considering TPB Factors: ATTD, SN, PBC)");