# Installing dependencies

In [None]:
!pip install scikit-plot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


# Importing libraries

In [None]:
import math
import pandas as pd #dataframe
import numpy as np #mathematical computations
import matplotlib.pyplot as plt #visualization
import matplotlib
import joblib
import seaborn as sns #visualization
import json
import pickle #saving the model
import scikitplot as skplt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split #Splitting the dataset into training and testing
from sklearn.model_selection import ShuffleSplit #Random shuffling
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from statistics import stdev
from warnings import simplefilter

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings raised by later cells - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


# Importing dataset

In [None]:
# Read the .xlsx workbook into the raw dataframe
dataset_path = "Data v1.0.xlsx"
df1 = pd.read_excel(dataset_path)

# Exploratory Data Analysis (EDA)

In [None]:
# The ID column is not needed for the analysis, so the working frame excludes it
df2 = df1.drop(columns=['ID'])
df2.head()

Unnamed: 0,Gender,Age,Ethnic,Occupation,Annual Income,Social Media usage,Experience of purchasing due to social media influence,Reviews and rating impact on purchasing decision,Average time spent on the Internet (weekly),Attention to advertisement on social media,...,Attitude 3 (ATTD3),Attitude 4 (ATTD4),Social Norms 1 (SN1),Social Norms 2 (SN2),Social Norms 3 (SN3),Social Norms 4 (SN4),Perceived Behavioural Control 1 (PBC1),Perceived Behavioural Control 2 (PBC2),Perceived Behavioural Control 3 (PBC3),Perceived Behavioural Control 4 (PBC4)
0,Female,29 - 34 Years Old,Chinese,Homemaker,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral
1,Male,23 - 28 Years Old,Chinese,Student,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,Agree,Neutral,Strongly disagree,Disagree,Agree,Neutral,Neutral,Agree,Disagree,Disagree
2,Female,29 - 34 Years Old,Chinese,Homemaker,"Less than RM30,000",Yes,Yes,Yes,5 hours – 10 hours,No,...,Strongly disagree,Disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree,Strongly disagree
3,Male,35 - 40 Years Old,Malay,Businessman,"More than RM90,001",Yes,Yes,No,1 hour – 4 hours,Yes,...,Agree,Neutral,Neutral,Neutral,Agree,Agree,Neutral,Agree,Neutral,Agree
4,Female,23 - 28 Years Old,Chinese,Employee,"RM70,001-RM90,000",Yes,Yes,Yes,10 hours – 20 hours,No,...,Agree,Agree,Neutral,Disagree,Neutral,Neutral,Neutral,Agree,Agree,Agree


In [None]:
# Shorten the verbose column headers to compact attribute names.
# Mapping: original header -> short name used by the rest of the notebook.
column_renames = {
    'Annual Income': 'Annual_Income',
    'Social Media usage': 'Social_Media_usage',
    'Experience of purchasing due to social media influence': 'Experience',
    'Reviews and rating impact on purchasing decision': 'Reviews_and_rating',
    'Average time spent on the Internet (weekly)': 'ATI',
    'Attention to advertisement on social media': 'ADSM',
    'Purchase Behaviour 1 (PB1)': 'PB1',
    'Purchase Behaviour 2 (PB2)': 'PB2',
    'Purchase Behaviour 3 (PB3)': 'PB3',
    'Purchase Behaviour 4 (PB4)': 'PB4',
    'Attitude 1 (ATTD1)': 'ATTD1',
    'Attitude 2 (ATTD2)': 'ATTD2',
    'Attitude 3 (ATTD3)': 'ATTD3',
    'Attitude 4 (ATTD4)': 'ATTD4',
    'Social Norms 1 (SN1)': 'SN1',
    'Social Norms 2 (SN2)': 'SN2',
    'Social Norms 3 (SN3)': 'SN3',
    'Social Norms 4 (SN4)': 'SN4',
    'Perceived Behavioural Control 1 (PBC1)': 'PBC1',
    'Perceived Behavioural Control 2 (PBC2)': 'PBC2',
    'Perceived Behavioural Control 3 (PBC3)': 'PBC3',
    'Perceived Behavioural Control 4 (PBC4)': 'PBC4',
}
# errors='raise' makes a missing or misspelled source header fail loudly
df2 = df2.rename(columns=column_renames, errors='raise')

# DATA PREPROCESSING

In [None]:
# Encode the Likert answers of PB, ATTD, SN and PBC as numbers (1 = lowest, 5 = highest).
# Only cells that equal a key exactly are replaced, in every column of the frame.
likert_scores = {
    'Strongly agree': 5,
    'Agree': 4,
    'Neutral': 3,
    'Disagree': 2,
    'Strongly disagree': 1,
}
df2 = df2.replace(likert_scores)

In [None]:
# Combine the four purchase-behaviour items into one total score, PB
pb_items = ['PB1', 'PB2', 'PB3', 'PB4']
pb_total = df2[pb_items[0]]
for pb_item in pb_items[1:]:
    pb_total = pb_total + df2[pb_item]
df2['PB'] = pb_total

In [None]:
# Divide the PB target into two classes and store them as the new target attribute PB-inf:
# a total of 10 or less is labelled 'LOW', anything else 'HIGH'.
rating = ['LOW' if score <= 10 else 'HIGH' for score in df2['PB']]
df2['PB-inf'] = rating

In [None]:
# Preview the first rows; the frame now carries the summed PB score and the PB-inf label
df2.head()

Unnamed: 0,Gender,Age,Ethnic,Occupation,Annual_Income,Social_Media_usage,Experience,Reviews_and_rating,ATI,ADSM,...,SN1,SN2,SN3,SN4,PBC1,PBC2,PBC3,PBC4,PB,PB-inf
0,Female,29 - 34 Years Old,Chinese,Homemaker,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,3,3,3,3,3,3,3,3,12,HIGH
1,Male,23 - 28 Years Old,Chinese,Student,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,1,2,4,3,3,4,2,2,15,HIGH
2,Female,29 - 34 Years Old,Chinese,Homemaker,"Less than RM30,000",Yes,Yes,Yes,5 hours – 10 hours,No,...,1,1,1,1,1,1,1,1,5,LOW
3,Male,35 - 40 Years Old,Malay,Businessman,"More than RM90,001",Yes,Yes,No,1 hour – 4 hours,Yes,...,3,3,4,4,3,4,3,4,15,HIGH
4,Female,23 - 28 Years Old,Chinese,Employee,"RM70,001-RM90,000",Yes,Yes,Yes,10 hours – 20 hours,No,...,3,2,3,3,3,4,4,4,16,HIGH


In [None]:
#Dropping PB1,PB2,PB3,PB4,PB and keeping PB-inf instead.
# The PB items define the PB-inf label, so they must not stay among the columns.
# drop(..., inplace=True) returns None, so the result of a non-inplace drop is
# assigned instead; df2 is rebound because the following cells keep working on df2.
df2 = df2.drop(columns = ['PB1','PB2','PB3','PB4','PB'])
# Independent snapshot of the reduced frame, so df3 holds a DataFrame rather than None
df3 = df2.copy()
print(df2.shape)
df2.head()

(219, 23)


Unnamed: 0,Gender,Age,Ethnic,Occupation,Annual_Income,Social_Media_usage,Experience,Reviews_and_rating,ATI,ADSM,...,ATTD4,SN1,SN2,SN3,SN4,PBC1,PBC2,PBC3,PBC4,PB-inf
0,Female,29 - 34 Years Old,Chinese,Homemaker,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,3,3,3,3,3,3,3,3,3,HIGH
1,Male,23 - 28 Years Old,Chinese,Student,"RM50,001-RM70,000",Yes,Yes,Yes,More than 40 hours,Yes,...,3,1,2,4,3,3,4,2,2,HIGH
2,Female,29 - 34 Years Old,Chinese,Homemaker,"Less than RM30,000",Yes,Yes,Yes,5 hours – 10 hours,No,...,2,1,1,1,1,1,1,1,1,LOW
3,Male,35 - 40 Years Old,Malay,Businessman,"More than RM90,001",Yes,Yes,No,1 hour – 4 hours,Yes,...,3,3,3,4,4,3,4,3,4,HIGH
4,Female,23 - 28 Years Old,Chinese,Employee,"RM70,001-RM90,000",Yes,Yes,Yes,10 hours – 20 hours,No,...,4,3,2,3,3,3,4,4,4,HIGH


In [None]:
# Theory of Planned Behaviour: Purchase Behaviour (PB) is predicted from ATTD, SN and PBC,
# so every other column is removed before the features are built.
colsToDrop = [
    'Gender', 'Age', 'Ethnic', 'Occupation', 'Annual_Income',
    'Social_Media_usage', 'Experience', 'Reviews_and_rating', 'ATI', 'ADSM',
]
df2 = df2.drop(columns=colsToDrop)
# Features: every remaining column except the label. Label: PB-inf.
X = df2.drop(columns=['PB-inf'])
y = df2['PB-inf']

In [None]:
# Preview the modelling frame: only the ATTD, SN and PBC items plus the PB-inf label remain
df2.head()

Unnamed: 0,ATTD1,ATTD2,ATTD3,ATTD4,SN1,SN2,SN3,SN4,PBC1,PBC2,PBC3,PBC4,PB-inf
0,3,3,3,3,3,3,3,3,3,3,3,3,HIGH
1,4,3,4,3,1,2,4,3,3,4,2,2,HIGH
2,3,1,1,2,1,1,1,1,1,1,1,1,LOW
3,4,4,4,3,3,3,4,4,3,4,3,4,HIGH
4,3,4,4,4,3,2,3,3,3,4,4,4,HIGH


# Hold Out Validation

In [None]:
# Hold-out split, 80% train / 20% test.
# Stratifying on y keeps the same HIGH/LOW proportion in both parts;
# the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.2, random_state=20, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Model

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [None]:
# Initializing classifiers with default parameters.
# Every estimator that draws random numbers gets a fixed seed, so re-running the
# grid search on a fresh kernel selects the same models and reports the same scores.
# The seed reuses the value passed as random_state to train_test_split.
RANDOM_STATE = 20
classifiers = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    # The seed only matters for the data-shuffling solvers (sag, saga, liblinear) tried in the grid
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Hyperparameter search space for each classifier, keyed by the same names as `classifiers`.
# Grids shared by several models are named once and reused.
depth_grid = np.arange(1, 11)            # depths 1..10
estimator_counts = [10, 50, 100, 200]    # number of estimators in an ensemble
learning_rates = [0.001, 0.01, 0.1, 1]   # boosting learning rates
split_criteria = ['gini', 'entropy']

param_grids = {
    'KNN': {'n_neighbors': np.arange(1, 11)},
    'Decision Tree': {'max_depth': depth_grid, 'criterion': split_criteria},
    'Random Forest': {
        'n_estimators': estimator_counts,
        'max_depth': depth_grid,
        'criterion': split_criteria,
    },
    'Logistic Regression': {
        'C': np.logspace(-4, 4, 9),
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    # Nothing to tune: an empty grid evaluates the default model only
    'Naive Bayes': {},
    'SVM': {
        'C': np.logspace(-3, 3, 7),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    },
    'AdaBoost': {'n_estimators': estimator_counts, 'learning_rate': learning_rates},
    'Gradient Boosting': {
        'n_estimators': estimator_counts,
        'learning_rate': learning_rates,
        'max_depth': depth_grid,
    },
}

# Tune every classifier with an exhaustive grid search (5-fold CV, accuracy) on the
# training split, keep the best estimator of each search and report its result.
best_classifiers = {}
for classifier_name in classifiers:
    classifier = classifiers[classifier_name]
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grids[classifier_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train, y_train)
    best_classifiers[classifier_name] = grid_search.best_estimator_
    print(f"{classifier_name}: Best parameters - {grid_search.best_params_}, Best score - {grid_search.best_score_}")


KNN: Best parameters - {'n_neighbors': 5}, Best score - 0.9485714285714286
Decision Tree: Best parameters - {'criterion': 'entropy', 'max_depth': 5}, Best score - 0.9428571428571428
Random Forest: Best parameters - {'criterion': 'gini', 'max_depth': 6, 'n_estimators': 100}, Best score - 0.9485714285714286
Logistic Regression: Best parameters - {'C': 0.1, 'solver': 'newton-cg'}, Best score - 0.9485714285714286
Naive Bayes: Best parameters - {}, Best score - 0.9200000000000002
SVM: Best parameters - {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}, Best score - 0.9485714285714286
AdaBoost: Best parameters - {'learning_rate': 0.1, 'n_estimators': 50}, Best score - 0.9485714285714286
Gradient Boosting: Best parameters - {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 200}, Best score - 0.9542857142857143
