# Experiment 3-2: Comparative Performance Analysis with Existing Classifiers (with Class Imbalance Handling)

In [1]:
# Importing necessary libraries and packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_functions.RepetitiveTasks import imputation, remove_class, encoding, separating, oversampling, classification_evaluation

# 2. Car Evaluation Dataset

In [2]:
# Loading the dataset
df = pd.read_csv("datasets/car.csv")

In [3]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1728, 7)

In [5]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [6]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [7]:
# Encoding the attributes of the dataset
df = encoding(df)

In [8]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [9]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [10]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [11]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [13]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.992769,0.992769,0.992769,0.992769,0.995234,0.990356,0.022302
1,K-Nearest Neighbours,0.924587,0.924587,0.924587,0.924587,0.979563,0.901967,0.076127
2,Support Vector Machine,0.990702,0.990702,0.990702,0.990702,0.999832,0.987626,1.158211
3,Naive Bayes Classifier,0.86157,0.86157,0.86157,0.86157,0.975699,0.817368,0.018624
4,Logistic Regression,0.57438,0.57438,0.57438,0.57438,0.777669,0.43764,0.069787
5,Multi Layer Perceptron,0.996901,0.996901,0.996901,0.996901,0.999972,0.995867,4.677628
6,AdaBoost Classifier,0.80062,0.80062,0.80062,0.80062,0.888218,0.744514,0.341789
7,Random Forest,0.993802,0.993802,0.993802,0.993802,0.999914,0.991744,0.611062
8,Gradient Boosting,0.979339,0.979339,0.979339,0.979339,0.999593,0.97258,2.468991
9,Extra Trees,0.990702,0.990702,0.990702,0.990702,0.999838,0.9876,0.593945


In [14]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[234   1   1   0]
 [  2 225   0   0]
 [  3   0 264   0]
 [  0   0   0 238]]
K-Nearest Neighbours :
[[223   6   4   3]
 [  2 222   0   3]
 [ 42   8 212   5]
 [  0   0   0 238]]
Support Vector Machine :
[[234   2   0   0]
 [  0 226   0   1]
 [  5   1 261   0]
 [  0   0   0 238]]
Naive Bayes Classifier :
[[161  32  22  21]
 [  0 216   0  11]
 [ 34   1 232   0]
 [  0  13   0 225]]
Logistic Regression :
[[ 42  87  52  55]
 [ 51 133  42   1]
 [ 56  52 143  16]
 [  0   0   0 238]]
Multi Layer Perceptron :
[[235   0   0   1]
 [  0 226   0   1]
 [  0   1 266   0]
 [  0   0   0 238]]
AdaBoost Classifier :
[[ 93  24 106  13]
 [  0 217   0  10]
 [ 20  20 227   0]
 [  0   0   0 238]]
Random Forest :
[[235   1   0   0]
 [  2 225   0   0]
 [  3   0 264   0]
 [  0   0   0 238]]
Gradient Boosting :
[[233   2   0   1]
 [  0 217   0  10]
 [  6   1 260   0]
 [  0   0   0 238]]
Extra Trees :
[[233   1   2   0]
 [  4 223   0   0]
 [  2   0 265   0]
 [  0   0   0 238]]
XGBoost :
[[234   2   

# 3. Breast Cancer Dataset

In [15]:
# Loading the dataset
df = pd.read_csv("datasets/breast-cancer.csv")

In [16]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [17]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(286, 10)

In [18]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [19]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: class, dtype: int64

In [20]:
# Encoding the attributes of the dataset
df = encoding(df)

In [21]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [22]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [23]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [24]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [26]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.703704,0.703704,0.703704,0.703704,0.702778,0.403814,0.010436
1,K-Nearest Neighbours,0.753086,0.753086,0.753086,0.753086,0.758333,0.51382,0.012753
2,Support Vector Machine,0.641975,0.641975,0.641975,0.641975,0.65,0.3,0.029459
3,Naive Bayes Classifier,0.679012,0.679012,0.679012,0.679012,0.677778,0.354029,0.008999
4,Logistic Regression,0.679012,0.679012,0.679012,0.679012,0.680556,0.358902,0.011757
5,Multi Layer Perceptron,0.765432,0.765432,0.765432,0.765432,0.769444,0.535593,16.869735
6,AdaBoost Classifier,0.641975,0.641975,0.641975,0.641975,0.644444,0.287122,0.111223
7,Random Forest,0.777778,0.777778,0.777778,0.777778,0.772222,0.548179,0.190617
8,Gradient Boosting,0.716049,0.716049,0.716049,0.716049,0.716667,0.430946,0.07253
9,Extra Trees,0.802469,0.802469,0.802469,0.802469,0.794444,0.598574,0.112906


In [27]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[32 13]
 [11 25]]
K-Nearest Neighbours :
[[32 13]
 [ 7 29]]
Support Vector Machine :
[[26 19]
 [10 26]]
Naive Bayes Classifier :
[[31 14]
 [12 24]]
Logistic Regression :
[[30 15]
 [11 25]]
Multi Layer Perceptron :
[[33 12]
 [ 7 29]]
AdaBoost Classifier :
[[28 17]
 [12 24]]
Random Forest :
[[37  8]
 [10 26]]
Gradient Boosting :
[[32 13]
 [10 26]]
Extra Trees :
[[39  6]
 [10 26]]
Custom Naive Bayes Classifier :
[[31 14]
 [10 26]]
Count-Based Classifier :
[[31 14]
 [ 7 29]]


# 5. Tic-Tac-Toe Endgame Dataset

In [28]:
# Loading the dataset
df = pd.read_csv("datasets/tic-tac-toe.csv")

In [29]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [30]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(958, 10)

In [31]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [32]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

positive    626
negative    332
Name: class, dtype: int64

In [33]:
# Encoding the attributes of the dataset
df = encoding(df)

In [34]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [35]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [36]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [37]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [39]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.904382,0.904382,0.904382,0.904382,0.904413,0.808877,0.011615
1,K-Nearest Neighbours,0.820717,0.820717,0.820717,0.820717,0.820413,0.648699,0.026632
2,Support Vector Machine,0.944223,0.944223,0.944223,0.944223,0.944286,0.88891,0.180446
3,Naive Bayes Classifier,0.685259,0.685259,0.685259,0.685259,0.685302,0.370674,0.008218
4,Logistic Regression,0.61753,0.61753,0.61753,0.61753,0.617714,0.236421,0.008029
5,Multi Layer Perceptron,0.788845,0.788845,0.788845,0.788845,0.788857,0.577714,3.384473
6,AdaBoost Classifier,0.749004,0.749004,0.749004,0.749004,0.749079,0.498475,0.124639
7,Random Forest,0.976096,0.976096,0.976096,0.976096,0.976063,0.952308,0.245389
8,Gradient Boosting,0.936255,0.936255,0.936255,0.936255,0.936222,0.872611,0.121853
9,Extra Trees,0.964143,0.964143,0.964143,0.964143,0.964095,0.928544,0.141149


In [40]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[114  11]
 [ 13 113]]
K-Nearest Neighbours :
[[ 93  32]
 [ 13 113]]
Support Vector Machine :
[[120   5]
 [  9 117]]
Naive Bayes Classifier :
[[87 38]
 [41 85]]
Logistic Regression :
[[83 42]
 [54 72]]
Multi Layer Perceptron :
[[99 26]
 [27 99]]
AdaBoost Classifier :
[[96 29]
 [34 92]]
Random Forest :
[[121   4]
 [  2 124]]
Gradient Boosting :
[[116   9]
 [  7 119]]
Extra Trees :
[[119   6]
 [  3 123]]
Custom Naive Bayes Classifier :
[[87 38]
 [41 85]]
Count-Based Classifier :
[[92 33]
 [48 78]]


# 7. Soybean (Large) Dataset

In [41]:
# Loading the dataset
df = pd.read_csv("datasets/soybean-large.csv")

In [42]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [43]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(307, 36)

In [44]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [45]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

frog-eye-leaf-spot             40
phytophthora-rot               40
alternarialeaf-spot            40
brown-spot                     40
brown-stem-rot                 20
anthracnose                    20
diaporthe-stem-canker          10
purple-seed-stain              10
phyllosticta-leaf-spot         10
bacterial-pustule              10
charcoal-rot                   10
bacterial-blight               10
downy-mildew                   10
powdery-mildew                 10
rhizoctonia-root-rot           10
diaporthe-pod-&-stem-blight     6
cyst-nematode                   6
herbicide-injury                4
2-4-d-injury                    1
Name: class, dtype: int64

In [46]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [47]:
# Encoding the attributes of the dataset
df = encoding(df)

In [48]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [49]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [50]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [51]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [53]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.948529,0.948529,0.948529,0.948529,0.973381,0.945187,0.014381
1,K-Nearest Neighbours,0.919118,0.919118,0.919118,0.919118,0.994954,0.9144,0.053505
2,Support Vector Machine,0.955882,0.955882,0.955882,0.955882,0.998673,0.953072,0.12697
3,Naive Bayes Classifier,0.963235,0.963235,0.963235,0.963235,0.998555,0.960749,0.02875
4,Logistic Regression,0.926471,0.926471,0.926471,0.926471,0.998918,0.922116,2.145484
5,Multi Layer Perceptron,0.955882,0.955882,0.955882,0.955882,0.998844,0.953218,7.003751
6,AdaBoost Classifier,0.308824,0.308824,0.308824,0.308824,0.800248,0.36097,0.140337
7,Random Forest,0.977941,0.977941,0.977941,0.977941,0.999438,0.976588,0.176299
8,Gradient Boosting,0.963235,0.963235,0.963235,0.963235,0.999289,0.96141,1.351196
9,Extra Trees,0.970588,0.970588,0.970588,0.970588,0.999245,0.968665,0.067432


In [54]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 6  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  0]
 [ 0 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 10  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  8  0  0  0  0  0  0]
 [ 1  0  0  0  1  0  0  0  0  0  0  6  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  5  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  8  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  7]]
K-Nearest Neighbours :
[[ 8  0  0  0  0  0  0  0  0  0  0  0  1  

# 11. Molecular Biology (Splice-junction Gene Sequences) Dataset

In [55]:
# Loading the dataset
df = pd.read_csv("datasets/splice.csv")

In [56]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [57]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace(' ',''))
attributes = [str(i) for i in range(0,62)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','61'],axis=1,inplace=True)

In [58]:
# Dropping unwanted columns
df.drop(['instance_name'], axis=1, inplace=True)

In [59]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,EI,C,C,A,G,C,T,G,C,A,...,A,G,C,C,A,G,T,C,T,G
1,EI,A,G,A,C,C,C,G,C,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,G,A,G,G,T,G,A,A,G,...,C,A,C,G,G,G,G,A,T,G
3,EI,G,G,G,C,T,G,C,G,T,...,G,G,T,T,T,T,C,C,C,C
4,EI,G,C,T,C,A,G,C,C,C,...,C,C,T,T,G,A,C,C,C,T


In [60]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3190, 61)

In [61]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [62]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

N     1655
IE     768
EI     767
Name: class, dtype: int64

In [63]:
# Encoding the attributes of the dataset
df = encoding(df)

In [64]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [65]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [66]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [67]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [69]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.930514,0.930514,0.930514,0.930514,0.947684,0.896006,0.040097
1,K-Nearest Neighbours,0.760322,0.760322,0.760322,0.760322,0.930917,0.672766,0.043736
2,Support Vector Machine,0.908359,0.908359,0.908359,0.908359,0.981086,0.862695,3.103941
3,Naive Bayes Classifier,0.962739,0.962739,0.962739,0.962739,0.996314,0.944153,0.014719
4,Logistic Regression,0.873112,0.873112,0.873112,0.873112,0.962698,0.810244,0.215924
5,Multi Layer Perceptron,0.90433,0.90433,0.90433,0.90433,0.980116,0.857137,6.931376
6,AdaBoost Classifier,0.929507,0.929507,0.929507,0.929507,0.969244,0.894383,0.47678
7,Random Forest,0.964753,0.964753,0.964753,0.964753,0.995876,0.947404,0.501013
8,Gradient Boosting,0.976838,0.976838,0.976838,0.976838,0.997816,0.965437,2.979753
9,Extra Trees,0.975831,0.975831,0.975831,0.975831,0.997278,0.963811,0.393025


In [70]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[323   8   5]
 [  7 311  14]
 [ 12  23 290]]
K-Nearest Neighbours :
[[330   6   0]
 [ 33 293   6]
 [140  53 132]]
Support Vector Machine :
[[320   5  11]
 [  8 303  21]
 [ 17  29 279]]
Naive Bayes Classifier :
[[322   7   7]
 [  2 320  10]
 [  7   4 314]]
Logistic Regression :
[[304  13  19]
 [ 19 298  15]
 [ 21  39 265]]
Multi Layer Perceptron :
[[315  13   8]
 [  7 307  18]
 [ 14  35 276]]
AdaBoost Classifier :
[[319   8   9]
 [ 10 310  12]
 [ 18  13 294]]
Random Forest :
[[331   3   2]
 [  2 325   5]
 [  8  15 302]]
Gradient Boosting :
[[334   1   1]
 [  3 327   2]
 [  7   9 309]]
Extra Trees :
[[331   4   1]
 [  2 326   4]
 [  6   7 312]]
XGBoost :
[[333   2   1]
 [  2 328   2]
 [  7   9 309]]
Custom Naive Bayes Classifier :
[[322   7   7]
 [  2 320  10]
 [  7   4 314]]
Count-Based Classifier :
[[325  10   1]
 [  3 329   0]
 [108 119  98]]


# 13. Primary Tumor Dataset

In [71]:
# Loading the dataset
df = pd.read_csv("datasets/primary-tumor.csv")

In [72]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,?,3,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,?,3,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2


In [73]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(339, 18)

In [74]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [75]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1     84
5     39
18    29
11    28
14    24
22    24
2     20
12    16
7     14
4     14
17    10
3      9
13     7
8      6
19     6
10     2
15     2
20     2
6      1
16     1
21     1
Name: class, dtype: int64

In [76]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [77]:
# Encoding the attributes of the dataset
df = encoding(df)

In [78]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [79]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [80]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [81]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [82]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [83]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.809524,0.809524,0.809524,0.809524,0.913762,0.797341,0.010627
1,K-Nearest Neighbours,0.81746,0.81746,0.81746,0.81746,0.95357,0.805914,0.021587
2,Support Vector Machine,0.845238,0.845238,0.845238,0.845238,0.98132,0.835927,0.141944
3,Naive Bayes Classifier,0.742063,0.742063,0.742063,0.742063,0.969233,0.72605,0.015802
4,Logistic Regression,0.761905,0.761905,0.761905,0.761905,0.976251,0.747401,0.152891
5,Multi Layer Perceptron,0.837302,0.837302,0.837302,0.837302,0.981589,0.826957,4.608114
6,AdaBoost Classifier,0.305556,0.305556,0.305556,0.305556,0.741332,0.287365,0.153721
7,Random Forest,0.833333,0.833333,0.833333,0.833333,0.973056,0.822988,0.196105
8,Gradient Boosting,0.829365,0.829365,0.829365,0.829365,0.977693,0.818602,1.151341
9,Extra Trees,0.829365,0.829365,0.829365,0.829365,0.956435,0.818551,0.089785


In [84]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 7  0  0  1  2  0  0  1  2  1  0  1  1  0  0]
 [ 1 16  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 16  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0 16  0  0  0  0  0  0  0  0  0  0  1]
 [ 1  0  0  0  7  3  0  0  0  1  0  1  2  0  2]
 [ 0  0  0  0  0 16  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  1  0  0 13  1  0  0  0  1  0  0]
 [ 1  0  0  0  0  0  0  0 16  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0 13  2  0  0  0  0]
 [ 0  0  0  1  2  0  0  0  0  0  6  8  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  1 14  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0 15  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0 16  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  0  0  0 16]]
K-Nearest Neighbours :
[[ 6  0  1  2  1  0  1  2  1  0  0  1  1  0  0]
 [ 0 16  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 16  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 1  0  0 16  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  9  2  1  0  0  0  0  0  3  0  2]


# 14. Chess (King-Rook vs. King-Pawn) Dataset

In [85]:
# Loading the dataset
df = pd.read_csv("datasets/kr-vs-kp.csv")

In [86]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [87]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3196, 37)

In [88]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [89]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

won      1669
nowin    1527
Name: class, dtype: int64

In [90]:
# Encoding the attributes of the dataset
df = encoding(df)

In [91]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [92]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [93]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [94]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [96]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.997006,0.997006,0.997006,0.997006,0.996942,0.994026,0.0085
1,K-Nearest Neighbours,0.958084,0.958084,0.958084,0.958084,0.957877,0.916247,0.031604
2,Support Vector Machine,0.961078,0.961078,0.961078,0.961078,0.961124,0.922148,0.75005
3,Naive Bayes Classifier,0.868263,0.868263,0.868263,0.868263,0.868331,0.73653,0.012331
4,Logistic Regression,0.952096,0.952096,0.952096,0.952096,0.9522,0.904238,0.024847
5,Multi Layer Perceptron,0.995509,0.995509,0.995509,0.995509,0.995476,0.991018,3.652279
6,AdaBoost Classifier,0.958084,0.958084,0.958084,0.958084,0.958003,0.916137,0.199877
7,Random Forest,0.980539,0.980539,0.980539,0.980539,0.980373,0.961156,0.261581
8,Gradient Boosting,0.964072,0.964072,0.964072,0.964072,0.964182,0.928197,0.278006
9,Extra Trees,0.989521,0.989521,0.989521,0.989521,0.989359,0.979136,0.183178


In [97]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[325   2]
 [  0 341]]
K-Nearest Neighbours :
[[310  17]
 [ 11 330]]
Support Vector Machine :
[[315  12]
 [ 14 327]]
Naive Bayes Classifier :
[[285  42]
 [ 46 295]]
Logistic Regression :
[[313  14]
 [ 18 323]]
Multi Layer Perceptron :
[[325   2]
 [  1 340]]
AdaBoost Classifier :
[[312  15]
 [ 13 328]]
Random Forest :
[[318   9]
 [  4 337]]
Gradient Boosting :
[[317  10]
 [ 14 327]]
Extra Trees :
[[321   6]
 [  1 340]]
Custom Naive Bayes Classifier :
[[285  42]
 [ 46 295]]
Count-Based Classifier :
[[276  51]
 [ 56 285]]


# 15. Lymphography Dataset

In [98]:
# Loading the dataset
df = pd.read_csv("datasets/lymphography.csv")

In [99]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,changes in stru,special forms,dislocation of,exclusion of no,no. of nodes in
0,3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
1,2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
2,3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
4,2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


In [100]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(148, 19)

In [101]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [102]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

2    81
3    61
4     4
1     2
Name: class, dtype: int64

In [103]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [104]:
# Encoding the attributes of the dataset
df = encoding(df)

In [105]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [106]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [107]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [108]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [110]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.787879,0.787879,0.787879,0.787879,0.786765,0.57565,0.005539
1,K-Nearest Neighbours,0.848485,0.848485,0.848485,0.848485,0.851103,0.710081,0.012168
2,Support Vector Machine,0.909091,0.909091,0.909091,0.909091,0.908088,0.819194,0.007246
3,Naive Bayes Classifier,0.848485,0.848485,0.848485,0.848485,0.847426,0.697422,0.005673
4,Logistic Regression,0.848485,0.848485,0.848485,0.848485,0.851103,0.710081,0.008228
5,Multi Layer Perceptron,0.878788,0.878788,0.878788,0.878788,0.882353,0.782154,0.36116
6,AdaBoost Classifier,0.848485,0.848485,0.848485,0.848485,0.849265,0.698529,0.085476
7,Random Forest,0.848485,0.848485,0.848485,0.848485,0.847426,0.697422,0.152388
8,Gradient Boosting,0.848485,0.848485,0.848485,0.848485,0.847426,0.697422,0.067237
9,Extra Trees,0.909091,0.909091,0.909091,0.909091,0.909926,0.819853,0.094582


In [111]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[14  3]
 [ 4 12]]
K-Nearest Neighbours :
[[13  4]
 [ 1 15]]
Support Vector Machine :
[[16  1]
 [ 2 14]]
Naive Bayes Classifier :
[[15  2]
 [ 3 13]]
Logistic Regression :
[[13  4]
 [ 1 15]]
Multi Layer Perceptron :
[[13  4]
 [ 0 16]]
AdaBoost Classifier :
[[14  3]
 [ 2 14]]
Random Forest :
[[15  2]
 [ 3 13]]
Gradient Boosting :
[[15  2]
 [ 3 13]]
Extra Trees :
[[15  2]
 [ 1 15]]
Custom Naive Bayes Classifier :
[[15  2]
 [ 3 13]]
Count-Based Classifier :
[[13  4]
 [ 0 16]]


# 16. Connect-4 Dataset

In [112]:
# Loading the dataset
df = pd.read_csv("datasets/connect-4.csv")

In [113]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,...,f4,f5,f6,g1,g2,g3,g4,g5,g6,class
0,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
2,b,b,b,b,b,b,o,b,b,b,...,b,b,b,b,b,b,b,b,b,win
3,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
4,o,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win


In [114]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(67557, 43)

In [115]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [116]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

win     44473
loss    16635
draw     6449
Name: class, dtype: int64

In [117]:
# Encoding the attributes of the dataset
df = encoding(df)

In [118]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [119]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [120]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [121]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [122]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [123]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.867748,0.867748,0.867748,0.867748,0.905884,0.802191,0.387777
1,K-Nearest Neighbours,0.814121,0.814121,0.814121,0.814121,0.934873,0.726869,12.239253
2,Support Vector Machine,0.797744,0.797744,0.797744,0.797744,0.930984,0.69888,4267.567752
3,Naive Bayes Classifier,0.574951,0.574951,0.574951,0.574951,0.76049,0.364797,0.21302
4,Logistic Regression,0.589604,0.589604,0.589604,0.589604,0.756491,0.397109,24.780774
5,Multi Layer Perceptron,0.810336,0.810336,0.810336,0.810336,0.93888,0.717054,1100.079371
6,AdaBoost Classifier,0.617224,0.617224,0.617224,0.617224,0.792056,0.428338,3.363841
7,Random Forest,0.909571,0.909571,0.909571,0.909571,0.983054,0.86436,8.660888
8,Gradient Boosting,0.688952,0.688952,0.688952,0.688952,0.855229,0.534041,31.347379
9,Extra Trees,0.912832,0.912832,0.912832,0.912832,0.983639,0.869465,11.245262


In [124]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[8025  383  420]
 [ 603 7834  537]
 [ 709  877 7296]]
K-Nearest Neighbours :
[[8155  428  245]
 [1118 7396  460]
 [1371 1338 6173]]
Support Vector Machine :
[[6841  866 1121]
 [1525 6436 1013]
 [ 380  492 8010]]
Naive Bayes Classifier :
[[5394 1619 1815]
 [2799 4718 1457]
 [2435 1217 5230]]
Logistic Regression :
[[5290 1636 1902]
 [2951 2771 3252]
 [ 297  913 7672]]
Multi Layer Perceptron :
[[7313  728  787]
 [1666 6574  734]
 [ 593  553 7736]]
AdaBoost Classifier :
[[5710 1801 1317]
 [3011 4782 1181]
 [1849 1055 5978]]
Random Forest :
[[8069  349  410]
 [ 416 8112  446]
 [ 291  501 8090]]
Gradient Boosting :
[[6105 1584 1139]
 [2181 5873  920]
 [1346 1130 6406]]
Extra Trees :
[[7990  331  507]
 [ 355 8110  509]
 [ 212  412 8258]]
XGBoost :
[[7588  752  488]
 [1315 7241  418]
 [ 603  624 7655]]
Custom Naive Bayes Classifier :
[[5394 1619 1815]
 [2799 4718 1457]
 [2435 1217 5230]]
Count-Based Classifier :
[[8109  689   30]
 [6449 2480   45]
 [7442 1146  294]]


# 19. Phishing Website Dataset

In [125]:
# Loading the dataset
df = pd.read_csv("datasets/phishing.csv")

In [126]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [127]:
# Dropping unwanted columns
df.drop(['Index'], axis=1, inplace=True)

In [128]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [129]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(11054, 31)

In [130]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [131]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

 1    6157
-1    4897
Name: class, dtype: int64

In [132]:
# Encoding the attributes of the dataset
df = encoding(df)

In [133]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [134]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [135]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [136]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [137]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [138]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.963053,0.963053,0.963053,0.963053,0.963117,0.926134,0.027767
1,K-Nearest Neighbours,0.958181,0.958181,0.958181,0.958181,0.958298,0.91647,0.14687
2,Support Vector Machine,0.954527,0.954527,0.954527,0.954527,0.954744,0.909417,4.737275
3,Naive Bayes Classifier,0.926512,0.926512,0.926512,0.926512,0.926877,0.854012,0.019767
4,Logistic Regression,0.925294,0.925294,0.925294,0.925294,0.925612,0.85135,0.293001
5,Multi Layer Perceptron,0.965489,0.965489,0.965489,0.965489,0.965635,0.931143,84.947465
6,AdaBoost Classifier,0.933821,0.933821,0.933821,0.933821,0.934093,0.86821,0.406809
7,Random Forest,0.968737,0.968737,0.968737,0.968737,0.968871,0.937611,0.534596
8,Gradient Boosting,0.943159,0.943159,0.943159,0.943159,0.943292,0.886461,0.735844
9,Extra Trees,0.972391,0.972391,0.972391,0.972391,0.972438,0.944793,0.588235


In [139]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[1201   51]
 [  40 1171]]
K-Nearest Neighbours :
[[1191   61]
 [  42 1169]]
Support Vector Machine :
[[1179   73]
 [  39 1172]]
Naive Bayes Classifier :
[[1133  119]
 [  62 1149]]
Logistic Regression :
[[1135  117]
 [  67 1144]]
Multi Layer Perceptron :
[[1198   54]
 [  31 1180]]
AdaBoost Classifier :
[[1149  103]
 [  60 1151]]
Random Forest :
[[1203   49]
 [  28 1183]]
Gradient Boosting :
[[1171   81]
 [  59 1152]]
Extra Trees :
[[1214   38]
 [  30 1181]]
Custom Naive Bayes Classifier :
[[1133  119]
 [  62 1149]]
Count-Based Classifier :
[[1049  203]
 [  43 1168]]


# 20. Monkey-Pox Patients Dataset

In [140]:
# Loading the dataset
df = pd.read_csv("datasets/monkey-pox.csv")

In [141]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [142]:
# Dropping unwanted columns
df.drop(['Patient_ID'], axis=1, inplace=True)

In [143]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,,False,True,True,True,False,True,False,False,Negative
1,Fever,True,False,True,True,False,False,True,False,Positive
2,Fever,False,True,True,False,False,False,True,False,Positive
3,,True,False,False,False,True,True,True,False,Positive
4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [144]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(25000, 10)

In [145]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [146]:
# Checking the instance counts of the target attribute
df['MonkeyPox'].value_counts()

Positive    15909
Negative     9091
Name: MonkeyPox, dtype: int64

In [147]:
# Encoding the attributes of the dataset
df = encoding(df)

In [148]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [149]:
# Separating dependent and independent variables
X, y = separating(df, 'MonkeyPox')

In [150]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [151]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [152]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [153]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.638906,0.638906,0.638906,0.638906,0.638935,0.277901,0.043986
1,K-Nearest Neighbours,0.609837,0.609837,0.609837,0.609837,0.609781,0.219679,0.500076
2,Support Vector Machine,0.642363,0.642363,0.642363,0.642363,0.64228,0.284905,122.490895
3,Naive Bayes Classifier,0.643935,0.643935,0.643935,0.643935,0.643945,0.28789,0.060897
4,Logistic Regression,0.610622,0.610622,0.610622,0.610622,0.610614,0.221231,0.107907
5,Multi Layer Perceptron,0.640949,0.640949,0.640949,0.640949,0.640977,0.281984,51.575052
6,AdaBoost Classifier,0.64362,0.64362,0.64362,0.64362,0.643645,0.287315,0.716858
7,Random Forest,0.641892,0.641892,0.641892,0.641892,0.641844,0.283804,1.393982
8,Gradient Boosting,0.642992,0.642992,0.642992,0.642992,0.642884,0.286339,1.454656
9,Extra Trees,0.638906,0.638906,0.638906,0.638906,0.638935,0.277901,1.75294


In [154]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[2052 1119]
 [1179 2014]]
K-Nearest Neighbours :
[[1883 1288]
 [1195 1998]]
Support Vector Machine :
[[1960 1211]
 [1065 2128]]
Naive Bayes Classifier :
[[2051 1120]
 [1146 2047]]
Logistic Regression :
[[1929 1242]
 [1236 1957]]
Multi Layer Perceptron :
[[2058 1113]
 [1172 2021]]
AdaBoost Classifier :
[[2064 1107]
 [1161 2032]]
Random Forest :
[[1991 1180]
 [1099 2094]]
Gradient Boosting :
[[1940 1231]
 [1041 2152]]
Extra Trees :
[[2052 1119]
 [1179 2014]]
Custom Naive Bayes Classifier :
[[2051 1120]
 [1146 2047]]
Count-Based Classifier :
[[2161 1010]
 [1330 1863]]


# 22. Android Malware Detection

In [155]:
# Loading the dataset
df = pd.read_csv("datasets/TUANDROMD.csv")

In [156]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,malware
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,malware
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,malware
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware


In [157]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(4465, 242)

In [158]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [159]:
# Checking the instance counts of the target attribute
df['Label'].value_counts()

malware     3565
goodware     899
Name: Label, dtype: int64

In [160]:
# Encoding the attributes of the dataset
df = encoding(df)

In [161]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [162]:
# Separating dependent and independent variables
X, y = separating(df, 'Label')

In [163]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [164]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [165]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [166]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.992992,0.992992,0.992992,0.992992,0.99307,0.98599,0.069299
1,K-Nearest Neighbours,0.99089,0.99089,0.99089,0.99089,0.99103,0.981816,0.251335
2,Support Vector Machine,0.992292,0.992292,0.992292,0.992292,0.99239,0.984597,3.678959
3,Naive Bayes Classifier,0.971969,0.971969,0.971969,0.971969,0.972197,0.944049,0.07082
4,Logistic Regression,0.989488,0.989488,0.989488,0.989488,0.9895,0.97896,0.384008
5,Multi Layer Perceptron,0.99089,0.99089,0.99089,0.99089,0.990903,0.981765,33.166156
6,AdaBoost Classifier,0.988788,0.988788,0.988788,0.988788,0.988862,0.977574,0.771803
7,Random Forest,0.995795,0.995795,0.995795,0.995795,0.995834,0.991588,0.436859
8,Gradient Boosting,0.99089,0.99089,0.99089,0.99089,0.990987,0.981791,1.5134
9,Extra Trees,0.995095,0.995095,0.995095,0.995095,0.995238,0.990231,0.49952


In [167]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[689   3]
 [  7 728]]
K-Nearest Neighbours :
[[689   3]
 [ 10 725]]
Support Vector Machine :
[[689   3]
 [  8 727]]
Naive Bayes Classifier :
[[678  14]
 [ 26 709]]
Logistic Regression :
[[685   7]
 [  8 727]]
Multi Layer Perceptron :
[[686   6]
 [  7 728]]
AdaBoost Classifier :
[[686   6]
 [ 10 725]]
Random Forest :
[[690   2]
 [  4 731]]
Gradient Boosting :
[[688   4]
 [  9 726]]
Extra Trees :
[[692   0]
 [  7 728]]
Custom Naive Bayes Classifier :
[[678  14]
 [ 26 709]]
Count-Based Classifier :
[[692   0]
 [296 439]]
