# Experiment 3-1: Comparative Performance Analysis with Existing Classifiers

In [1]:
# Importing necessary libraries and packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_functions.RepetitiveTasks import imputation, remove_class, encoding, separating, classification_evaluation

# 1. Mushroom Dataset

In [2]:
# Loading the dataset
df = pd.read_csv("datasets/mushrooms.csv")

In [3]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(8124, 23)

In [5]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [6]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [7]:
# Encoding the attributes of the dataset
df = encoding(df)

In [8]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [9]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [10]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [12]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.010875
1,K-Nearest Neighbours,0.995077,0.995077,0.995077,0.995077,0.995255,0.990192,0.079036
2,Support Vector Machine,0.993231,0.993231,0.993231,0.993231,0.993106,0.986458,0.885465
3,Naive Bayes Classifier,0.966769,0.966769,0.966769,0.966769,0.965658,0.934813,0.009188
4,Logistic Regression,0.947692,0.947692,0.947692,0.947692,0.947734,0.895278,1.249333
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,4.41009
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.236354
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.171392
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.376684
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.166146


In [13]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[843   0]
 [  0 782]]
K-Nearest Neighbours :
[[835   8]
 [  0 782]]
Support Vector Machine :
[[840   3]
 [  8 774]]
Naive Bayes Classifier :
[[839   4]
 [ 50 732]]
Logistic Regression :
[[798  45]
 [ 40 742]]
Multi Layer Perceptron :
[[843   0]
 [  0 782]]
AdaBoost Classifier :
[[843   0]
 [  0 782]]
Random Forest :
[[843   0]
 [  0 782]]
Gradient Boosting :
[[843   0]
 [  0 782]]
Extra Trees :
[[843   0]
 [  0 782]]
XGBoost :
[[843   0]
 [  0 782]]
Custom Naive Bayes Classifier :
[[839   4]
 [ 50 732]]
Count-Based Classifier :
[[841   2]
 [169 613]]


# 2. Car Evaluation Dataset

In [14]:
# Loading the dataset
df = pd.read_csv("datasets/car.csv")

In [15]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [16]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1728, 7)

In [17]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [18]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [19]:
# Encoding the attributes of the dataset
df = encoding(df)

In [20]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [21]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [22]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [24]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.968208,0.968208,0.968208,0.968208,0.950974,0.934317,0.005461
1,K-Nearest Neighbours,0.884393,0.884393,0.884393,0.884393,0.967548,0.747824,0.012026
2,Support Vector Machine,0.913295,0.913295,0.913295,0.913295,0.986475,0.812954,0.139382
3,Naive Bayes Classifier,0.815029,0.815029,0.815029,0.815029,0.971721,0.583869,0.005683
4,Logistic Regression,0.65896,0.65896,0.65896,0.65896,0.791844,0.120936,0.01653
5,Multi Layer Perceptron,0.982659,0.982659,0.982659,0.982659,0.99961,0.963789,1.353029
6,AdaBoost Classifier,0.780347,0.780347,0.780347,0.780347,0.917367,0.537097,0.045525
7,Random Forest,0.968208,0.968208,0.968208,0.968208,0.995943,0.935297,0.083808
8,Gradient Boosting,0.965318,0.965318,0.965318,0.965318,0.996318,0.928868,0.27441
9,Extra Trees,0.973988,0.973988,0.973988,0.973988,0.995729,0.945784,0.080704


In [25]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   2   0  14]]
K-Nearest Neighbours :
[[ 60   5  17   1]
 [  7   4   0   0]
 [  0   0 235   0]
 [  5   2   3   7]]
Support Vector Machine :
[[ 63   0  20   0]
 [  5   5   0   1]
 [  2   0 233   0]
 [  2   0   0  15]]
Naive Bayes Classifier :
[[ 45   3  35   0]
 [  7   4   0   0]
 [  8   0 227   0]
 [ 11   0   0   6]]
Logistic Regression :
[[ 11   0  69   3]
 [  2   0   9   0]
 [ 17   0 217   1]
 [ 12   0   5   0]]
Multi Layer Perceptron :
[[ 79   2   2   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   1   0  15]]
AdaBoost Classifier :
[[ 30  11  40   2]
 [  0   9   0   2]
 [ 11   8 216   0]
 [  2   0   0  15]]
Random Forest :
[[ 73   9   1   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]
Gradient Boosting :
[[ 74   7   2   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   2   0  14]]
Extra Trees :
[[ 76   3   3   1]
 [  0  11   0   0]
 [  1   0 234   0]
 [  1   0   0  16]]
XGBoost :
[[ 79   4   

# 3. Breast Cancer Dataset

In [26]:
# Loading the dataset
df = pd.read_csv("datasets/breast-cancer.csv")

In [27]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [28]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(286, 10)

In [29]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [30]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: class, dtype: int64

In [31]:
# Encoding the attributes of the dataset
df = encoding(df)

In [32]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [33]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [34]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [36]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.568966,0.568966,0.568966,0.568966,0.466538,-0.093274,0.004245
1,K-Nearest Neighbours,0.637931,0.637931,0.637931,0.637931,0.530888,0.097489,0.005208
2,Support Vector Machine,0.706897,0.706897,0.706897,0.706897,0.605534,0.333086,0.008335
3,Naive Bayes Classifier,0.672414,0.672414,0.672414,0.672414,0.599099,0.235148,0.003902
4,Logistic Regression,0.672414,0.672414,0.672414,0.672414,0.557915,0.219687,0.004856
5,Multi Layer Perceptron,0.62069,0.62069,0.62069,0.62069,0.507079,0.024241,1.369144
6,AdaBoost Classifier,0.655172,0.655172,0.655172,0.655172,0.564994,0.172541,0.033741
7,Random Forest,0.655172,0.655172,0.655172,0.655172,0.564994,0.172541,0.10928
8,Gradient Boosting,0.689655,0.689655,0.689655,0.689655,0.581725,0.279879,0.050612
9,Extra Trees,0.689655,0.689655,0.689655,0.689655,0.612613,0.27611,0.07348


In [37]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[31  6]
 [19  2]]
K-Nearest Neighbours :
[[34  3]
 [18  3]]
Support Vector Machine :
[[36  1]
 [16  5]]
Naive Bayes Classifier :
[[32  5]
 [14  7]]
Logistic Regression :
[[36  1]
 [18  3]]
Multi Layer Perceptron :
[[34  3]
 [19  2]]
AdaBoost Classifier :
[[33  4]
 [16  5]]
Random Forest :
[[33  4]
 [16  5]]
Gradient Boosting :
[[36  1]
 [17  4]]
Extra Trees :
[[33  4]
 [14  7]]
Custom Naive Bayes Classifier :
[[32  5]
 [14  7]]
Count-Based Classifier :
[[37  0]
 [21  0]]


# 4. Congressional Voting Records Dataset

In [38]:
# Loading the dataset
df = pd.read_csv("datasets/house-votes-84.csv")

In [39]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [40]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(435, 17)

In [41]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [42]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

democrat      267
republican    168
Name: class, dtype: int64

In [43]:
# Encoding the attributes of the dataset
df = encoding(df)

In [44]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [45]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [46]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [48]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.942529,0.942529,0.942529,0.942529,0.926555,0.874531,0.012869
1,K-Nearest Neighbours,0.931034,0.931034,0.931034,0.931034,0.924827,0.849654,0.023193
2,Support Vector Machine,0.977011,0.977011,0.977011,0.977011,0.974942,0.949885,0.015497
3,Naive Bayes Classifier,0.896552,0.896552,0.896552,0.896552,0.890841,0.776334,0.004192
4,Logistic Regression,0.988506,0.988506,0.988506,0.988506,0.983871,0.975071,0.013985
5,Multi Layer Perceptron,0.988506,0.988506,0.988506,0.988506,0.983871,0.975071,0.495615
6,AdaBoost Classifier,0.965517,0.965517,0.965517,0.965517,0.951613,0.925904,0.046754
7,Random Forest,0.954023,0.954023,0.954023,0.954023,0.942684,0.899469,0.088516
8,Gradient Boosting,0.965517,0.965517,0.965517,0.965517,0.951613,0.925904,0.049798
9,Extra Trees,0.977011,0.977011,0.977011,0.977011,0.974942,0.949885,0.074204


In [49]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[55  1]
 [ 4 27]]
K-Nearest Neighbours :
[[53  3]
 [ 3 28]]
Support Vector Machine :
[[55  1]
 [ 1 30]]
Naive Bayes Classifier :
[[51  5]
 [ 4 27]]
Logistic Regression :
[[56  0]
 [ 1 30]]
Multi Layer Perceptron :
[[56  0]
 [ 1 30]]
AdaBoost Classifier :
[[56  0]
 [ 3 28]]
Random Forest :
[[55  1]
 [ 3 28]]
Gradient Boosting :
[[56  0]
 [ 3 28]]
Extra Trees :
[[55  1]
 [ 1 30]]
Custom Naive Bayes Classifier :
[[51  5]
 [ 4 27]]
Count-Based Classifier :
[[53  3]
 [ 6 25]]


# 5. Tic-Tac-Toe Endgame Dataset

In [50]:
# Loading the dataset
df = pd.read_csv("datasets/tic-tac-toe.csv")

In [51]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [52]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(958, 10)

In [53]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [54]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

positive    626
negative    332
Name: class, dtype: int64

In [55]:
# Encoding the attributes of the dataset
df = encoding(df)

In [56]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [57]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [58]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [60]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.875,0.875,0.875,0.875,0.855522,0.721812,0.005701
1,K-Nearest Neighbours,0.8125,0.8125,0.8125,0.8125,0.755582,0.575159,0.009054
2,Support Vector Machine,0.880208,0.880208,0.880208,0.880208,0.828358,0.744754,0.053723
3,Naive Bayes Classifier,0.729167,0.729167,0.729167,0.729167,0.663881,0.368795,0.004578
4,Logistic Regression,0.692708,0.692708,0.692708,0.692708,0.577015,0.251876,0.004735
5,Multi Layer Perceptron,0.916667,0.916667,0.916667,0.916667,0.887522,0.817104,2.403339
6,AdaBoost Classifier,0.776042,0.776042,0.776042,0.776042,0.703343,0.486519,0.065035
7,Random Forest,0.947917,0.947917,0.947917,0.947917,0.925373,0.88754,0.099322
8,Gradient Boosting,0.927083,0.927083,0.927083,0.927083,0.895522,0.843428,0.077155
9,Extra Trees,0.953125,0.953125,0.953125,0.953125,0.932836,0.898626,0.069156


In [61]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 53  14]
 [ 10 115]]
K-Nearest Neighbours :
[[ 38  29]
 [  7 118]]
Support Vector Machine :
[[ 44  23]
 [  0 125]]
Naive Bayes Classifier :
[[ 30  37]
 [ 15 110]]
Logistic Regression :
[[ 13  54]
 [  5 120]]
Multi Layer Perceptron :
[[ 53  14]
 [  2 123]]
AdaBoost Classifier :
[[ 31  36]
 [  7 118]]
Random Forest :
[[ 57  10]
 [  0 125]]
Gradient Boosting :
[[ 53  14]
 [  0 125]]
Extra Trees :
[[ 58   9]
 [  0 125]]
Custom Naive Bayes Classifier :
[[ 30  37]
 [ 15 110]]
Count-Based Classifier :
[[  0  67]
 [  0 125]]


# 6. Nursery Dataset

In [62]:
# Loading the dataset
df = pd.read_csv("datasets/nursery.csv")

In [63]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [64]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(12960, 9)

In [65]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [66]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: class, dtype: int64

In [67]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [68]:
# Encoding the attributes of the dataset
df = encoding(df)

In [69]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [70]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [71]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [73]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.996142,0.996142,0.996142,0.996142,0.995982,0.994347,0.016551
1,K-Nearest Neighbours,0.939043,0.939043,0.939043,0.939043,0.985792,0.910301,0.111316
2,Support Vector Machine,0.951389,0.951389,0.951389,0.951389,0.996057,0.928532,2.639363
3,Naive Bayes Classifier,0.912809,0.912809,0.912809,0.912809,0.986501,0.871808,0.010985
4,Logistic Regression,0.769676,0.769676,0.769676,0.769676,0.9209,0.659953,0.371091
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,5.61485
6,AdaBoost Classifier,0.83912,0.83912,0.83912,0.83912,0.941626,0.773875,0.242084
7,Random Forest,0.983796,0.983796,0.983796,0.983796,0.999384,0.976261,0.335818
8,Gradient Boosting,0.980324,0.980324,0.980324,0.980324,0.999394,0.971168,1.690474
9,Extra Trees,0.97608,0.97608,0.97608,0.97608,0.998358,0.964912,0.399267


In [74]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[842   0   0   0]
 [  0 851   7   0]
 [  0   2 827   0]
 [  0   1   0  62]]
K-Nearest Neighbours :
[[842   0   0   0]
 [  0 796  62   0]
 [  0  55 774   0]
 [  0  41   0  22]]
Support Vector Machine :
[[842   0   0   0]
 [  0 801  55   2]
 [  0  24 805   0]
 [  0  45   0  18]]
Naive Bayes Classifier :
[[842   0   0   0]
 [  0 783  75   0]
 [  0  93 736   0]
 [  0  58   0   5]]
Logistic Regression :
[[842   0   0   0]
 [  0 601 251   6]
 [  0 282 547   0]
 [  0  58   0   5]]
Multi Layer Perceptron :
[[842   0   0   0]
 [  0 858   0   0]
 [  0   0 829   0]
 [  0   0   0  63]]
AdaBoost Classifier :
[[842   0   0   0]
 [  0 766  48  44]
 [  0 292 537   0]
 [  0  33   0  30]]
Random Forest :
[[842   0   0   0]
 [  0 834  24   0]
 [  0   3 826   0]
 [  0  15   0  48]]
Gradient Boosting :
[[842   0   0   0]
 [  0 835  23   0]
 [  0   0 829   0]
 [  0  28   0  35]]
Extra Trees :
[[842   0   0   0]
 [  0 827  31   0]
 [  0   8 821   0]
 [  0  23   0  40]]
XGBoost :
[[842   0   

# 7. Soybean (Large) Dataset

In [75]:
# Loading the dataset
df = pd.read_csv("datasets/soybean-large.csv")

In [76]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [77]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(307, 36)

In [78]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [79]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

frog-eye-leaf-spot             40
phytophthora-rot               40
alternarialeaf-spot            40
brown-spot                     40
brown-stem-rot                 20
anthracnose                    20
diaporthe-stem-canker          10
purple-seed-stain              10
phyllosticta-leaf-spot         10
bacterial-pustule              10
charcoal-rot                   10
bacterial-blight               10
downy-mildew                   10
powdery-mildew                 10
rhizoctonia-root-rot           10
diaporthe-pod-&-stem-blight     6
cyst-nematode                   6
herbicide-injury                4
2-4-d-injury                    1
Name: class, dtype: int64

In [80]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [81]:
# Encoding the attributes of the dataset
df = encoding(df)

In [82]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [83]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [84]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [86]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.918033,0.918033,0.918033,0.918033,0.98282,0.912628,0.024257
1,K-Nearest Neighbours,0.836066,0.836066,0.836066,0.836066,0.966384,0.823289,0.044177
2,Support Vector Machine,0.868852,0.868852,0.868852,0.868852,0.993773,0.857679,0.036677
3,Naive Bayes Classifier,0.885246,0.885246,0.885246,0.885246,0.994924,0.879115,0.013154
4,Logistic Regression,0.901639,0.901639,0.901639,0.901639,0.995721,0.8937,0.049396
5,Multi Layer Perceptron,0.885246,0.885246,0.885246,0.885246,0.991227,0.876385,0.781373
6,AdaBoost Classifier,0.311475,0.311475,0.311475,0.311475,0.723884,0.324577,0.057105
7,Random Forest,0.934426,0.934426,0.934426,0.934426,0.995692,0.931778,0.105342
8,Gradient Boosting,0.918033,0.918033,0.918033,0.918033,0.993225,0.914273,0.679794
9,Extra Trees,0.918033,0.918033,0.918033,0.918033,0.994345,0.912628,0.054732


In [87]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0]
 [4 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]]
K-Nearest Neighbours :
[[8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 7 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 3 2 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0

# 8. Molecular Biology (Promoter Gene Sequences) Dataset

In [88]:
# Loading the dataset
df = pd.read_csv("datasets/promoters.csv")

In [89]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [90]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace('\t',''))
attributes = [str(i) for i in range(0,59)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','58'],axis=1,inplace=True)

In [91]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,1,2,3,4,5,6,7,8,...,48,49,50,51,52,53,54,55,56,57
0,+,S10,t,a,c,t,a,g,c,a,...,g,g,c,t,t,g,t,c,g,t
1,+,AMPC,t,g,c,t,a,t,c,c,...,g,c,a,t,c,g,c,c,a,a
2,+,AROH,g,t,a,c,t,a,g,a,...,c,c,a,c,c,c,g,g,c,g
3,+,DEOP2,a,a,t,t,g,t,g,a,...,t,a,a,c,a,a,a,c,t,c
4,+,LEU1_TRNA,t,c,g,a,t,a,a,t,...,t,c,c,g,t,g,g,t,a,g


In [92]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(106, 59)

In [93]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [94]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

+    53
-    53
Name: class, dtype: int64

In [95]:
# Encoding the attributes of the dataset
df = encoding(df)

In [96]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [97]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [98]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [100]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.004722
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.008793
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.004794
3,Naive Bayes Classifier,0.909091,0.909091,0.909091,0.909091,0.909091,0.83205,0.006393
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.009433
5,Multi Layer Perceptron,0.909091,0.909091,0.909091,0.909091,0.909091,0.818182,0.189153
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.005485
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.088568
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.036266
9,Extra Trees,0.909091,0.909091,0.909091,0.909091,0.909091,0.83205,0.065385


In [101]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[11  0]
 [ 0 11]]
K-Nearest Neighbours :
[[11  0]
 [ 0 11]]
Support Vector Machine :
[[11  0]
 [ 0 11]]
Naive Bayes Classifier :
[[11  0]
 [ 2  9]]
Logistic Regression :
[[11  0]
 [ 0 11]]
Multi Layer Perceptron :
[[10  1]
 [ 1 10]]
AdaBoost Classifier :
[[11  0]
 [ 0 11]]
Random Forest :
[[11  0]
 [ 0 11]]
Gradient Boosting :
[[11  0]
 [ 0 11]]
Extra Trees :
[[11  0]
 [ 2  9]]
Custom Naive Bayes Classifier :
[[11  0]
 [ 2  9]]
Count-Based Classifier :
[[11  0]
 [ 5  6]]


# 9. Balance Scale Dataset

In [102]:
# Loading the dataset
df = pd.read_csv("datasets/balance-scale.csv")

In [103]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,left-weight,left-distance,right-weight,right-distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [104]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(625, 5)

In [105]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [106]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

R    288
L    288
B     49
Name: class, dtype: int64

In [107]:
# Encoding the attributes of the dataset
df = encoding(df)

In [108]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [109]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [110]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [112]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.76,0.76,0.76,0.76,0.719144,0.593924,0.00749
1,K-Nearest Neighbours,0.816,0.816,0.816,0.816,0.820842,0.682363,0.006608
2,Support Vector Machine,0.904,0.904,0.904,0.904,0.952161,0.835157,0.016503
3,Naive Bayes Classifier,0.896,0.896,0.896,0.896,0.842315,0.820935,0.004521
4,Logistic Regression,0.84,0.84,0.84,0.84,0.954595,0.733634,0.006583
5,Multi Layer Perceptron,0.984,0.984,0.984,0.984,0.998139,0.97243,0.448223
6,AdaBoost Classifier,0.936,0.936,0.936,0.936,0.858374,0.899371,0.033565
7,Random Forest,0.792,0.792,0.792,0.792,0.816732,0.642333,0.067608
8,Gradient Boosting,0.888,0.888,0.888,0.888,0.903488,0.802793,0.108428
9,Extra Trees,0.808,0.808,0.808,0.808,0.771494,0.665108,0.056677


In [113]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 0  8  3]
 [ 4 49  2]
 [10  3 46]]
K-Nearest Neighbours :
[[ 0  7  4]
 [ 1 54  0]
 [ 4  7 48]]
Support Vector Machine :
[[ 0  7  4]
 [ 0 55  0]
 [ 0  1 58]]
Naive Bayes Classifier :
[[ 0  7  4]
 [ 0 55  0]
 [ 0  2 57]]
Logistic Regression :
[[ 0 10  1]
 [ 0 55  0]
 [ 0  9 50]]
Multi Layer Perceptron :
[[ 9  2  0]
 [ 0 55  0]
 [ 0  0 59]]
AdaBoost Classifier :
[[11  0  0]
 [ 1 54  0]
 [ 7  0 52]]
Random Forest :
[[ 0  5  6]
 [ 3 52  0]
 [ 8  4 47]]
Gradient Boosting :
[[ 0  5  6]
 [ 0 55  0]
 [ 3  0 56]]
Extra Trees :
[[ 0  5  6]
 [ 1 53  1]
 [ 7  4 48]]
XGBoost :
[[ 1  6  4]
 [ 2 53  0]
 [ 7  2 50]]
Custom Naive Bayes Classifier :
[[ 0  7  4]
 [ 0 55  0]
 [ 0  2 57]]
Count-Based Classifier :
[[ 0  5  6]
 [ 0 55  0]
 [ 0  2 57]]


# 10. Lenses Dataset

In [114]:
# Loading the dataset
df = pd.read_csv("datasets/lenses.csv", delimiter="  ", engine="python")

In [115]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,spectacle_prescription,astigmatic,tear_production_rate
1,1,1,1,1,3.0
2,1,1,1,2,2.0
3,1,1,2,1,3.0
4,1,1,2,2,1.0
5,1,2,1,1,3.0


In [116]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(24, 5)

In [117]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [118]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    15
2     9
Name: class, dtype: int64

In [119]:
# Encoding the attributes of the dataset
df = encoding(df)

In [120]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [121]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [122]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [123]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [124]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.4,0.4,0.4,0.4,0.25,-0.408248,0.005057
1,K-Nearest Neighbours,0.4,0.4,0.4,0.4,0.25,-0.408248,0.005299
2,Support Vector Machine,0.4,0.4,0.4,0.4,0.25,-0.408248,0.003675
3,Naive Bayes Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.003891
4,Logistic Regression,0.4,0.4,0.4,0.4,0.25,-0.408248,0.004494
5,Multi Layer Perceptron,0.4,0.4,0.4,0.4,0.25,-0.408248,0.091054
6,AdaBoost Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.027175
7,Random Forest,0.4,0.4,0.4,0.4,0.25,-0.408248,0.050498
8,Gradient Boosting,0.4,0.4,0.4,0.4,0.25,-0.408248,0.019171
9,Extra Trees,0.4,0.4,0.4,0.4,0.25,-0.408248,0.038438


In [125]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[2 2]
 [1 0]]
K-Nearest Neighbours :
[[2 2]
 [1 0]]
Support Vector Machine :
[[2 2]
 [1 0]]
Naive Bayes Classifier :
[[2 2]
 [1 0]]
Logistic Regression :
[[2 2]
 [1 0]]
Multi Layer Perceptron :
[[2 2]
 [1 0]]
AdaBoost Classifier :
[[2 2]
 [1 0]]
Random Forest :
[[2 2]
 [1 0]]
Gradient Boosting :
[[2 2]
 [1 0]]
Extra Trees :
[[2 2]
 [1 0]]
Custom Naive Bayes Classifier :
[[2 2]
 [1 0]]
Count-Based Classifier :
[[2 2]
 [1 0]]


# 11. Molecular Biology (Splice-junction Gene Sequences) Dataset

In [126]:
# Loading the dataset
df = pd.read_csv("datasets/splice.csv")

In [127]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [128]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace(' ',''))
attributes = [str(i) for i in range(0,62)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','61'],axis=1,inplace=True)

In [129]:
# Dropping unwanted columns
df.drop(['instance_name'], axis=1, inplace=True)

In [130]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,EI,C,C,A,G,C,T,G,C,A,...,A,G,C,C,A,G,T,C,T,G
1,EI,A,G,A,C,C,C,G,C,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,G,A,G,G,T,G,A,A,G,...,C,A,C,G,G,G,G,A,T,G
3,EI,G,G,G,C,T,G,C,G,T,...,G,G,T,T,T,T,C,C,C,C
4,EI,G,C,T,C,A,G,C,C,C,...,C,C,T,T,G,A,C,C,C,T


In [131]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3190, 61)

In [132]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [133]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

N     1655
IE     768
EI     767
Name: class, dtype: int64

In [134]:
# Encoding the attributes of the dataset
df = encoding(df)

In [135]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [136]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [137]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [138]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [139]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.890282,0.890282,0.890282,0.890282,0.913155,0.825571,0.02438
1,K-Nearest Neighbours,0.678683,0.678683,0.678683,0.678683,0.856702,0.533637,0.027851
2,Support Vector Machine,0.871473,0.871473,0.871473,0.871473,0.958339,0.792894,1.282662
3,Naive Bayes Classifier,0.943574,0.943574,0.943574,0.943574,0.992427,0.909437,0.009169
4,Logistic Regression,0.832288,0.832288,0.832288,0.832288,0.945302,0.732947,0.188655
5,Multi Layer Perceptron,0.855799,0.855799,0.855799,0.855799,0.95905,0.771575,3.343957
6,AdaBoost Classifier,0.9279,0.9279,0.9279,0.9279,0.945732,0.885156,0.225739
7,Random Forest,0.95768,0.95768,0.95768,0.95768,0.993195,0.932404,0.245077
8,Gradient Boosting,0.973354,0.973354,0.973354,0.973354,0.995411,0.957614,1.925892
9,Extra Trees,0.959248,0.959248,0.959248,0.959248,0.993003,0.93446,0.249228


In [140]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[166  11   2]
 [  9 114  17]
 [ 12  19 288]]
K-Nearest Neighbours :
[[162   4  13]
 [ 36  85  19]
 [ 93  40 186]]
Support Vector Machine :
[[160   3  16]
 [  2 112  26]
 [ 16  19 284]]
Naive Bayes Classifier :
[[165   6   8]
 [  3 130   7]
 [  5   7 307]]
Logistic Regression :
[[148   7  24]
 [  7 115  18]
 [ 22  29 268]]
Multi Layer Perceptron :
[[155   7  17]
 [  4 120  16]
 [ 22  26 271]]
AdaBoost Classifier :
[[172   5   2]
 [  6 123  11]
 [ 10  12 297]]
Random Forest :
[[170   3   6]
 [  4 134   2]
 [  3   9 307]]
Gradient Boosting :
[[175   2   2]
 [  2 137   1]
 [  3   7 309]]
Extra Trees :
[[172   1   6]
 [  1 130   9]
 [  3   6 310]]
XGBoost :
[[177   1   1]
 [  1 138   1]
 [  4   7 308]]
Custom Naive Bayes Classifier :
[[165   6   8]
 [  3 130   7]
 [  5   7 307]]
Count-Based Classifier :
[[  0   0 179]
 [  0   0 140]
 [  0   0 319]]


# 12. SPECT Heart Dataset

In [141]:
# Loading the dataset
df = pd.read_csv("datasets/SPECT.csv")

In [142]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22
0,1,0,0,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,1
2,1,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


In [143]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(267, 23)

In [144]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [145]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    212
0     55
Name: class, dtype: int64

In [146]:
# Encoding the attributes of the dataset
df = encoding(df)

In [147]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [148]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [149]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [150]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [151]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.703704,0.703704,0.703704,0.703704,0.6,0.16641,0.004538
1,K-Nearest Neighbours,0.722222,0.722222,0.722222,0.722222,0.611111,0.188982,0.008572
2,Support Vector Machine,0.740741,0.740741,0.740741,0.740741,0.533333,0.066667,0.008448
3,Naive Bayes Classifier,0.777778,0.777778,0.777778,0.777778,0.777778,0.445789,0.004916
4,Logistic Regression,0.759259,0.759259,0.759259,0.759259,0.544444,0.09325,0.004654
5,Multi Layer Perceptron,0.814815,0.814815,0.814815,0.814815,0.622222,0.271213,1.261042
6,AdaBoost Classifier,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.046208
7,Random Forest,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.099403
8,Gradient Boosting,0.777778,0.777778,0.777778,0.777778,0.555556,0.123278,0.044947
9,Extra Trees,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.068253


In [152]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 4  5]
 [11 34]]
K-Nearest Neighbours :
[[ 4  5]
 [10 35]]
Support Vector Machine :
[[ 2  7]
 [ 7 38]]
Naive Bayes Classifier :
[[ 7  2]
 [10 35]]
Logistic Regression :
[[ 2  7]
 [ 6 39]]
Multi Layer Perceptron :
[[ 3  6]
 [ 4 41]]
AdaBoost Classifier :
[[ 3  6]
 [ 7 38]]
Random Forest :
[[ 3  6]
 [ 7 38]]
Gradient Boosting :
[[ 2  7]
 [ 5 40]]
Extra Trees :
[[ 3  6]
 [ 7 38]]
Custom Naive Bayes Classifier :
[[ 7  2]
 [10 35]]
Count-Based Classifier :
[[ 0  9]
 [ 0 45]]


# 13. Primary Tumor Dataset

In [153]:
# Loading the dataset
df = pd.read_csv("datasets/primary-tumor.csv")

In [154]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,?,3,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,?,3,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2


In [155]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(339, 18)

In [156]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [157]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1     84
5     39
18    29
11    28
14    24
22    24
2     20
12    16
7     14
4     14
17    10
3      9
13     7
8      6
19     6
10     2
15     2
20     2
6      1
16     1
21     1
Name: class, dtype: int64

In [158]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [159]:
# Encoding the attributes of the dataset
df = encoding(df)

In [160]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [161]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [162]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [163]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [164]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.484848,0.484848,0.484848,0.484848,0.613247,0.408555,0.019559
1,K-Nearest Neighbours,0.560606,0.560606,0.560606,0.560606,0.716423,0.494487,0.028138
2,Support Vector Machine,0.545455,0.545455,0.545455,0.545455,0.822359,0.473369,0.05593
3,Naive Bayes Classifier,0.560606,0.560606,0.560606,0.560606,0.845391,0.498691,0.015567
4,Logistic Regression,0.606061,0.606061,0.606061,0.606061,0.830541,0.546992,0.04224
5,Multi Layer Perceptron,0.469697,0.469697,0.469697,0.469697,0.813367,0.386702,1.980584
6,AdaBoost Classifier,0.30303,0.30303,0.30303,0.30303,0.734081,0.218968,0.060757
7,Random Forest,0.5,0.5,0.5,0.5,0.755562,0.428242,0.110217
8,Gradient Boosting,0.5,0.5,0.5,0.5,0.806549,0.424908,0.529428
9,Extra Trees,0.484848,0.484848,0.484848,0.484848,0.6934,0.407374,0.054634


In [165]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[14  0  0  0  1  0  0  1  0  1  0  0  0  0  0]
 [ 0  4  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 1  0  0  0  3  1  0  1  1  0  0  0  0  0  1]
 [ 0  0  0  0  1  0  0  1  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 3  0  0  0  1  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  2  0  1  0  0  0  0  1  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  5  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  5]]
K-Nearest Neighbours :
[[16  0  0  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  4  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  1  0  3  1  0  0  1  0  1  0  0  0  1]


# 14. Chess (King-Rook vs. King-Pawn) Dataset

In [166]:
# Loading the dataset
df = pd.read_csv("datasets/kr-vs-kp.csv")

In [167]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [168]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3196, 37)

In [169]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [170]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

won      1669
nowin    1527
Name: class, dtype: int64

In [171]:
# Encoding the attributes of the dataset
df = encoding(df)

In [172]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [173]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [174]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [175]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

Naive Bayes Classifier : index 1 is out of bounds for axis 1 with size 1
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [176]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.9875,0.9875,0.9875,0.9875,0.986667,0.975171,0.007006
1,K-Nearest Neighbours,0.946875,0.946875,0.946875,0.946875,0.946471,0.893309,0.028414
2,Support Vector Machine,0.957812,0.957812,0.957812,0.957812,0.959118,0.916552,0.466256
3,Logistic Regression,0.948438,0.948438,0.948438,0.948438,0.950098,0.898441,0.066778
4,Multi Layer Perceptron,0.990625,0.990625,0.990625,0.990625,0.990392,0.981188,2.22769
5,AdaBoost Classifier,0.95625,0.95625,0.95625,0.95625,0.956667,0.912421,0.112897
6,Random Forest,0.985938,0.985938,0.985938,0.985938,0.98598,0.971775,0.181536
7,Gradient Boosting,0.960938,0.960938,0.960938,0.960938,0.962451,0.923134,0.156915
8,Extra Trees,0.9875,0.9875,0.9875,0.9875,0.987451,0.974902,0.134329
9,Custom Naive Bayes Classifier,0.892188,0.892188,0.892188,0.892188,0.890294,0.783764,0.742711


In [177]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[292   8]
 [  0 340]]
K-Nearest Neighbours :
[[282  18]
 [ 16 324]]
Support Vector Machine :
[[294   6]
 [ 21 319]]
Logistic Regression :
[[293   7]
 [ 26 314]]
Multi Layer Perceptron :
[[296   4]
 [  2 338]]
AdaBoost Classifier :
[[289  11]
 [ 17 323]]
Random Forest :
[[296   4]
 [  5 335]]
Gradient Boosting :
[[296   4]
 [ 21 319]]
Extra Trees :
[[296   4]
 [  4 336]]
Custom Naive Bayes Classifier :
[[258  42]
 [ 27 313]]
Count-Based Classifier :
[[  3 297]
 [  0 340]]


# 15. Lymphography Dataset

In [178]:
# Loading the dataset
df = pd.read_csv("datasets/lymphography.csv")

In [179]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,changes in stru,special forms,dislocation of,exclusion of no,no. of nodes in
0,3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
1,2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
2,3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
4,2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


In [180]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(148, 19)

In [181]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [182]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

2    81
3    61
4     4
1     2
Name: class, dtype: int64

In [183]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [184]:
# Encoding the attributes of the dataset
df = encoding(df)

In [185]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [186]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [187]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [188]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [189]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.827586,0.827586,0.827586,0.827586,0.825758,0.641863,0.004727
1,K-Nearest Neighbours,0.689655,0.689655,0.689655,0.689655,0.661616,0.329967,0.009004
2,Support Vector Machine,0.758621,0.758621,0.758621,0.758621,0.717172,0.471532,0.005284
3,Naive Bayes Classifier,0.862069,0.862069,0.862069,0.862069,0.853535,0.707071,0.004501
4,Logistic Regression,0.862069,0.862069,0.862069,0.862069,0.853535,0.707071,0.006182
5,Multi Layer Perceptron,0.862069,0.862069,0.862069,0.862069,0.853535,0.707071,1.955092
6,AdaBoost Classifier,0.862069,0.862069,0.862069,0.862069,0.871212,0.724358,0.084551
7,Random Forest,0.862069,0.862069,0.862069,0.862069,0.853535,0.707071,0.139666
8,Gradient Boosting,0.793103,0.793103,0.793103,0.793103,0.780303,0.560606,0.045841
9,Extra Trees,0.896552,0.896552,0.896552,0.896552,0.89899,0.786157,0.056691


In [190]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[15  3]
 [ 2  9]]
K-Nearest Neighbours :
[[14  4]
 [ 5  6]]
Support Vector Machine :
[[16  2]
 [ 5  6]]
Naive Bayes Classifier :
[[16  2]
 [ 2  9]]
Logistic Regression :
[[16  2]
 [ 2  9]]
Multi Layer Perceptron :
[[16  2]
 [ 2  9]]
AdaBoost Classifier :
[[15  3]
 [ 1 10]]
Random Forest :
[[16  2]
 [ 2  9]]
Gradient Boosting :
[[15  3]
 [ 3  8]]
Extra Trees :
[[16  2]
 [ 1 10]]
Custom Naive Bayes Classifier :
[[16  2]
 [ 2  9]]
Count-Based Classifier :
[[18  0]
 [10  1]]


# 16. Connect-4 Dataset

In [191]:
# Loading the dataset
df = pd.read_csv("datasets/connect-4.csv")

In [192]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,...,f4,f5,f6,g1,g2,g3,g4,g5,g6,class
0,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
2,b,b,b,b,b,b,o,b,b,b,...,b,b,b,b,b,b,b,b,b,win
3,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
4,o,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win


In [193]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(67557, 43)

In [194]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [195]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

win     44473
loss    16635
draw     6449
Name: class, dtype: int64

In [196]:
# Encoding the attributes of the dataset
df = encoding(df)

In [197]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [198]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [199]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [201]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.730314,0.730314,0.730314,0.730314,0.701619,0.458758,0.206715
1,K-Nearest Neighbours,0.734384,0.734384,0.734384,0.734384,0.769195,0.43617,1.828797
2,Support Vector Machine,0.774201,0.774201,0.774201,0.774201,0.870165,0.498554,844.36382
3,Naive Bayes Classifier,0.716326,0.716326,0.716326,0.716326,0.778189,0.352149,0.082777
4,Logistic Regression,0.658822,0.658822,0.658822,0.658822,0.623259,0.064677,2.219737
5,Multi Layer Perceptron,0.808245,0.808245,0.808245,0.808245,0.893569,0.600602,432.836763
6,AdaBoost Classifier,0.722247,0.722247,0.722247,0.722247,0.773903,0.354791,1.497519
7,Random Forest,0.809799,0.809799,0.809799,0.809799,0.887236,0.589625,3.378744
8,Gradient Boosting,0.75037,0.75037,0.75037,0.75037,0.836927,0.435244,12.183121
9,Extra Trees,0.814979,0.814979,0.814979,0.814979,0.891543,0.601872,4.137666


In [202]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 334  387  574]
 [ 427 2069  845]
 [ 565  846 7465]]
K-Nearest Neighbours :
[[ 260  342  693]
 [ 339 1814 1188]
 [ 301  726 7849]]
Support Vector Machine :
[[   4  302  989]
 [   6 1941 1394]
 [   0  360 8516]]
Naive Bayes Classifier :
[[  88  168 1039]
 [ 151 1337 1853]
 [ 151  471 8254]]
Logistic Regression :
[[   0   26 1269]
 [   0   91 3250]
 [   0   65 8811]]
Multi Layer Perceptron :
[[ 213  469  613]
 [ 122 2599  620]
 [ 152  615 8109]]
AdaBoost Classifier :
[[   0  180 1115]
 [   0 1324 2017]
 [   0  441 8435]]
Random Forest :
[[ 172  320  803]
 [ 102 2275  964]
 [  71  310 8495]]
Gradient Boosting :
[[  10  220 1065]
 [   4 1663 1674]
 [   1  409 8466]]
Extra Trees :
[[ 195  313  787]
 [ 115 2293  933]
 [  82  270 8524]]
XGBoost :
[[ 214  366  715]
 [  81 2599  661]
 [  70  385 8421]]
Custom Naive Bayes Classifier :
[[  88  168 1039]
 [ 151 1337 1853]
 [ 151  471 8254]]
Count-Based Classifier :
[[   0    0 1295]
 [   0    0 3341]
 [   0    0 8876]]


# 17. Hayes-Roth Dataset

In [203]:
# Loading the dataset
df = pd.read_csv("datasets/hayes-roth.csv")

In [204]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,hobby,age,educational_level,marital_status,class
0,2,1,1,2,1
1,2,1,3,2,2
2,3,1,4,1,3
3,2,4,2,2,3
4,1,1,3,4,3


In [205]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(160, 5)

In [206]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [207]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    65
2    64
3    31
Name: class, dtype: int64

In [208]:
# Encoding the attributes of the dataset
df = encoding(df)

In [209]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [210]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [211]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [212]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [213]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.875,0.875,0.875,0.875,0.945703,0.797468,0.006301
1,K-Nearest Neighbours,0.6875,0.6875,0.6875,0.6875,0.772004,0.484761,0.006879
2,Support Vector Machine,0.8125,0.8125,0.8125,0.8125,0.878279,0.729003,0.008535
3,Naive Bayes Classifier,0.84375,0.84375,0.84375,0.84375,0.966276,0.767773,0.005435
4,Logistic Regression,0.40625,0.40625,0.40625,0.40625,0.689999,0.039171,0.006927
5,Multi Layer Perceptron,0.71875,0.71875,0.71875,0.71875,0.941152,0.551652,0.252024
6,AdaBoost Classifier,0.5,0.5,0.5,0.5,0.774621,0.486427,0.039358
7,Random Forest,0.875,0.875,0.875,0.875,0.956759,0.806313,0.086487
8,Gradient Boosting,0.84375,0.84375,0.84375,0.84375,0.976054,0.764506,0.101536
9,Extra Trees,0.875,0.875,0.875,0.875,0.935511,0.797468,0.089702


In [214]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[14  2  0]
 [ 2  8  0]
 [ 0  0  6]]
K-Nearest Neighbours :
[[12  4  0]
 [ 3  7  0]
 [ 2  1  3]]
Support Vector Machine :
[[11  5  0]
 [ 0 10  0]
 [ 1  0  5]]
Naive Bayes Classifier :
[[12  4  0]
 [ 0 10  0]
 [ 1  0  5]]
Logistic Regression :
[[8 8 0]
 [6 3 1]
 [0 4 2]]
Multi Layer Perceptron :
[[11  5  0]
 [ 1  9  0]
 [ 3  0  3]]
AdaBoost Classifier :
[[ 0 16  0]
 [ 0 10  0]
 [ 0  0  6]]
Random Forest :
[[13  3  0]
 [ 1  9  0]
 [ 0  0  6]]
Gradient Boosting :
[[12  4  0]
 [ 1  9  0]
 [ 0  0  6]]
Extra Trees :
[[14  2  0]
 [ 2  8  0]
 [ 0  0  6]]
XGBoost :
[[12  4  0]
 [ 1  9  0]
 [ 1  0  5]]
Custom Naive Bayes Classifier :
[[12  4  0]
 [ 0 10  0]
 [ 1  0  5]]
Count-Based Classifier :
[[11  5  0]
 [ 0 10  0]
 [ 3  3  0]]


# 18. Lung Cancer Prediction Dataset

In [215]:
# Loading the dataset
df = pd.read_csv("datasets/cancer-patients.csv")

In [216]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [217]:
# Dropping unwanted columns
df.drop(['index','Patient Id','Age'], axis=1, inplace=True)

In [218]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,1,2,4,5,4,3,2,2,4,3,...,3,4,2,2,3,1,2,3,4,Low
1,1,3,1,5,3,4,2,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,1,4,5,6,5,5,4,6,7,2,...,8,7,9,2,1,4,6,7,2,High
3,1,7,7,7,7,6,7,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,1,6,8,7,7,7,6,7,7,8,...,3,2,4,1,4,2,4,2,3,High


In [219]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1000, 23)

In [220]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [221]:
# Checking the instance counts of the target attribute
df['Level'].value_counts()

High      365
Medium    332
Low       303
Name: Level, dtype: int64

In [222]:
# Encoding the attributes of the dataset
df = encoding(df)

In [223]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [224]:
# Separating dependent and independent variables
X, y = separating(df, 'Level')

In [225]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [226]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

In [227]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.010604
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.020374
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.029698
3,Naive Bayes Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.011661
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.077698
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,2.074851
6,AdaBoost Classifier,0.725,0.725,0.725,0.725,0.870769,0.658113,0.104222
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.148199
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.329705
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.062242


In [228]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
K-Nearest Neighbours :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Support Vector Machine :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Naive Bayes Classifier :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Logistic Regression :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Multi Layer Perceptron :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
AdaBoost Classifier :
[[82  0  0]
 [ 0  0 55]
 [ 0  0 63]]
Random Forest :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Gradient Boosting :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Extra Trees :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
XGBoost :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Custom Naive Bayes Classifier :
[[82  0  0]
 [ 0 55  0]
 [ 0  0 63]]
Count-Based Classifier :
[[82  0  0]
 [ 4 51  0]
 [ 9  1 53]]


# 19. Phishing Website Dataset

In [229]:
# Loading the dataset
df = pd.read_csv("datasets/phishing.csv")

In [230]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [231]:
# Dropping unwanted columns
df.drop(['Index'], axis=1, inplace=True)

In [232]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [233]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(11054, 31)

In [234]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [235]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

 1    6157
-1    4897
Name: class, dtype: int64

In [236]:
# Encoding the attributes of the dataset
df = encoding(df)

In [237]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [238]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [239]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [240]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [241]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.95839,0.95839,0.95839,0.95839,0.958348,0.915751,0.02515
1,K-Nearest Neighbours,0.957485,0.957485,0.957485,0.957485,0.955712,0.91377,0.131551
2,Support Vector Machine,0.953867,0.953867,0.953867,0.953867,0.951184,0.906622,3.246224
3,Naive Bayes Classifier,0.93668,0.93668,0.93668,0.93668,0.93408,0.871534,0.014704
4,Logistic Regression,0.933062,0.933062,0.933062,0.933062,0.930412,0.864168,0.106913
5,Multi Layer Perceptron,0.959747,0.959747,0.959747,0.959747,0.959563,0.918454,68.476524
6,AdaBoost Classifier,0.938037,0.938037,0.938037,0.938037,0.93551,0.874286,0.480819
7,Random Forest,0.96834,0.96834,0.96834,0.96834,0.967362,0.93577,0.458085
8,Gradient Boosting,0.949344,0.949344,0.949344,0.949344,0.947672,0.897203,0.5779
9,Extra Trees,0.969697,0.969697,0.969697,0.969697,0.968792,0.938523,0.428


In [242]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 935   41]
 [  51 1184]]
K-Nearest Neighbours :
[[ 918   58]
 [  36 1199]]
Support Vector Machine :
[[ 906   70]
 [  32 1203]]
Naive Bayes Classifier :
[[ 890   86]
 [  54 1181]]
Logistic Regression :
[[ 886   90]
 [  58 1177]]
Multi Layer Perceptron :
[[ 935   41]
 [  48 1187]]
AdaBoost Classifier :
[[ 892   84]
 [  53 1182]]
Random Forest :
[[ 936   40]
 [  30 1205]]
Gradient Boosting :
[[ 911   65]
 [  47 1188]]
Extra Trees :
[[ 938   38]
 [  29 1206]]
Custom Naive Bayes Classifier :
[[ 890   86]
 [  54 1181]]
Count-Based Classifier :
[[   3  973]
 [   0 1235]]


# 20. Monkey-Pox Patients Dataset

In [243]:
# Loading the dataset
df = pd.read_csv("datasets/monkey-pox.csv")

In [244]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [245]:
# Dropping unwanted columns
df.drop(['Patient_ID'], axis=1, inplace=True)

In [246]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,,False,True,True,True,False,True,False,False,Negative
1,Fever,True,False,True,True,False,False,True,False,Positive
2,Fever,False,True,True,False,False,False,True,False,Positive
3,,True,False,False,False,True,True,True,False,Positive
4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [247]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(25000, 10)

In [248]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [249]:
# Checking the instance counts of the target attribute
df['MonkeyPox'].value_counts()

Positive    15909
Negative     9091
Name: MonkeyPox, dtype: int64

In [250]:
# Encoding the attributes of the dataset
df = encoding(df)

In [251]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [252]:
# Separating dependent and independent variables
X, y = separating(df, 'MonkeyPox')

In [253]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [254]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [255]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.688,0.688,0.688,0.688,0.61885,0.268413,0.029781
1,K-Nearest Neighbours,0.6584,0.6584,0.6584,0.6584,0.601425,0.215884,0.289218
2,Support Vector Machine,0.7022,0.7022,0.7022,0.7022,0.612229,0.288582,51.590538
3,Naive Bayes Classifier,0.6976,0.6976,0.6976,0.6976,0.622968,0.286561,0.01716
4,Logistic Regression,0.6752,0.6752,0.6752,0.6752,0.579249,0.209111,0.037729
5,Multi Layer Perceptron,0.7032,0.7032,0.7032,0.7032,0.626102,0.298675,6.504503
6,AdaBoost Classifier,0.6972,0.6972,0.6972,0.6972,0.628032,0.290375,0.480158
7,Random Forest,0.6904,0.6904,0.6904,0.6904,0.618733,0.271638,0.688989
8,Gradient Boosting,0.7008,0.7008,0.7008,0.7008,0.627005,0.295164,0.664714
9,Extra Trees,0.688,0.688,0.688,0.688,0.61885,0.268413,0.745501


In [256]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[ 679 1075]
 [ 485 2761]]
K-Nearest Neighbours :
[[ 720 1034]
 [ 674 2572]]
Support Vector Machine :
[[ 545 1209]
 [ 280 2966]]
Naive Bayes Classifier :
[[ 654 1100]
 [ 412 2834]]
Logistic Regression :
[[ 452 1302]
 [ 322 2924]]
Multi Layer Perceptron :
[[ 645 1109]
 [ 375 2871]]
AdaBoost Classifier :
[[ 695 1059]
 [ 455 2791]]
Random Forest :
[[ 664 1090]
 [ 458 2788]]
Gradient Boosting :
[[ 666 1088]
 [ 408 2838]]
Extra Trees :
[[ 679 1075]
 [ 485 2761]]
Custom Naive Bayes Classifier :
[[ 654 1100]
 [ 412 2834]]
Count-Based Classifier :
[[   0 1754]
 [   0 3246]]


# 21. Animal Condition Classification Dataset

In [257]:
# Loading the dataset
df = pd.read_csv("datasets/animal-condition.csv")

In [258]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,Dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,Dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,Dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,Dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,Dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [259]:
# Checking the unique values present in the dataset for the attribute 'AnimalName'
df['AnimalName'].value_counts()

Buffaloes            129
Sheep                110
Pig                   63
Fowl                  62
Elephant              59
Duck                  56
Deer                  38
Donkey                38
Birds                 37
cat                   36
Dog                   34
Monkey                28
Goat                  26
Cattle                21
Hamster               18
Tiger                 17
Lion                  16
Rabbit                11
Horse                 10
Chicken                9
Fox                    7
Other Birds            6
horse                  5
chicken                4
Turtle                 4
Pigs                   3
cow                    3
donkey                 2
Goats                  2
White-tailed deer      1
Hyaenas                1
Wolves                 1
Dogs                   1
Fox                    1
Moos                   1
Reindeer               1
mammal                 1
Sika deer              1
cattle                 1
Mule deer              1


In [260]:
# Fixing the values of the attribute 'AnimalName'
df['AnimalName'] = [name.lower() for name in df['AnimalName']]
df['AnimalName'].replace({'black-tailed deer':'deer','white-tailed deer':'deer','mule deer':'deer','sika deer':'deer','reindeer':'deer','elk':'deer','wapiti':'deer','mules':'horse','other birds': 'birds','pigs':'pig', 'dogs': 'dog', 'goats': 'goat'}, inplace = True)

In [261]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [262]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(871, 7)

In [263]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [264]:
# Checking the instance counts of the target attribute
df['Dangerous'].value_counts()

Yes    849
No      20
Name: Dangerous, dtype: int64

In [265]:
# Encoding the attributes of the dataset
df = encoding(df)

In [266]:
# Imputing the missing values if any by replacing it with the mode
df = imputation(df)

In [267]:
# Separating dependent and independent variables
X, y = separating(df, 'Dangerous')

In [268]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [269]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [270]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.971429,0.971429,0.971429,0.971429,0.738439,0.304254,0.005167
1,K-Nearest Neighbours,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.008359
2,Support Vector Machine,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.013727
3,Naive Bayes Classifier,0.971429,0.971429,0.971429,0.971429,0.738439,0.304254,0.004592
4,Logistic Regression,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.006801
5,Multi Layer Perceptron,0.988571,0.988571,0.988571,0.988571,0.74711,0.49422,0.574056
6,AdaBoost Classifier,0.994286,0.994286,0.994286,0.994286,0.75,0.705072,0.091916
7,Random Forest,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.159034
8,Gradient Boosting,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.123892
9,Extra Trees,0.994286,0.994286,0.994286,0.994286,0.75,0.705072,0.075929


In [271]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[  1   1]
 [  4 169]]
K-Nearest Neighbours :
[[  0   2]
 [  0 173]]
Support Vector Machine :
[[  0   2]
 [  0 173]]
Naive Bayes Classifier :
[[  1   1]
 [  4 169]]
Logistic Regression :
[[  0   2]
 [  0 173]]
Multi Layer Perceptron :
[[  1   1]
 [  1 172]]
AdaBoost Classifier :
[[  1   1]
 [  0 173]]
Random Forest :
[[  0   2]
 [  0 173]]
Gradient Boosting :
[[  0   2]
 [  0 173]]
Extra Trees :
[[  1   1]
 [  0 173]]
Custom Naive Bayes Classifier :
[[  2   0]
 [163  10]]
Count-Based Classifier :
[[  0   2]
 [  0 173]]


# 22. Android Malware Detection

In [272]:
# Loading the dataset
df = pd.read_csv("datasets/TUANDROMD.csv")

In [273]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,malware
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,malware
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,malware
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware


In [274]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(4465, 242)

In [275]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [276]:
# Checking the instance counts of the target attribute
df['Label'].value_counts()

malware     3565
goodware     899
Name: Label, dtype: int64

In [277]:
# Encoding the attributes of the dataset
df = encoding(df)

In [278]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [279]:
# Separating dependent and independent variables
X, y = separating(df, 'Label')

In [280]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [281]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test)

Naive Bayes Classifier : index 1 is out of bounds for axis 1 with size 1
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


In [282]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.993281,0.993281,0.993281,0.993281,0.989102,0.978204,0.029439
1,K-Nearest Neighbours,0.989922,0.989922,0.989922,0.989922,0.978029,0.967097,0.070623
2,Support Vector Machine,0.987682,0.987682,0.987682,0.987682,0.990143,0.961327,1.097147
3,Logistic Regression,0.985442,0.985442,0.985442,0.985442,0.984261,0.953805,0.037942
4,Multi Layer Perceptron,0.993281,0.993281,0.993281,0.993281,0.989102,0.978204,3.006236
5,AdaBoost Classifier,0.984323,0.984323,0.984323,0.984323,0.98132,0.950039,0.415181
6,Random Forest,0.995521,0.995521,0.995521,0.995521,0.994984,0.98556,0.197132
7,Gradient Boosting,0.985442,0.985442,0.985442,0.985442,0.984261,0.953805,0.518947
8,Extra Trees,0.995521,0.995521,0.995521,0.995521,0.994984,0.98556,0.184287
9,Custom Naive Bayes Classifier,0.973124,0.973124,0.973124,0.973124,0.967655,0.91531,6.674975


In [283]:
for row in res[1]:
    print(row[0],':')
    print(row[1])

Decision Tree :
[[167   3]
 [  3 720]]
K-Nearest Neighbours :
[[163   7]
 [  2 721]]
Support Vector Machine :
[[169   1]
 [ 10 713]]
Logistic Regression :
[[167   3]
 [ 10 713]]
Multi Layer Perceptron :
[[167   3]
 [  3 720]]
AdaBoost Classifier :
[[166   4]
 [ 10 713]]
Random Forest :
[[169   1]
 [  3 720]]
Gradient Boosting :
[[167   3]
 [ 10 713]]
Extra Trees :
[[169   1]
 [  3 720]]
Custom Naive Bayes Classifier :
[[163   7]
 [ 17 706]]
Count-Based Classifier :
[[  0 170]
 [  0 723]]
