# Experiment 1: Comparative Performance Analysis with the Original Count-Based Classifier

In [1]:
# Importing necessary libraries and packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_functions.RepetitiveTasks import imputation, remove_class, encoding, separating, classification_evaluation
from CountEst.classifiers import CategoricalCBC
from custom_functions.OtherClassifiers import CountBasedClassifier

In [2]:
classifiers = {
                'Optimized Count-Based Classifier': CategoricalCBC(),
                'Original Count-Based Classifier': CountBasedClassifier(logic=1)
         }

# 1. Mushroom Dataset

In [3]:
# Loading the dataset
df = pd.read_csv("datasets/mushrooms.csv")

In [4]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(8124, 23)

In [6]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [7]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [8]:
# Encoding the attributes of the dataset
df = encoding(df)

In [9]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [10]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [11]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [13]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.894769,0.894769,0.894769,0.894769,0.890757,0.805112,2.093671
1,Original Count-Based Classifier,0.893538,0.893538,0.893538,0.893538,0.889386,0.803845,2.077995


# 2. Car Evaluation Dataset

In [14]:
# Loading the dataset
df = pd.read_csv("datasets/car.csv")

In [15]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [16]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1728, 7)

In [17]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [18]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [19]:
# Encoding the attributes of the dataset
df = encoding(df)

In [20]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [21]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [22]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [24]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.679191,0.679191,0.679191,0.679191,0.955626,0.0,0.216751
1,Original Count-Based Classifier,0.679191,0.679191,0.679191,0.679191,0.5,0.0,0.254749


# 3. Breast Cancer Dataset

In [25]:
# Loading the dataset
df = pd.read_csv("datasets/breast-cancer.csv")

In [26]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [27]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(286, 10)

In [28]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [29]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: class, dtype: int64

In [30]:
# Encoding the attributes of the dataset
df = encoding(df)

In [31]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [32]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [33]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [35]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.637931,0.637931,0.637931,0.637931,0.5,0.0,0.038496
1,Original Count-Based Classifier,0.637931,0.637931,0.637931,0.637931,0.5,0.0,0.042885


# 4. Congressional Voting Records Dataset

In [36]:
# Loading the dataset
df = pd.read_csv("datasets/house-votes-84.csv")

In [37]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [38]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(435, 17)

In [39]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [40]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

democrat      267
republican    168
Name: class, dtype: int64

In [41]:
# Encoding the attributes of the dataset
df = encoding(df)

In [42]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [43]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [44]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [46]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.896552,0.896552,0.896552,0.896552,0.87644,0.771784,0.119816
1,Original Count-Based Classifier,0.896552,0.896552,0.896552,0.896552,0.890841,0.776334,0.092965


# 5. Tic-Tac-Toe Endgame Dataset

In [47]:
# Loading the dataset
df = pd.read_csv("datasets/tic-tac-toe.csv")

In [48]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [49]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(958, 10)

In [50]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [51]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

positive    626
negative    332
Name: class, dtype: int64

In [52]:
# Encoding the attributes of the dataset
df = encoding(df)

In [53]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [54]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [55]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [57]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.651042,0.651042,0.651042,0.651042,0.5,0.0,0.106394
1,Original Count-Based Classifier,0.651042,0.651042,0.651042,0.651042,0.5,0.0,0.117999


# 6. Nursery Dataset

In [58]:
# Loading the dataset
df = pd.read_csv("datasets/nursery.csv")

In [59]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [60]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(12960, 9)

In [61]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [62]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: class, dtype: int64

In [63]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [64]:
# Encoding the attributes of the dataset
df = encoding(df)

In [65]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [66]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [67]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [69]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.867284,0.867284,0.867284,0.867284,0.976814,0.813511,2.239698
1,Original Count-Based Classifier,0.595293,0.595293,0.595293,0.595293,0.675068,0.443682,2.148208


# 7. Soybean (Large) Dataset

In [70]:
# Loading the dataset
df = pd.read_csv("datasets/soybean-large.csv")

In [71]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [72]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(307, 36)

In [73]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [74]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

frog-eye-leaf-spot             40
phytophthora-rot               40
alternarialeaf-spot            40
brown-spot                     40
brown-stem-rot                 20
anthracnose                    20
diaporthe-stem-canker          10
purple-seed-stain              10
phyllosticta-leaf-spot         10
bacterial-pustule              10
charcoal-rot                   10
bacterial-blight               10
downy-mildew                   10
powdery-mildew                 10
rhizoctonia-root-rot           10
diaporthe-pod-&-stem-blight     6
cyst-nematode                   6
herbicide-injury                4
2-4-d-injury                    1
Name: class, dtype: int64

In [75]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [76]:
# Encoding the attributes of the dataset
df = encoding(df)

In [77]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [78]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [79]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [81]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.393443,0.393443,0.393443,0.393443,0.989467,0.351931,0.855541
1,Original Count-Based Classifier,0.114754,0.114754,0.114754,0.114754,0.897485,-0.002241,0.872845


# 8. Molecular Biology (Promoter Gene Sequences) Dataset

In [82]:
# Loading the dataset
df = pd.read_csv("datasets/promoters.csv")

In [83]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [84]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace('\t',''))
attributes = [str(i) for i in range(0,59)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','58'],axis=1,inplace=True)

In [85]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,1,2,3,4,5,6,7,8,...,48,49,50,51,52,53,54,55,56,57
0,+,S10,t,a,c,t,a,g,c,a,...,g,g,c,t,t,g,t,c,g,t
1,+,AMPC,t,g,c,t,a,t,c,c,...,g,c,a,t,c,g,c,c,a,a
2,+,AROH,g,t,a,c,t,a,g,a,...,c,c,a,c,c,c,g,g,c,g
3,+,DEOP2,a,a,t,t,g,t,g,a,...,t,a,a,c,a,a,a,c,t,c
4,+,LEU1_TRNA,t,c,g,a,t,a,a,t,...,t,c,c,g,t,g,g,t,a,g


In [86]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(106, 59)

In [87]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [88]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

+    53
-    53
Name: class, dtype: int64

In [89]:
# Encoding the attributes of the dataset
df = encoding(df)

In [90]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [91]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [92]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [94]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.772727,0.772727,0.772727,0.772727,0.772727,0.612372,0.111247
1,Original Count-Based Classifier,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.101765


# 9. Balance Scale Dataset

In [95]:
# Loading the dataset
df = pd.read_csv("datasets/balance-scale.csv")

In [96]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,left-weight,left-distance,right-weight,right-distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [97]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(625, 5)

In [98]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [99]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

R    288
L    288
B     49
Name: class, dtype: int64

In [100]:
# Encoding the attributes of the dataset
df = encoding(df)

In [101]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [102]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [103]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [105]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.896,0.896,0.896,0.896,0.718512,0.81977,0.058389
1,Original Count-Based Classifier,0.896,0.896,0.896,0.896,0.844423,0.81977,0.064809


# 10. Lenses Dataset

In [106]:
# Loading the dataset
df = pd.read_csv("datasets/lenses.csv", delimiter="  ", engine="python")

In [107]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,spectacle_prescription,astigmatic,tear_production_rate
1,1,1,1,1,3.0
2,1,1,1,2,2.0
3,1,1,2,1,3.0
4,1,1,2,2,1.0
5,1,2,1,1,3.0


In [108]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(24, 5)

In [109]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [110]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    15
2     9
Name: class, dtype: int64

In [111]:
# Encoding the attributes of the dataset
df = encoding(df)

In [112]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [113]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [114]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [115]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [116]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.014853
1,Original Count-Based Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.009696


# 11. Molecular Biology (Splice-junction Gene Sequences) Dataset

In [117]:
# Loading the dataset
df = pd.read_csv("datasets/splice.csv")

In [118]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [119]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace(' ',''))
attributes = [str(i) for i in range(0,62)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','61'],axis=1,inplace=True)

In [120]:
# Dropping unwanted columns
df.drop(['instance_name'], axis=1, inplace=True)

In [121]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,EI,C,C,A,G,C,T,G,C,A,...,A,G,C,C,A,G,T,C,T,G
1,EI,A,G,A,C,C,C,G,C,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,G,A,G,G,T,G,A,A,G,...,C,A,C,G,G,G,G,A,T,G
3,EI,G,G,G,C,T,G,C,G,T,...,G,G,T,T,T,T,C,C,C,C
4,EI,G,C,T,C,A,G,C,C,C,...,C,C,T,T,G,A,C,C,C,T


In [122]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3190, 61)

In [123]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [124]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

N     1655
IE     768
EI     767
Name: class, dtype: int64

In [125]:
# Encoding the attributes of the dataset
df = encoding(df)

In [126]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [127]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [128]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [129]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [130]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.5,0.5,0.5,0.5,0.980062,0.0,2.776871
1,Original Count-Based Classifier,0.5,0.5,0.5,0.5,0.94803,0.0,3.263744


# 12. SPECT Heart Dataset

In [131]:
# Loading the dataset
df = pd.read_csv("datasets/SPECT.csv")

In [132]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22
0,1,0,0,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,1
2,1,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


In [133]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(267, 23)

In [134]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [135]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    212
0     55
Name: class, dtype: int64

In [136]:
# Encoding the attributes of the dataset
df = encoding(df)

In [137]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [138]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [139]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [140]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [141]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.833333,0.833333,0.833333,0.833333,0.5,0.0,0.077848
1,Original Count-Based Classifier,0.833333,0.833333,0.833333,0.833333,0.5,0.0,0.089415


# 13. Primary Tumor Dataset

In [142]:
# Loading the dataset
df = pd.read_csv("datasets/primary-tumor.csv")

In [143]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,?,3,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,?,3,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2


In [144]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(339, 18)

In [145]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [146]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1     84
5     39
18    29
11    28
14    24
22    24
2     20
12    16
7     14
4     14
17    10
3      9
13     7
8      6
19     6
10     2
15     2
20     2
6      1
16     1
21     1
Name: class, dtype: int64

In [147]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [148]:
# Encoding the attributes of the dataset
df = encoding(df)

In [149]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [150]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [151]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [152]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [153]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.257576,0.257576,0.257576,0.257576,0.827942,0.0,0.512437
1,Original Count-Based Classifier,0.257576,0.257576,0.257576,0.257576,0.621254,0.0,0.410241


# 14. Chess (King-Rook vs. King-Pawn) Dataset

In [154]:
# Loading the dataset
df = pd.read_csv("datasets/kr-vs-kp.csv")

In [155]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [156]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3196, 37)

In [157]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [158]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

won      1669
nowin    1527
Name: class, dtype: int64

In [159]:
# Encoding the attributes of the dataset
df = encoding(df)

In [160]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [161]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [162]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [163]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [164]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.535937,0.535937,0.535937,0.535937,0.505,0.073058,1.259722
1,Original Count-Based Classifier,0.53125,0.53125,0.53125,0.53125,0.5,0.0,1.314776


# 15. Lymphography Dataset

In [165]:
# Loading the dataset
df = pd.read_csv("datasets/lymphography.csv")

In [166]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,changes in stru,special forms,dislocation of,exclusion of no,no. of nodes in
0,3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
1,2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
2,3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
4,2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


In [167]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(148, 19)

In [168]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [169]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

2    81
3    61
4     4
1     2
Name: class, dtype: int64

In [170]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [171]:
# Encoding the attributes of the dataset
df = encoding(df)

In [172]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [173]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [174]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [175]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [176]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.655172,0.655172,0.655172,0.655172,0.545455,0.241747,0.050319
1,Original Count-Based Classifier,0.758621,0.758621,0.758621,0.758621,0.681818,0.511682,0.036472


# 16. Connect-4 Dataset

In [177]:
# Loading the dataset
df = pd.read_csv("datasets/connect-4.csv")

In [178]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,...,f4,f5,f6,g1,g2,g3,g4,g5,g6,class
0,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
2,b,b,b,b,b,b,o,b,b,b,...,b,b,b,b,b,b,b,b,b,win
3,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
4,o,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win


In [179]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(67557, 43)

In [180]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [181]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

win     44473
loss    16635
draw     6449
Name: class, dtype: int64

In [182]:
# Encoding the attributes of the dataset
df = encoding(df)

In [183]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [184]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [185]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [186]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [187]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.656898,0.656898,0.656898,0.656898,0.727292,0.0,44.665654
1,Original Count-Based Classifier,0.656898,0.656898,0.656898,0.656898,0.527483,0.0,40.834735


# 17. Hayes-Roth Dataset

In [188]:
# Loading the dataset
df = pd.read_csv("datasets/hayes-roth.csv")

In [189]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,hobby,age,educational_level,marital_status,class
0,2,1,1,2,1
1,2,1,3,2,2
2,3,1,4,1,3
3,2,4,2,2,3
4,1,1,3,4,3


In [190]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(160, 5)

In [191]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [192]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    65
2    64
3    31
Name: class, dtype: int64

In [193]:
# Encoding the attributes of the dataset
df = encoding(df)

In [194]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [195]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [196]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [197]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [198]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.65625,0.65625,0.65625,0.65625,0.930267,0.474855,0.024299
1,Original Count-Based Classifier,0.6875,0.6875,0.6875,0.6875,0.848662,0.509164,0.017711


# 18. Lung Cancer Prediction Dataset

In [199]:
# Loading the dataset
df = pd.read_csv("datasets/cancer-patients.csv")

In [200]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [201]:
# Dropping unwanted columns
df.drop(['index','Patient Id','Age'], axis=1, inplace=True)

In [202]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,1,2,4,5,4,3,2,2,4,3,...,3,4,2,2,3,1,2,3,4,Low
1,1,3,1,5,3,4,2,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,1,4,5,6,5,5,4,6,7,2,...,8,7,9,2,1,4,6,7,2,High
3,1,7,7,7,7,6,7,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,1,6,8,7,7,7,6,7,7,8,...,3,2,4,1,4,2,4,2,3,High


In [203]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1000, 23)

In [204]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [205]:
# Checking the instance counts of the target attribute
df['Level'].value_counts()

High      365
Medium    332
Low       303
Name: Level, dtype: int64

In [206]:
# Encoding the attributes of the dataset
df = encoding(df)

In [207]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [208]:
# Separating dependent and independent variables
X, y = separating(df, 'Level')

In [209]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [210]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [211]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.93,0.93,0.93,0.93,0.985888,0.89726,0.35223
1,Original Count-Based Classifier,0.965,0.965,0.965,0.965,0.996971,0.947037,0.386735


# 19. Phishing Website Dataset

In [212]:
# Loading the dataset
df = pd.read_csv("datasets/phishing.csv")

In [213]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [214]:
# Dropping unwanted columns
df.drop(['Index'], axis=1, inplace=True)

In [215]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [216]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(11054, 31)

In [217]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [218]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

 1    6157
-1    4897
Name: class, dtype: int64

In [219]:
# Encoding the attributes of the dataset
df = encoding(df)

In [220]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [221]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [222]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [223]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [224]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.559928,0.559928,0.559928,0.559928,0.501537,0.041464,3.219955
1,Original Count-Based Classifier,0.558571,0.558571,0.558571,0.558571,0.5,0.0,3.295801


# 20. Monkey-Pox Patients Dataset

In [225]:
# Loading the dataset
df = pd.read_csv("datasets/monkey-pox.csv")

In [226]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [227]:
# Dropping unwanted columns
df.drop(['Patient_ID'], axis=1, inplace=True)

In [228]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,,False,True,True,True,False,True,False,False,Negative
1,Fever,True,False,True,True,False,False,True,False,Positive
2,Fever,False,True,True,False,False,False,True,False,Positive
3,,True,False,False,False,True,True,True,False,Positive
4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [229]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(25000, 10)

In [230]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [231]:
# Checking the instance counts of the target attribute
df['MonkeyPox'].value_counts()

Positive    15909
Negative     9091
Name: MonkeyPox, dtype: int64

In [232]:
# Encoding the attributes of the dataset
df = encoding(df)

In [233]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [234]:
# Separating dependent and independent variables
X, y = separating(df, 'MonkeyPox')

In [235]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [236]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [237]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.6492,0.6492,0.6492,0.6492,0.5,0.0,2.461443
1,Original Count-Based Classifier,0.6492,0.6492,0.6492,0.6492,0.5,0.0,2.412771


# 21. Animal Condition Classification Dataset

In [238]:
# Loading the dataset
df = pd.read_csv("datasets/animal-condition.csv")

In [239]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,Dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,Dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,Dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,Dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,Dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [240]:
# Checking the unique values present in the dataset for the attribute 'AnimalName'
df['AnimalName'].value_counts()

Buffaloes            129
Sheep                110
Pig                   63
Fowl                  62
Elephant              59
Duck                  56
Deer                  38
Donkey                38
Birds                 37
cat                   36
Dog                   34
Monkey                28
Goat                  26
Cattle                21
Hamster               18
Tiger                 17
Lion                  16
Rabbit                11
Horse                 10
Chicken                9
Fox                    7
Other Birds            6
horse                  5
chicken                4
Turtle                 4
Pigs                   3
cow                    3
donkey                 2
Goats                  2
White-tailed deer      1
Hyaenas                1
Wolves                 1
Dogs                   1
Fox                    1
Moos                   1
Reindeer               1
mammal                 1
Sika deer              1
cattle                 1
Mule deer              1


In [241]:
# Fixing the values of the attribute 'AnimalName'
df['AnimalName'] = [name.lower() for name in df['AnimalName']]
df['AnimalName'].replace({'black-tailed deer':'deer','white-tailed deer':'deer','mule deer':'deer','sika deer':'deer','reindeer':'deer','elk':'deer','wapiti':'deer','mules':'horse','other birds': 'birds','pigs':'pig', 'dogs': 'dog', 'goats': 'goat'}, inplace = True)

In [242]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [243]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(871, 7)

In [244]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [245]:
# Checking the instance counts of the target attribute
df['Dangerous'].value_counts()

Yes    849
No      20
Name: Dangerous, dtype: int64

In [246]:
# Encoding the attributes of the dataset
df = encoding(df)

In [247]:
# Imputing the missing values if any by replacing it with the mode
df = imputation(df)

In [248]:
# Separating dependent and independent variables
X, y = separating(df, 'Dangerous')

In [249]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [250]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [251]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.058162
1,Original Count-Based Classifier,0.994286,0.994286,0.994286,0.994286,0.75,0.705072,0.081757


# 22. Android Malware Detection

In [252]:
# Loading the dataset
df = pd.read_csv("datasets/TUANDROMD.csv")

In [253]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,malware
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,malware
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,malware
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware


In [254]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(4465, 242)

In [255]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [256]:
# Checking the instance counts of the target attribute
df['Label'].value_counts()

malware     3565
goodware     899
Name: Label, dtype: int64

In [257]:
# Encoding the attributes of the dataset
df = encoding(df)

In [258]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [259]:
# Separating dependent and independent variables
X, y = separating(df, 'Label')

In [260]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [261]:
# Classification and model evaluation
res = classification_evaluation(X_train, X_test, y_train, y_test, classifiers)

In [262]:
res[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Optimized Count-Based Classifier,0.80963,0.80963,0.80963,0.80963,0.5,0.0,11.123571
1,Original Count-Based Classifier,0.80963,0.80963,0.80963,0.80963,0.5,0.0,10.851298
