# Experiment 6: Robustness Analysis (by using Cleanlab for Detecting Label Errors)

In [1]:
# Importing necessary libraries and packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_functions.RepetitiveTasks import imputation, remove_class, encoding, separating, classification_evaluation, oversampling, dataset_error

# 1. Mushroom Dataset

In [2]:
# Loading the dataset
df = pd.read_csv("datasets/mushrooms.csv")

In [3]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(8124, 23)

In [5]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [6]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [7]:
# Encoding the attributes of the dataset
df = encoding(df)

In [8]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [9]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [10]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.011187
1,K-Nearest Neighbours,0.995077,0.995077,0.995077,0.995077,0.995255,0.990192,0.086799
2,Support Vector Machine,0.993231,0.993231,0.993231,0.993231,0.993106,0.986458,1.774661
3,Naive Bayes Classifier,0.966769,0.966769,0.966769,0.966769,0.965658,0.934813,0.008688
4,Logistic Regression,0.947692,0.947692,0.947692,0.947692,0.947734,0.895278,0.185307
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,2.059784
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.198539
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.20199
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.390069
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.168355


In [12]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 110 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.0097
1,K-Nearest Neighbours,0.995006,0.995006,0.995006,0.995006,0.995255,0.99004,0.232434
2,Support Vector Machine,0.998752,0.998752,0.998752,0.998752,0.998814,0.9975,1.507301
3,Naive Bayes Classifier,0.97191,0.97191,0.97191,0.97191,0.970356,0.945008,0.008703
4,Logistic Regression,0.952559,0.952559,0.952559,0.952559,0.952691,0.904952,0.22562
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,1.582551
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.201098
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.176652
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.402133
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.173868



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.999385,0.999385,0.999385,0.999385,0.999423,0.998765,0.013665
1,K-Nearest Neighbours,0.992,0.992,0.992,0.992,0.992494,0.984074,0.071512
2,Support Vector Machine,0.990154,0.990154,0.990154,0.990154,0.990274,0.98024,1.698444
3,Naive Bayes Classifier,0.954462,0.954462,0.954462,0.954462,0.951252,0.911842,0.010377
4,Logistic Regression,0.945846,0.945846,0.945846,0.945846,0.945855,0.891298,0.4328
5,Multi Layer Perceptron,0.995692,0.995692,0.995692,0.995692,0.995714,0.991349,4.289693
6,AdaBoost Classifier,0.997538,0.997538,0.997538,0.997538,0.997691,0.995069,0.227488
7,Random Forest,0.996308,0.996308,0.996308,0.996308,0.996454,0.992598,0.179504
8,Gradient Boosting,0.998769,0.998769,0.998769,0.998769,0.998845,0.997531,0.381624
9,Extra Trees,0.996308,0.996308,0.996308,0.996308,0.996454,0.992598,0.168857


# 2. Car Evaluation Dataset

In [13]:
# Loading the dataset
df = pd.read_csv("datasets/car.csv")

In [14]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [15]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1728, 7)

In [16]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [17]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [18]:
# Encoding the attributes of the dataset
df = encoding(df)

In [19]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [20]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [21]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [22]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.995868,0.995868,0.995868,0.995868,0.997364,0.994497,0.007129
1,K-Nearest Neighbours,0.916322,0.916322,0.916322,0.916322,0.981252,0.89183,0.02871
2,Support Vector Machine,0.988636,0.988636,0.988636,0.988636,0.999773,0.98488,0.847215
3,Naive Bayes Classifier,0.872934,0.872934,0.872934,0.872934,0.974651,0.832241,0.006331
4,Logistic Regression,0.554752,0.554752,0.554752,0.554752,0.772337,0.409525,0.02313
5,Multi Layer Perceptron,0.997934,0.997934,0.997934,0.997934,0.999996,0.997246,2.024867
6,AdaBoost Classifier,0.753099,0.753099,0.753099,0.753099,0.881016,0.684228,0.120181
7,Random Forest,0.995868,0.995868,0.995868,0.995868,0.99994,0.994494,0.183053
8,Gradient Boosting,0.989669,0.989669,0.989669,0.989669,0.999441,0.986286,0.69205
9,Extra Trees,0.993802,0.993802,0.993802,0.993802,0.999878,0.991729,0.145251


In [24]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 683 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.991422,0.991422,0.991422,0.991422,0.99395,0.988526,0.006493
1,K-Nearest Neighbours,0.970588,0.970588,0.970588,0.970588,0.991194,0.960977,0.023322
2,Support Vector Machine,0.988971,0.988971,0.988971,0.988971,0.999945,0.985269,0.533297
3,Naive Bayes Classifier,0.933824,0.933824,0.933824,0.933824,0.99702,0.912461,0.006039
4,Logistic Regression,0.675245,0.675245,0.675245,0.675245,0.809373,0.567108,0.022507
5,Multi Layer Perceptron,0.991422,0.991422,0.991422,0.991422,0.999881,0.988521,1.58175
6,AdaBoost Classifier,0.670343,0.670343,0.670343,0.670343,0.837938,0.578017,0.102396
7,Random Forest,0.998775,0.998775,0.998775,0.998775,0.999996,0.998357,0.170042
8,Gradient Boosting,0.997549,0.997549,0.997549,0.997549,1.0,0.996718,0.525859
9,Extra Trees,0.997549,0.997549,0.997549,0.997549,0.999982,0.996719,0.148833


# 3. Breast Cancer Dataset

In [25]:
# Loading the dataset
df = pd.read_csv("datasets/breast-cancer.csv")

In [26]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [27]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(286, 10)

In [28]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [29]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: class, dtype: int64

In [30]:
# Encoding the attributes of the dataset
df = encoding(df)

In [31]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [32]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [33]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [34]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.740741,0.740741,0.740741,0.740741,0.736111,0.473686,0.004451
1,K-Nearest Neighbours,0.703704,0.703704,0.703704,0.703704,0.711111,0.421184,0.005604
2,Support Vector Machine,0.703704,0.703704,0.703704,0.703704,0.708333,0.414371,0.022906
3,Naive Bayes Classifier,0.703704,0.703704,0.703704,0.703704,0.7,0.4,0.00385
4,Logistic Regression,0.716049,0.716049,0.716049,0.716049,0.713889,0.426725,0.005593
5,Multi Layer Perceptron,0.691358,0.691358,0.691358,0.691358,0.688889,0.376848,4.63579
6,AdaBoost Classifier,0.679012,0.679012,0.679012,0.679012,0.680556,0.358902,0.047035
7,Random Forest,0.790123,0.790123,0.790123,0.790123,0.783333,0.57307,0.102004
8,Gradient Boosting,0.703704,0.703704,0.703704,0.703704,0.705556,0.408597,0.046179
9,Extra Trees,0.777778,0.777778,0.777778,0.777778,0.766667,0.548795,0.076404


In [36]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 118 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.887097,0.887097,0.887097,0.887097,0.865663,0.755732,0.003917
1,K-Nearest Neighbours,0.903226,0.903226,0.903226,0.903226,0.90524,0.798118,0.004907
2,Support Vector Machine,0.903226,0.903226,0.903226,0.903226,0.896321,0.792642,0.00821
3,Naive Bayes Classifier,0.919355,0.919355,0.919355,0.919355,0.91806,0.829216,0.003656
4,Logistic Regression,0.887097,0.887097,0.887097,0.887097,0.883501,0.760667,0.00469
5,Multi Layer Perceptron,0.887097,0.887097,0.887097,0.887097,0.883501,0.760667,0.365848
6,AdaBoost Classifier,0.951613,0.951613,0.951613,0.951613,0.95262,0.897764,0.044091
7,Random Forest,0.919355,0.919355,0.919355,0.919355,0.909142,0.826149,0.086794
8,Gradient Boosting,0.919355,0.919355,0.919355,0.919355,0.909142,0.826149,0.046434
9,Extra Trees,0.935484,0.935484,0.935484,0.935484,0.930881,0.861761,0.075686



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.925926,0.925926,0.925926,0.925926,0.927056,0.84246,0.003908
1,K-Nearest Neighbours,0.901235,0.901235,0.901235,0.901235,0.900199,0.789479,0.005201
2,Support Vector Machine,0.888889,0.888889,0.888889,0.888889,0.875332,0.756709,0.014949
3,Naive Bayes Classifier,0.91358,0.91358,0.91358,0.91358,0.909814,0.813715,0.004464
4,Logistic Regression,0.901235,0.901235,0.901235,0.901235,0.900199,0.789479,0.006741
5,Multi Layer Perceptron,0.925926,0.925926,0.925926,0.925926,0.91943,0.838859,0.880546
6,AdaBoost Classifier,0.925926,0.925926,0.925926,0.925926,0.91943,0.838859,0.061191
7,Random Forest,0.938272,0.938272,0.938272,0.938272,0.936671,0.867041,0.108193
8,Gradient Boosting,0.950617,0.950617,0.950617,0.950617,0.946286,0.892573,0.046787
9,Extra Trees,0.938272,0.938272,0.938272,0.938272,0.936671,0.867041,0.060627


# 4. Congressional Voting Records Dataset

In [37]:
# Loading the dataset
df = pd.read_csv("datasets/house-votes-84.csv")

In [38]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [39]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(435, 17)

In [40]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [41]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

democrat      267
republican    168
Name: class, dtype: int64

In [42]:
# Encoding the attributes of the dataset
df = encoding(df)

In [43]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [44]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [45]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.942529,0.942529,0.942529,0.942529,0.926555,0.874531,0.013426
1,K-Nearest Neighbours,0.931034,0.931034,0.931034,0.931034,0.924827,0.849654,0.050547
2,Support Vector Machine,0.977011,0.977011,0.977011,0.977011,0.974942,0.949885,0.019197
3,Naive Bayes Classifier,0.896552,0.896552,0.896552,0.896552,0.890841,0.776334,0.00844
4,Logistic Regression,0.988506,0.988506,0.988506,0.988506,0.983871,0.975071,0.004786
5,Multi Layer Perceptron,0.988506,0.988506,0.988506,0.988506,0.983871,0.975071,1.938871
6,AdaBoost Classifier,0.965517,0.965517,0.965517,0.965517,0.951613,0.925904,0.049384
7,Random Forest,0.977011,0.977011,0.977011,0.977011,0.974942,0.949885,0.102221
8,Gradient Boosting,0.965517,0.965517,0.965517,0.965517,0.951613,0.925904,0.061303
9,Extra Trees,0.977011,0.977011,0.977011,0.977011,0.974942,0.949885,0.066567


In [47]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 5 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.931034,0.931034,0.931034,0.931034,0.917627,0.848555,0.005333
1,K-Nearest Neighbours,0.931034,0.931034,0.931034,0.931034,0.924827,0.849654,0.043456
2,Support Vector Machine,0.965517,0.965517,0.965517,0.965517,0.966014,0.925651,0.022493
3,Naive Bayes Classifier,0.896552,0.896552,0.896552,0.896552,0.890841,0.776334,0.004379
4,Logistic Regression,0.965517,0.965517,0.965517,0.965517,0.966014,0.925651,0.018751
5,Multi Layer Perceptron,0.965517,0.965517,0.965517,0.965517,0.958813,0.924577,1.270464
6,AdaBoost Classifier,0.965517,0.965517,0.965517,0.965517,0.958813,0.924577,0.060482
7,Random Forest,0.977011,0.977011,0.977011,0.977011,0.982143,0.951758,0.105186
8,Gradient Boosting,0.988506,0.988506,0.988506,0.988506,0.991071,0.975423,0.066846
9,Extra Trees,0.977011,0.977011,0.977011,0.977011,0.982143,0.951758,0.062347



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.942529,0.942529,0.942529,0.942529,0.933756,0.874082,0.005486
1,K-Nearest Neighbours,0.931034,0.931034,0.931034,0.931034,0.924827,0.849654,0.008375
2,Support Vector Machine,0.965517,0.965517,0.965517,0.965517,0.966014,0.925651,0.013212
3,Naive Bayes Classifier,0.896552,0.896552,0.896552,0.896552,0.890841,0.776334,0.004483
4,Logistic Regression,0.965517,0.965517,0.965517,0.965517,0.966014,0.925651,0.004809
5,Multi Layer Perceptron,0.965517,0.965517,0.965517,0.965517,0.958813,0.924577,1.835461
6,AdaBoost Classifier,0.965517,0.965517,0.965517,0.965517,0.958813,0.924577,0.071666
7,Random Forest,0.977011,0.977011,0.977011,0.977011,0.982143,0.951758,0.104764
8,Gradient Boosting,0.977011,0.977011,0.977011,0.977011,0.982143,0.951758,0.069094
9,Extra Trees,0.977011,0.977011,0.977011,0.977011,0.982143,0.951758,0.05299


# 5. Tic-Tac-Toe Endgame Dataset

In [48]:
# Loading the dataset
df = pd.read_csv("datasets/tic-tac-toe.csv")

In [49]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [50]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(958, 10)

In [51]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [52]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

positive    626
negative    332
Name: class, dtype: int64

In [53]:
# Encoding the attributes of the dataset
df = encoding(df)

In [54]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [55]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [56]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [57]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.948207,0.948207,0.948207,0.948207,0.948222,0.896444,0.005639
1,K-Nearest Neighbours,0.804781,0.804781,0.804781,0.804781,0.804444,0.618021,0.01137
2,Support Vector Machine,0.948207,0.948207,0.948207,0.948207,0.948317,0.897833,0.176388
3,Naive Bayes Classifier,0.673307,0.673307,0.673307,0.673307,0.673302,0.346603,0.004292
4,Logistic Regression,0.625498,0.625498,0.625498,0.625498,0.625841,0.255457,0.004861
5,Multi Layer Perceptron,0.800797,0.800797,0.800797,0.800797,0.800952,0.603632,1.822042
6,AdaBoost Classifier,0.741036,0.741036,0.741036,0.741036,0.741175,0.483456,0.064106
7,Random Forest,0.976096,0.976096,0.976096,0.976096,0.976095,0.95219,0.123388
8,Gradient Boosting,0.896414,0.896414,0.896414,0.896414,0.896413,0.792825,0.081163
9,Extra Trees,0.972112,0.972112,0.972112,0.972112,0.972159,0.944497,0.076709


In [59]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 507 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.934211,0.934211,0.934211,0.934211,0.930672,0.86744,0.004218
1,K-Nearest Neighbours,0.934211,0.934211,0.934211,0.934211,0.933473,0.866947,0.007214
2,Support Vector Machine,0.960526,0.960526,0.960526,0.960526,0.960084,0.920168,0.048999
3,Naive Bayes Classifier,0.953947,0.953947,0.953947,0.953947,0.956933,0.909497,0.003891
4,Logistic Regression,0.756579,0.756579,0.756579,0.756579,0.757353,0.512246,0.004631
5,Multi Layer Perceptron,0.967105,0.967105,0.967105,0.967105,0.966036,0.933463,0.934174
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.053227
7,Random Forest,0.960526,0.960526,0.960526,0.960526,0.955882,0.922486,0.106841
8,Gradient Boosting,0.973684,0.973684,0.973684,0.973684,0.973389,0.946779,0.065416
9,Extra Trees,0.980263,0.980263,0.980263,0.980263,0.977941,0.960688,0.065822



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.864542,0.864542,0.864542,0.864542,0.867489,0.729578,0.004316
1,K-Nearest Neighbours,0.808765,0.808765,0.808765,0.808765,0.809665,0.615368,0.011176
2,Support Vector Machine,0.864542,0.864542,0.864542,0.864542,0.871753,0.737106,0.149048
3,Naive Bayes Classifier,0.896414,0.896414,0.896414,0.896414,0.902055,0.797287,0.004013
4,Logistic Regression,0.713147,0.713147,0.713147,0.713147,0.714498,0.425546,0.004516
5,Multi Layer Perceptron,0.812749,0.812749,0.812749,0.812749,0.819583,0.633743,0.827705
6,AdaBoost Classifier,0.916335,0.916335,0.916335,0.916335,0.922858,0.838435,0.068517
7,Random Forest,0.900398,0.900398,0.900398,0.900398,0.900246,0.798174,0.126804
8,Gradient Boosting,0.876494,0.876494,0.876494,0.876494,0.881251,0.756177,0.077303
9,Extra Trees,0.880478,0.880478,0.880478,0.880478,0.881574,0.759185,0.076009


# 6. Nursery Dataset

In [60]:
# Loading the dataset
df = pd.read_csv("datasets/nursery.csv")

In [61]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [62]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(12960, 9)

In [63]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [64]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: class, dtype: int64

In [65]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [66]:
# Encoding the attributes of the dataset
df = encoding(df)

In [67]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [68]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [69]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.997685,0.997685,0.997685,0.997685,0.996848,0.996606,0.013629
1,K-Nearest Neighbours,0.939043,0.939043,0.939043,0.939043,0.985792,0.910301,0.096007
2,Support Vector Machine,0.951389,0.951389,0.951389,0.951389,0.995864,0.928532,5.526364
3,Naive Bayes Classifier,0.912809,0.912809,0.912809,0.912809,0.986501,0.871808,0.011046
4,Logistic Regression,0.769676,0.769676,0.769676,0.769676,0.9209,0.659953,0.462635
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,4.596569
6,AdaBoost Classifier,0.83912,0.83912,0.83912,0.83912,0.941626,0.773875,0.262693
7,Random Forest,0.983796,0.983796,0.983796,0.983796,0.999351,0.976258,0.337092
8,Gradient Boosting,0.980324,0.980324,0.980324,0.980324,0.999394,0.971168,1.689429
9,Extra Trees,0.978781,0.978781,0.978781,0.978781,0.998134,0.968902,0.392128


In [71]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 1252 potential label errors

Results after removing label errors from the entire dataset:
Decision Tree : Number of classes in y_true not equal to the number of columns in 'y_score'
K-Nearest Neighbours : Number of classes in y_true not equal to the number of columns in 'y_score'
Support Vector Machine : Number of classes in y_true not equal to the number of columns in 'y_score'
Naive Bayes Classifier : Number of classes in y_true not equal to the number of columns in 'y_score'
Logistic Regression : Number of classes in y_true not equal to the number of columns in 'y_score'
Multi Layer Perceptron : Number of classes in y_true not equal to the number of columns in 'y_score'
AdaBoost Classifier : Number of classes in y_true not equal to the number of columns in 'y_score'
Random Forest : Number of classes in y_true not equal to the number of columns in 'y_score'
Gradient Boosting : Number of classes in y_true not equal to the numb

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT


# 7. Soybean (Large) Dataset

In [72]:
# Loading the dataset
df = pd.read_csv("datasets/soybean-large.csv")

In [73]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [74]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(307, 36)

In [75]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [76]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

frog-eye-leaf-spot             40
phytophthora-rot               40
alternarialeaf-spot            40
brown-spot                     40
brown-stem-rot                 20
anthracnose                    20
diaporthe-stem-canker          10
purple-seed-stain              10
phyllosticta-leaf-spot         10
bacterial-pustule              10
charcoal-rot                   10
bacterial-blight               10
downy-mildew                   10
powdery-mildew                 10
rhizoctonia-root-rot           10
diaporthe-pod-&-stem-blight     6
cyst-nematode                   6
herbicide-injury                4
2-4-d-injury                    1
Name: class, dtype: int64

In [77]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [78]:
# Encoding the attributes of the dataset
df = encoding(df)

In [79]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [80]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [81]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [82]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.941176,0.941176,0.941176,0.941176,0.969473,0.937536,0.00785
1,K-Nearest Neighbours,0.941176,0.941176,0.941176,0.941176,0.999038,0.937345,0.013263
2,Support Vector Machine,0.970588,0.970588,0.970588,0.970588,0.998573,0.968786,0.082286
3,Naive Bayes Classifier,0.955882,0.955882,0.955882,0.955882,0.998584,0.952987,0.009837
4,Logistic Regression,0.933824,0.933824,0.933824,0.933824,0.998692,0.929707,0.162248
5,Multi Layer Perceptron,0.941176,0.941176,0.941176,0.941176,0.998041,0.937869,1.144713
6,AdaBoost Classifier,0.308824,0.308824,0.308824,0.308824,0.800248,0.36097,0.082285
7,Random Forest,0.985294,0.985294,0.985294,0.985294,0.999441,0.984335,0.116096
8,Gradient Boosting,0.970588,0.970588,0.970588,0.970588,0.999244,0.969117,1.043208
9,Extra Trees,0.977941,0.977941,0.977941,0.977941,0.999335,0.976477,0.061011


In [84]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 20 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.992424,0.992424,0.992424,0.992424,0.996818,0.991956,0.007517
1,K-Nearest Neighbours,0.969697,0.969697,0.969697,0.969697,0.999757,0.967889,0.040344
2,Support Vector Machine,0.984848,0.984848,0.984848,0.984848,0.999952,0.984039,0.103406
3,Naive Bayes Classifier,0.992424,0.992424,0.992424,0.992424,1.0,0.991953,0.022025
4,Logistic Regression,0.992424,0.992424,0.992424,0.992424,0.999952,0.991958,0.314759
5,Multi Layer Perceptron,0.992424,0.992424,0.992424,0.992424,1.0,0.991958,1.759603
6,AdaBoost Classifier,0.378788,0.378788,0.378788,0.378788,0.868511,0.404829,0.068865
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.116188
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.026585
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.058486


# 8. Molecular Biology (Promoter Gene Sequences) Dataset

In [85]:
# Loading the dataset
df = pd.read_csv("datasets/promoters.csv")

In [86]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [87]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace('\t',''))
attributes = [str(i) for i in range(0,59)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','58'],axis=1,inplace=True)

In [88]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,1,2,3,4,5,6,7,8,...,48,49,50,51,52,53,54,55,56,57
0,+,S10,t,a,c,t,a,g,c,a,...,g,g,c,t,t,g,t,c,g,t
1,+,AMPC,t,g,c,t,a,t,c,c,...,g,c,a,t,c,g,c,c,a,a
2,+,AROH,g,t,a,c,t,a,g,a,...,c,c,a,c,c,c,g,g,c,g
3,+,DEOP2,a,a,t,t,g,t,g,a,...,t,a,a,c,a,a,a,c,t,c
4,+,LEU1_TRNA,t,c,g,a,t,a,a,t,...,t,c,c,g,t,g,g,t,a,g


In [89]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(106, 59)

In [90]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [91]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

+    53
-    53
Name: class, dtype: int64

In [92]:
# Encoding the attributes of the dataset
df = encoding(df)

In [93]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [94]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [95]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.004552
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.008895
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.005925
3,Naive Bayes Classifier,0.909091,0.909091,0.909091,0.909091,0.909091,0.83205,0.005745
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.009815
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.222806
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.00438
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.083604
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.024887
9,Extra Trees,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.071701


In [97]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 1 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.00466
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.019427
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.009188
3,Naive Bayes Classifier,0.909091,0.909091,0.909091,0.909091,0.909091,0.83205,0.005576
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.016665
5,Multi Layer Perceptron,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.298235
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.006412
7,Random Forest,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.102021
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.03821
9,Extra Trees,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.097444



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.954545,0.954545,0.954545,0.954545,0.954545,0.912871,0.005061
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.007307
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.005568
3,Naive Bayes Classifier,0.909091,0.909091,0.909091,0.909091,0.909091,0.83205,0.005559
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.010913
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.284181
6,AdaBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.042
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.087523
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.048887
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.06837


# 9. Balance Scale Dataset

In [98]:
# Loading the dataset
df = pd.read_csv("datasets/balance-scale.csv")

In [99]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,left-weight,left-distance,right-weight,right-distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [100]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(625, 5)

In [101]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [102]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

R    288
L    288
B     49
Name: class, dtype: int64

In [103]:
# Encoding the attributes of the dataset
df = encoding(df)

In [104]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [105]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [106]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.752,0.752,0.752,0.752,0.713733,0.580206,0.004984
1,K-Nearest Neighbours,0.816,0.816,0.816,0.816,0.820842,0.682363,0.006856
2,Support Vector Machine,0.904,0.904,0.904,0.904,0.953061,0.835157,0.033845
3,Naive Bayes Classifier,0.896,0.896,0.896,0.896,0.842315,0.820935,0.005956
4,Logistic Regression,0.84,0.84,0.84,0.84,0.954595,0.733634,0.00919
5,Multi Layer Perceptron,0.984,0.984,0.984,0.984,0.997873,0.97243,0.469222
6,AdaBoost Classifier,0.936,0.936,0.936,0.936,0.858374,0.899371,0.031925
7,Random Forest,0.808,0.808,0.808,0.808,0.834898,0.666174,0.06232
8,Gradient Boosting,0.888,0.888,0.888,0.888,0.903488,0.802793,0.10434
9,Extra Trees,0.8,0.8,0.8,0.8,0.769144,0.650974,0.055898


In [108]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 88 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.754717,0.754717,0.754717,0.754717,0.744676,0.601393,0.004334
1,K-Nearest Neighbours,0.858491,0.858491,0.858491,0.858491,0.90377,0.759877,0.006003
2,Support Vector Machine,0.924528,0.924528,0.924528,0.924528,0.96921,0.871729,0.020393
3,Naive Bayes Classifier,0.896226,0.896226,0.896226,0.896226,0.865628,0.827004,0.003845
4,Logistic Regression,0.95283,0.95283,0.95283,0.95283,0.986001,0.921112,0.005848
5,Multi Layer Perceptron,0.981132,0.981132,0.981132,0.981132,0.998405,0.968187,0.325776
6,AdaBoost Classifier,0.924528,0.924528,0.924528,0.924528,0.892343,0.880582,0.030843
7,Random Forest,0.867925,0.867925,0.867925,0.867925,0.878451,0.776326,0.061357
8,Gradient Boosting,0.858491,0.858491,0.858491,0.858491,0.925058,0.755571,0.097648
9,Extra Trees,0.849057,0.849057,0.849057,0.849057,0.826919,0.743712,0.052386


# 10. Lenses Dataset

In [109]:
# Loading the dataset
df = pd.read_csv("datasets/lenses.csv", delimiter="  ", engine="python")

In [110]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,spectacle_prescription,astigmatic,tear_production_rate
1,1,1,1,1,3.0
2,1,1,1,2,2.0
3,1,1,2,1,3.0
4,1,1,2,2,1.0
5,1,2,1,1,3.0


In [111]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(24, 5)

In [112]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [113]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    15
2     9
Name: class, dtype: int64

In [114]:
# Encoding the attributes of the dataset
df = encoding(df)

In [115]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [116]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [117]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [118]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.4,0.4,0.4,0.4,0.25,-0.408248,0.006096
1,K-Nearest Neighbours,0.4,0.4,0.4,0.4,0.25,-0.408248,0.004697
2,Support Vector Machine,0.4,0.4,0.4,0.4,0.25,-0.408248,0.003586
3,Naive Bayes Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.00471
4,Logistic Regression,0.4,0.4,0.4,0.4,0.25,-0.408248,0.004083
5,Multi Layer Perceptron,0.4,0.4,0.4,0.4,0.25,-0.408248,0.070902
6,AdaBoost Classifier,0.4,0.4,0.4,0.4,0.25,-0.408248,0.027735
7,Random Forest,0.4,0.4,0.4,0.4,0.25,-0.408248,0.051564
8,Gradient Boosting,0.4,0.4,0.4,0.4,0.25,-0.408248,0.02
9,Extra Trees,0.4,0.4,0.4,0.4,0.25,-0.408248,0.041917


In [119]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********





Cleanlab found 4 potential label errors

Results after removing label errors from the entire dataset:
Decision Tree : Only one class present in y_true. ROC AUC score is not defined in that case.
K-Nearest Neighbours : Only one class present in y_true. ROC AUC score is not defined in that case.
Support Vector Machine : Only one class present in y_true. ROC AUC score is not defined in that case.
Naive Bayes Classifier : Only one class present in y_true. ROC AUC score is not defined in that case.
Logistic Regression : Only one class present in y_true. ROC AUC score is not defined in that case.
Multi Layer Perceptron : Only one class present in y_true. ROC AUC score is not defined in that case.
AdaBoost Classifier : Only one class present in y_true. ROC AUC score is not defined in that case.
Random Forest : Only one class present in y_true. ROC AUC score is not defined in that case.
Gradient Boosting : Only one class present in y_true. ROC AUC score is not defined in that case.
Extra Trees

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
Decision Tree : Only one class present in y_true. ROC AUC score is not defined in that case.
K-Nearest Neighbours : Only one class present in y_true. ROC AUC score is not defined in that case.
Support Vector Machine : Only one class present in y_true. ROC AUC score is not defined in that case.
Naive Bayes Classifier : Only one class present in y_true. ROC AUC score is not defined in that case.
Logistic Regression : Only one class present in y_true. ROC AUC score is not defined in that case.
Multi Layer Perceptron : Only one class present in y_true. ROC AUC score is not defined in that case.
AdaBoost Classifier : Only one class present in y_true. ROC AUC score is not defined in that case.
Random Forest : Only one class present in y_true. ROC AUC score is not defined in that case.
Gradient Boosting : Only one class present in y_true. ROC AUC score is not defined in that case

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT


# 11. Molecular Biology (Splice-junction Gene Sequences) Dataset

In [120]:
# Loading the dataset
df = pd.read_csv("datasets/splice.csv")

In [121]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,instance_name,sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [122]:
# Formatting columns to create a proper dataset
df['sequence'] = df['sequence'].apply(lambda x: x.replace(' ',''))
attributes = [str(i) for i in range(0,62)]
df[attributes] = df['sequence'].str.split(pat='', expand=True)
df.drop(['sequence','0','61'],axis=1,inplace=True)

In [123]:
# Dropping unwanted columns
df.drop(['instance_name'], axis=1, inplace=True)

In [124]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,EI,C,C,A,G,C,T,G,C,A,...,A,G,C,C,A,G,T,C,T,G
1,EI,A,G,A,C,C,C,G,C,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,G,A,G,G,T,G,A,A,G,...,C,A,C,G,G,G,G,A,T,G
3,EI,G,G,G,C,T,G,C,G,T,...,G,G,T,T,T,T,C,C,C,C
4,EI,G,C,T,C,A,G,C,C,C,...,C,C,T,T,G,A,C,C,C,T


In [125]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3190, 61)

In [126]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [127]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

N     1655
IE     768
EI     767
Name: class, dtype: int64

In [128]:
# Encoding the attributes of the dataset
df = encoding(df)

In [129]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [130]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [131]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [132]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [133]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.926485,0.926485,0.926485,0.926485,0.944744,0.889752,0.032036
1,K-Nearest Neighbours,0.749245,0.749245,0.749245,0.749245,0.930619,0.65751,0.140609
2,Support Vector Machine,0.916415,0.916415,0.916415,0.916415,0.981226,0.87481,3.580277
3,Naive Bayes Classifier,0.960725,0.960725,0.960725,0.960725,0.996101,0.941102,0.011866
4,Logistic Regression,0.878147,0.878147,0.878147,0.878147,0.963616,0.817638,0.254167
5,Multi Layer Perceptron,0.90433,0.90433,0.90433,0.90433,0.981682,0.856737,4.594161
6,AdaBoost Classifier,0.944612,0.944612,0.944612,0.944612,0.970434,0.917204,0.317855
7,Random Forest,0.970796,0.970796,0.970796,0.970796,0.996765,0.956341,0.346277
8,Gradient Boosting,0.971803,0.971803,0.971803,0.971803,0.997768,0.957977,2.791913
9,Extra Trees,0.977845,0.977845,0.977845,0.977845,0.997293,0.96679,0.352638


In [134]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 17 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.934277,0.934277,0.934277,0.934277,0.950672,0.901419,0.029552
1,K-Nearest Neighbours,0.743175,0.743175,0.743175,0.743175,0.931049,0.650558,0.144753
2,Support Vector Machine,0.91911,0.91911,0.91911,0.91911,0.983002,0.878818,3.561145
3,Naive Bayes Classifier,0.964611,0.964611,0.964611,0.964611,0.997433,0.946954,0.011568
4,Logistic Regression,0.878665,0.878665,0.878665,0.878665,0.965181,0.818461,0.68295
5,Multi Layer Perceptron,0.904954,0.904954,0.904954,0.904954,0.980678,0.85769,4.445339
6,AdaBoost Classifier,0.938322,0.938322,0.938322,0.938322,0.970083,0.908599,0.286869
7,Random Forest,0.968655,0.968655,0.968655,0.968655,0.997761,0.953211,0.338255
8,Gradient Boosting,0.977755,0.977755,0.977755,0.977755,0.998609,0.966838,2.781944
9,Extra Trees,0.984833,0.984833,0.984833,0.984833,0.998664,0.977286,0.349299


# 12. SPECT Heart Dataset

In [135]:
# Loading the dataset
df = pd.read_csv("datasets/SPECT.csv")

In [136]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22
0,1,0,0,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,1
2,1,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


In [137]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(267, 23)

In [138]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [139]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    212
0     55
Name: class, dtype: int64

In [140]:
# Encoding the attributes of the dataset
df = encoding(df)

In [141]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [142]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [143]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.703704,0.703704,0.703704,0.703704,0.6,0.16641,0.004461
1,K-Nearest Neighbours,0.722222,0.722222,0.722222,0.722222,0.611111,0.188982,0.008383
2,Support Vector Machine,0.740741,0.740741,0.740741,0.740741,0.533333,0.066667,0.010961
3,Naive Bayes Classifier,0.777778,0.777778,0.777778,0.777778,0.777778,0.445789,0.005432
4,Logistic Regression,0.759259,0.759259,0.759259,0.759259,0.544444,0.09325,0.004516
5,Multi Layer Perceptron,0.796296,0.796296,0.796296,0.796296,0.566667,0.158114,0.849296
6,AdaBoost Classifier,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.045219
7,Random Forest,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.085742
8,Gradient Boosting,0.777778,0.777778,0.777778,0.777778,0.555556,0.123278,0.047036
9,Extra Trees,0.759259,0.759259,0.759259,0.759259,0.588889,0.170561,0.063137


In [145]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 4 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.641509,0.641509,0.641509,0.641509,0.531944,0.049001,0.004097
1,K-Nearest Neighbours,0.735849,0.735849,0.735849,0.735849,0.638889,0.225555,0.025399
2,Support Vector Machine,0.754717,0.754717,0.754717,0.754717,0.547222,0.090049,0.013219
3,Naive Bayes Classifier,0.811321,0.811321,0.811321,0.811321,0.8375,0.526374,0.005856
4,Logistic Regression,0.735849,0.735849,0.735849,0.735849,0.536111,0.066083,0.004629
5,Multi Layer Perceptron,0.792453,0.792453,0.792453,0.792453,0.569444,0.146856,0.652455
6,AdaBoost Classifier,0.792453,0.792453,0.792453,0.792453,0.569444,0.146856,0.044963
7,Random Forest,0.792453,0.792453,0.792453,0.792453,0.620833,0.23042,0.086013
8,Gradient Boosting,0.773585,0.773585,0.773585,0.773585,0.558333,0.116667,0.048767
9,Extra Trees,0.754717,0.754717,0.754717,0.754717,0.598611,0.174095,0.081233



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.740741,0.740741,0.740741,0.740741,0.641304,0.229095,0.004281
1,K-Nearest Neighbours,0.740741,0.740741,0.740741,0.740741,0.641304,0.229095,0.010302
2,Support Vector Machine,0.759259,0.759259,0.759259,0.759259,0.548913,0.09325,0.010634
3,Naive Bayes Classifier,0.814815,0.814815,0.814815,0.814815,0.839674,0.528524,0.006883
4,Logistic Regression,0.740741,0.740741,0.740741,0.740741,0.538043,0.069584,0.005008
5,Multi Layer Perceptron,0.814815,0.814815,0.814815,0.814815,0.581522,0.184302,0.735366
6,AdaBoost Classifier,0.796296,0.796296,0.796296,0.796296,0.570652,0.149445,0.048611
7,Random Forest,0.796296,0.796296,0.796296,0.796296,0.622283,0.233126,0.09498
8,Gradient Boosting,0.777778,0.777778,0.777778,0.777778,0.559783,0.119565,0.047002
9,Extra Trees,0.777778,0.777778,0.777778,0.777778,0.611413,0.203781,0.071258


# 13. Primary Tumor Dataset

In [146]:
# Loading the dataset
df = pd.read_csv("datasets/primary-tumor.csv")

In [147]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,?,3,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,?,3,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,2,3,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,?,3,1,2,1,1,2,2,2,2,2,2,2,1,2


In [148]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(339, 18)

In [149]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [150]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1     84
5     39
18    29
11    28
14    24
22    24
2     20
12    16
7     14
4     14
17    10
3      9
13     7
8      6
19     6
10     2
15     2
20     2
6      1
16     1
21     1
Name: class, dtype: int64

In [151]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [152]:
# Encoding the attributes of the dataset
df = encoding(df)

In [153]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [154]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [155]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [156]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [157]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.809524,0.809524,0.809524,0.809524,0.917378,0.796828,0.006975
1,K-Nearest Neighbours,0.825397,0.825397,0.825397,0.825397,0.956856,0.814383,0.016978
2,Support Vector Machine,0.821429,0.821429,0.821429,0.821429,0.978645,0.809771,0.155249
3,Naive Bayes Classifier,0.757937,0.757937,0.757937,0.757937,0.962429,0.742741,0.008839
4,Logistic Regression,0.742063,0.742063,0.742063,0.742063,0.969574,0.725571,0.079122
5,Multi Layer Perceptron,0.821429,0.821429,0.821429,0.821429,0.97785,0.809488,3.223401
6,AdaBoost Classifier,0.313492,0.313492,0.313492,0.313492,0.751912,0.293398,0.077996
7,Random Forest,0.825397,0.825397,0.825397,0.825397,0.975174,0.813944,0.12674
8,Gradient Boosting,0.829365,0.829365,0.829365,0.829365,0.973664,0.818096,0.876444
9,Extra Trees,0.829365,0.829365,0.829365,0.829365,0.963564,0.818127,0.081112


In [158]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 242 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.884058,0.884058,0.884058,0.884058,0.941283,0.876054,0.007254
1,K-Nearest Neighbours,0.89372,0.89372,0.89372,0.89372,0.974319,0.886607,0.030069
2,Support Vector Machine,0.908213,0.908213,0.908213,0.908213,0.990458,0.901845,0.112624
3,Naive Bayes Classifier,0.874396,0.874396,0.874396,0.874396,0.992114,0.865277,0.021161
4,Logistic Regression,0.850242,0.850242,0.850242,0.850242,0.990124,0.839783,0.097211
5,Multi Layer Perceptron,0.898551,0.898551,0.898551,0.898551,0.991866,0.891928,1.76562
6,AdaBoost Classifier,0.299517,0.299517,0.299517,0.299517,0.733648,0.30309,0.068519
7,Random Forest,0.917874,0.917874,0.917874,0.917874,0.983942,0.912182,0.115372
8,Gradient Boosting,0.89372,0.89372,0.89372,0.89372,0.99116,0.886481,0.774908
9,Extra Trees,0.913043,0.913043,0.913043,0.913043,0.976147,0.907072,0.06586


# 14. Chess (King-Rook vs. King-Pawn) Dataset

In [159]:
# Loading the dataset
df = pd.read_csv("datasets/kr-vs-kp.csv")

In [160]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [161]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(3196, 37)

In [162]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [163]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

won      1669
nowin    1527
Name: class, dtype: int64

In [164]:
# Encoding the attributes of the dataset
df = encoding(df)

In [165]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [166]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [167]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [168]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [169]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.995509,0.995509,0.995509,0.995509,0.995476,0.991018,0.006792
1,K-Nearest Neighbours,0.958084,0.958084,0.958084,0.958084,0.958003,0.916137,0.027342
2,Support Vector Machine,0.964072,0.964072,0.964072,0.964072,0.964182,0.928197,0.729221
3,Naive Bayes Classifier,0.863772,0.863772,0.863772,0.863772,0.863744,0.727446,0.006741
4,Logistic Regression,0.952096,0.952096,0.952096,0.952096,0.952263,0.904332,0.016159
5,Multi Layer Perceptron,0.995509,0.995509,0.995509,0.995509,0.995476,0.991018,1.417278
6,AdaBoost Classifier,0.964072,0.964072,0.964072,0.964072,0.964119,0.928138,0.101477
7,Random Forest,0.989521,0.989521,0.989521,0.989521,0.989422,0.979068,0.178689
8,Gradient Boosting,0.974551,0.974551,0.974551,0.974551,0.974885,0.949629,0.162552
9,Extra Trees,0.989521,0.989521,0.989521,0.989521,0.989422,0.979068,0.141038


In [170]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 539 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.998227,0.998227,0.998227,0.998227,0.99812,0.996448,0.006302
1,K-Nearest Neighbours,0.971631,0.971631,0.971631,0.971631,0.971136,0.943129,0.056144
2,Support Vector Machine,0.992908,0.992908,0.992908,0.992908,0.992885,0.98577,0.48102
3,Naive Bayes Classifier,0.891844,0.891844,0.891844,0.891844,0.892403,0.783783,0.006676
4,Logistic Regression,0.978723,0.978723,0.978723,0.978723,0.979058,0.957441,0.010715
5,Multi Layer Perceptron,0.994681,0.994681,0.994681,0.994681,0.994563,0.989331,2.57694
6,AdaBoost Classifier,0.978723,0.978723,0.978723,0.978723,0.979058,0.957441,0.097398
7,Random Forest,0.996454,0.996454,0.996454,0.996454,0.996241,0.992907,0.176058
8,Gradient Boosting,0.994681,0.994681,0.994681,0.994681,0.994563,0.989331,0.136573
9,Extra Trees,0.998227,0.998227,0.998227,0.998227,0.99812,0.996448,0.112736



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.928144,0.928144,0.928144,0.928144,0.92864,0.855985,0.007865
1,K-Nearest Neighbours,0.91018,0.91018,0.91018,0.91018,0.909899,0.819443,0.029067
2,Support Vector Machine,0.944611,0.944611,0.944611,0.944611,0.944862,0.888823,0.802316
3,Naive Bayes Classifier,0.841317,0.841317,0.841317,0.841317,0.841776,0.68215,0.006759
4,Logistic Regression,0.937126,0.937126,0.937126,0.937126,0.936546,0.873502,0.011872
5,Multi Layer Perceptron,0.934132,0.934132,0.934132,0.934132,0.933986,0.867597,8.233873
6,AdaBoost Classifier,0.947605,0.947605,0.947605,0.947605,0.946746,0.894602,0.116235
7,Random Forest,0.938623,0.938623,0.938623,0.938623,0.93884,0.876792,0.204336
8,Gradient Boosting,0.950599,0.950599,0.950599,0.950599,0.951109,0.900997,0.169392
9,Extra Trees,0.944611,0.944611,0.944611,0.944611,0.945087,0.88897,0.161418


# 15. Lymphography Dataset

In [171]:
# Loading the dataset
df = pd.read_csv("datasets/lymphography.csv")

In [172]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,changes in stru,special forms,dislocation of,exclusion of no,no. of nodes in
0,3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
1,2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
2,3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
4,2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


In [173]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(148, 19)

In [174]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [175]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

2    81
3    61
4     4
1     2
Name: class, dtype: int64

In [176]:
# Removing classes having <= 4 instances
df = remove_class(df,'class')

In [177]:
# Encoding the attributes of the dataset
df = encoding(df)

In [178]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [179]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [180]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [181]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [182]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.878788,0.878788,0.878788,0.878788,0.878676,0.757353,0.004422
1,K-Nearest Neighbours,0.818182,0.818182,0.818182,0.818182,0.821691,0.658062,0.008083
2,Support Vector Machine,0.909091,0.909091,0.909091,0.909091,0.908088,0.819194,0.007728
3,Naive Bayes Classifier,0.848485,0.848485,0.848485,0.848485,0.847426,0.697422,0.005268
4,Logistic Regression,0.818182,0.818182,0.818182,0.818182,0.819853,0.642071,0.005683
5,Multi Layer Perceptron,0.818182,0.818182,0.818182,0.818182,0.821691,0.658062,0.316103
6,AdaBoost Classifier,0.818182,0.818182,0.818182,0.818182,0.819853,0.642071,0.046038
7,Random Forest,0.878788,0.878788,0.878788,0.878788,0.878676,0.757353,0.085003
8,Gradient Boosting,0.848485,0.848485,0.848485,0.848485,0.847426,0.697422,0.041707
9,Extra Trees,0.878788,0.878788,0.878788,0.878788,0.878676,0.757353,0.073548


In [183]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 10 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.870968,0.870968,0.870968,0.870968,0.872917,0.748961,0.004006
1,K-Nearest Neighbours,0.806452,0.806452,0.806452,0.806452,0.80625,0.6125,0.039968
2,Support Vector Machine,0.903226,0.903226,0.903226,0.903226,0.902083,0.807538,0.020667
3,Naive Bayes Classifier,0.903226,0.903226,0.903226,0.903226,0.90625,0.822851,0.010286
4,Logistic Regression,0.870968,0.870968,0.870968,0.870968,0.872917,0.748961,0.019153
5,Multi Layer Perceptron,0.935484,0.935484,0.935484,0.935484,0.933333,0.877707,0.572779
6,AdaBoost Classifier,0.935484,0.935484,0.935484,0.935484,0.935417,0.870833,0.054498
7,Random Forest,0.967742,0.967742,0.967742,0.967742,0.96875,0.9375,0.080741
8,Gradient Boosting,0.935484,0.935484,0.935484,0.935484,0.9375,0.878669,0.04195
9,Extra Trees,0.967742,0.967742,0.967742,0.967742,0.96875,0.9375,0.065272



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.939394,0.939394,0.939394,0.939394,0.944444,0.885615,0.004292
1,K-Nearest Neighbours,0.757576,0.757576,0.757576,0.757576,0.761111,0.520299,0.008695
2,Support Vector Machine,0.818182,0.818182,0.818182,0.818182,0.822222,0.642071,0.007974
3,Naive Bayes Classifier,0.878788,0.878788,0.878788,0.878788,0.883333,0.763843,0.004567
4,Logistic Regression,0.969697,0.969697,0.969697,0.969697,0.972222,0.940966,0.006161
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.333109
6,AdaBoost Classifier,0.909091,0.909091,0.909091,0.909091,0.911111,0.819194,0.047108
7,Random Forest,0.969697,0.969697,0.969697,0.969697,0.972222,0.940966,0.089579
8,Gradient Boosting,0.969697,0.969697,0.969697,0.969697,0.972222,0.940966,0.042416
9,Extra Trees,0.969697,0.969697,0.969697,0.969697,0.972222,0.940966,0.068068


# 16. Connect-4 Dataset

In [184]:
# Loading the dataset
df = pd.read_csv("datasets/connect-4.csv")

In [185]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,b1,b2,b3,b4,...,f4,f5,f6,g1,g2,g3,g4,g5,g6,class
0,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
2,b,b,b,b,b,b,o,b,b,b,...,b,b,b,b,b,b,b,b,b,win
3,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win
4,o,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,win


In [186]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(67557, 43)

In [187]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [188]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

win     44473
loss    16635
draw     6449
Name: class, dtype: int64

In [189]:
# Encoding the attributes of the dataset
df = encoding(df)

In [190]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [191]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [192]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [193]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

In [None]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

# 17. Hayes-Roth Dataset

In [2]:
# Loading the dataset
df = pd.read_csv("datasets/hayes-roth.csv")

In [3]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,hobby,age,educational_level,marital_status,class
0,2,1,1,2,1
1,2,1,3,2,2
2,3,1,4,1,3
3,2,4,2,2,3
4,1,1,3,4,3


In [4]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(160, 5)

In [5]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [6]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

1    65
2    64
3    31
Name: class, dtype: int64

In [7]:
# Encoding the attributes of the dataset
df = encoding(df)

In [8]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [9]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [10]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.875,0.875,0.875,0.875,0.945703,0.797468,0.016221
1,K-Nearest Neighbours,0.6875,0.6875,0.6875,0.6875,0.772004,0.484761,0.00809
2,Support Vector Machine,0.8125,0.8125,0.8125,0.8125,0.881972,0.729003,0.011442
3,Naive Bayes Classifier,0.84375,0.84375,0.84375,0.84375,0.966276,0.767773,0.005071
4,Logistic Regression,0.40625,0.40625,0.40625,0.40625,0.689999,0.039171,0.007005
5,Multi Layer Perceptron,0.75,0.75,0.75,0.75,0.898838,0.596097,0.173046
6,AdaBoost Classifier,0.5,0.5,0.5,0.5,0.774621,0.486427,0.027057
7,Random Forest,0.8125,0.8125,0.8125,0.8125,0.947869,0.725057,0.052923
8,Gradient Boosting,0.84375,0.84375,0.84375,0.84375,0.976054,0.764506,0.073714
9,Extra Trees,0.84375,0.84375,0.84375,0.84375,0.930741,0.750416,0.042452


In [12]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 27 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.925926,0.925926,0.925926,0.925926,0.954541,0.881658,0.00416
1,K-Nearest Neighbours,0.703704,0.703704,0.703704,0.703704,0.82409,0.494576,0.004718
2,Support Vector Machine,0.814815,0.814815,0.814815,0.814815,0.964853,0.693881,0.006923
3,Naive Bayes Classifier,0.925926,0.925926,0.925926,0.925926,1.0,0.883795,0.003949
4,Logistic Regression,0.518519,0.518519,0.518519,0.518519,0.771189,0.167543,0.004906
5,Multi Layer Perceptron,0.851852,0.851852,0.851852,0.851852,0.869048,0.757585,0.145482
6,AdaBoost Classifier,0.481481,0.481481,0.481481,0.481481,0.690126,0.391528,0.029201
7,Random Forest,0.962963,0.962963,0.962963,0.962963,1.0,0.941714,0.054492
8,Gradient Boosting,0.962963,0.962963,0.962963,0.962963,0.979092,0.940624,0.068298
9,Extra Trees,0.962963,0.962963,0.962963,0.962963,0.998104,0.940948,0.041535


# 18. Lung Cancer Prediction Dataset

In [13]:
# Loading the dataset
df = pd.read_csv("datasets/cancer-patients.csv")

In [14]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [15]:
# Dropping unwanted columns
df.drop(['index','Patient Id','Age'], axis=1, inplace=True)

In [16]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,1,2,4,5,4,3,2,2,4,3,...,3,4,2,2,3,1,2,3,4,Low
1,1,3,1,5,3,4,2,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,1,4,5,6,5,5,4,6,7,2,...,8,7,9,2,1,4,6,7,2,High
3,1,7,7,7,7,6,7,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,1,6,8,7,7,7,6,7,7,8,...,3,2,4,1,4,2,4,2,3,High


In [17]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(1000, 23)

In [18]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [19]:
# Checking the instance counts of the target attribute
df['Level'].value_counts()

High      365
Medium    332
Low       303
Name: Level, dtype: int64

In [20]:
# Encoding the attributes of the dataset
df = encoding(df)

In [21]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [22]:
# Separating dependent and independent variables
X, y = separating(df, 'Level')

In [23]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.016866
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.069416
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.036167
3,Naive Bayes Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.005315
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.059677
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.551696
6,AdaBoost Classifier,0.725,0.725,0.725,0.725,0.870769,0.658113,0.075457
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.094201
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.272439
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.054228


In [25]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 0 potential label errors

Results after removing label errors from the entire dataset:


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,0.005216
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0,1.0,1.0,0.027061
2,Support Vector Machine,1.0,1.0,1.0,1.0,1.0,1.0,0.033943
3,Naive Bayes Classifier,1.0,1.0,1.0,1.0,1.0,1.0,0.006946
4,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,0.072649
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.582878
6,AdaBoost Classifier,0.725,0.725,0.725,0.725,0.870769,0.658113,0.058926
7,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,0.128601
8,Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,0.256134
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.05585


# 19. Phishing Website Dataset

In [26]:
# Loading the dataset
df = pd.read_csv("datasets/phishing.csv")

In [27]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [28]:
# Dropping unwanted columns
df.drop(['Index'], axis=1, inplace=True)

In [29]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [30]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(11054, 31)

In [31]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [32]:
# Checking the instance counts of the target attribute
df['class'].value_counts()

 1    6157
-1    4897
Name: class, dtype: int64

In [33]:
# Encoding the attributes of the dataset
df = encoding(df)

In [34]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [35]:
# Separating dependent and independent variables
X, y = separating(df, 'class')

In [36]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [37]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.963865,0.963865,0.963865,0.963865,0.96393,0.927758,0.018838
1,K-Nearest Neighbours,0.957775,0.957775,0.957775,0.957775,0.957912,0.915698,0.096088
2,Support Vector Machine,0.952903,0.952903,0.952903,0.952903,0.953106,0.906126,5.694344
3,Naive Bayes Classifier,0.926106,0.926106,0.926106,0.926106,0.926478,0.853233,0.01297
4,Logistic Regression,0.924482,0.924482,0.924482,0.924482,0.924826,0.84985,0.083032
5,Multi Layer Perceptron,0.967113,0.967113,0.967113,0.967113,0.967381,0.934763,8.868687
6,AdaBoost Classifier,0.936257,0.936257,0.936257,0.936257,0.93653,0.873084,0.27751
7,Random Forest,0.973203,0.973203,0.973203,0.973203,0.973291,0.946463,0.339632
8,Gradient Boosting,0.942347,0.942347,0.942347,0.942347,0.942561,0.885052,0.567319
9,Extra Trees,0.975639,0.975639,0.975639,0.975639,0.975673,0.951281,0.439998


In [39]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 146 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.967888,0.967888,0.967888,0.967888,0.96791,0.935784,0.017058
1,K-Nearest Neighbours,0.966241,0.966241,0.966241,0.966241,0.966322,0.932613,0.303239
2,Support Vector Machine,0.965418,0.965418,0.965418,0.965418,0.965507,0.930994,4.810848
3,Naive Bayes Classifier,0.940716,0.940716,0.940716,0.940716,0.940922,0.882218,0.013063
4,Logistic Regression,0.937834,0.937834,0.937834,0.937834,0.938044,0.876482,0.06305
5,Multi Layer Perceptron,0.974475,0.974475,0.974475,0.974475,0.974591,0.949209,8.956712
6,AdaBoost Classifier,0.942775,0.942775,0.942775,0.942775,0.942959,0.886189,0.271187
7,Random Forest,0.981062,0.981062,0.981062,0.981062,0.981094,0.962144,0.330928
8,Gradient Boosting,0.957184,0.957184,0.957184,0.957184,0.957315,0.9147,0.565914
9,Extra Trees,0.982709,0.982709,0.982709,0.982709,0.982716,0.965416,0.418378



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.970361,0.970361,0.970361,0.970361,0.970368,0.940731,0.016933
1,K-Nearest Neighbours,0.966707,0.966707,0.966707,0.966707,0.966739,0.933567,0.101107
2,Support Vector Machine,0.963865,0.963865,0.963865,0.963865,0.963905,0.927958,4.909204
3,Naive Bayes Classifier,0.940723,0.940723,0.940723,0.940723,0.940799,0.882249,0.013029
4,Logistic Regression,0.937881,0.937881,0.937881,0.937881,0.937958,0.876591,0.061722
5,Multi Layer Perceptron,0.974015,0.974015,0.974015,0.974015,0.974042,0.948134,9.248404
6,AdaBoost Classifier,0.945189,0.945189,0.945189,0.945189,0.945267,0.891219,0.272462
7,Random Forest,0.978482,0.978482,0.978482,0.978482,0.978506,0.957056,0.331127
8,Gradient Boosting,0.958587,0.958587,0.958587,0.958587,0.958634,0.917491,0.574326
9,Extra Trees,0.980512,0.980512,0.980512,0.980512,0.980526,0.961055,0.429963


# 20. Monkey-Pox Patients Dataset

In [40]:
# Loading the dataset
df = pd.read_csv("datasets/monkey-pox.csv")

In [41]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [42]:
# Dropping unwanted columns
df.drop(['Patient_ID'], axis=1, inplace=True)

In [43]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,,False,True,True,True,False,True,False,False,Negative
1,Fever,True,False,True,True,False,False,True,False,Positive
2,Fever,False,True,True,False,False,False,True,False,Positive
3,,True,False,False,False,True,True,True,False,Positive
4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [44]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(25000, 10)

In [45]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [46]:
# Checking the instance counts of the target attribute
df['MonkeyPox'].value_counts()

Positive    15909
Negative     9091
Name: MonkeyPox, dtype: int64

In [47]:
# Encoding the attributes of the dataset
df = encoding(df)

In [48]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [49]:
# Separating dependent and independent variables
X, y = separating(df, 'MonkeyPox')

In [50]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [51]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.64252,0.64252,0.64252,0.64252,0.642575,0.285284,0.029387
1,K-Nearest Neighbours,0.618479,0.618479,0.618479,0.618479,0.618437,0.236947,0.366938
2,Support Vector Machine,0.641106,0.641106,0.641106,0.641106,0.640931,0.283343,141.562319
3,Naive Bayes Classifier,0.639063,0.639063,0.639063,0.639063,0.63906,0.278119,0.020606
4,Logistic Regression,0.612508,0.612508,0.612508,0.612508,0.612481,0.224991,0.040963
5,Multi Layer Perceptron,0.641106,0.641106,0.641106,0.641106,0.640964,0.282909,6.869932
6,AdaBoost Classifier,0.639063,0.639063,0.639063,0.639063,0.639075,0.278153,0.417087
7,Random Forest,0.637492,0.637492,0.637492,0.637492,0.637472,0.274964,0.795729
8,Gradient Boosting,0.641263,0.641263,0.641263,0.641263,0.641113,0.283323,0.919087
9,Extra Trees,0.64252,0.64252,0.64252,0.64252,0.642575,0.285284,0.967882


In [53]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 10460 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.954288,0.954288,0.954288,0.954288,0.954276,0.908588,0.017054
1,K-Nearest Neighbours,0.952168,0.952168,0.952168,0.952168,0.952168,0.904334,0.195068
2,Support Vector Machine,0.953346,0.953346,0.953346,0.953346,0.953305,0.906844,12.027804
3,Naive Bayes Classifier,0.946041,0.946041,0.946041,0.946041,0.945964,0.892652,0.014319
4,Logistic Regression,0.845193,0.845193,0.845193,0.845193,0.845205,0.690407,0.036124
5,Multi Layer Perceptron,0.952168,0.952168,0.952168,0.952168,0.952004,0.906984,4.108453
6,AdaBoost Classifier,0.95311,0.95311,0.95311,0.95311,0.9531,0.906225,0.298002
7,Random Forest,0.951461,0.951461,0.951461,0.951461,0.951418,0.903089,0.446862
8,Gradient Boosting,0.952639,0.952639,0.952639,0.952639,0.952641,0.905278,0.569475
9,Extra Trees,0.954288,0.954288,0.954288,0.954288,0.954276,0.908588,0.58569



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.969987,0.969987,0.969987,0.969987,0.96998,0.939994,0.02408
1,K-Nearest Neighbours,0.969359,0.969359,0.969359,0.969359,0.969352,0.938733,0.373881
2,Support Vector Machine,0.96873,0.96873,0.96873,0.96873,0.968717,0.937522,18.846875
3,Naive Bayes Classifier,0.964645,0.964645,0.964645,0.964645,0.964646,0.92929,0.018852
4,Logistic Regression,0.847737,0.847737,0.847737,0.847737,0.84773,0.695481,0.036517
5,Multi Layer Perceptron,0.968102,0.968102,0.968102,0.968102,0.968076,0.93646,5.996348
6,AdaBoost Classifier,0.969045,0.969045,0.969045,0.969045,0.969035,0.938122,0.404039
7,Random Forest,0.969045,0.969045,0.969045,0.969045,0.969032,0.938151,0.646648
8,Gradient Boosting,0.965116,0.965116,0.965116,0.965116,0.96512,0.930237,0.907592
9,Extra Trees,0.969987,0.969987,0.969987,0.969987,0.96998,0.939994,0.789484


# 21. Animal Condition Classification Dataset

In [54]:
# Loading the dataset
df = pd.read_csv("datasets/animal-condition.csv")

In [55]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,Dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,Dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,Dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,Dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,Dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [56]:
# Checking the unique values present in the dataset for the attribute 'AnimalName'
df['AnimalName'].value_counts()

Buffaloes            129
Sheep                110
Pig                   63
Fowl                  62
Elephant              59
Duck                  56
Deer                  38
Donkey                38
Birds                 37
cat                   36
Dog                   34
Monkey                28
Goat                  26
Cattle                21
Hamster               18
Tiger                 17
Lion                  16
Rabbit                11
Horse                 10
Chicken                9
Fox                    7
Other Birds            6
horse                  5
chicken                4
Turtle                 4
Pigs                   3
cow                    3
donkey                 2
Goats                  2
White-tailed deer      1
Hyaenas                1
Wolves                 1
Dogs                   1
Fox                    1
Moos                   1
Reindeer               1
mammal                 1
Sika deer              1
cattle                 1
Mule deer              1


In [57]:
# Fixing the values of the attribute 'AnimalName'
df['AnimalName'] = [name.lower() for name in df['AnimalName']]
df['AnimalName'].replace({'black-tailed deer':'deer','white-tailed deer':'deer','mule deer':'deer','sika deer':'deer','reindeer':'deer','elk':'deer','wapiti':'deer','mules':'horse','other birds': 'birds','pigs':'pig', 'dogs': 'dog', 'goats': 'goat'}, inplace = True)

In [58]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


In [59]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(871, 7)

In [60]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [61]:
# Checking the instance counts of the target attribute
df['Dangerous'].value_counts()

Yes    849
No      20
Name: Dangerous, dtype: int64

In [62]:
# Encoding the attributes of the dataset
df = encoding(df)

In [63]:
# Imputing the missing values if any by replacing it with the mode
df = imputation(df)

In [64]:
# Separating dependent and independent variables
X, y = separating(df, 'Dangerous')

In [65]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.988571,0.988571,0.988571,0.988571,0.74711,0.49422,0.004731
1,K-Nearest Neighbours,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.009422
2,Support Vector Machine,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.018719
3,Naive Bayes Classifier,0.971429,0.971429,0.971429,0.971429,0.738439,0.304254,0.003853
4,Logistic Regression,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.005822
5,Multi Layer Perceptron,0.988571,0.988571,0.988571,0.988571,0.74711,0.49422,0.531226
6,AdaBoost Classifier,0.994286,0.994286,0.994286,0.994286,0.75,0.705072,0.066565
7,Random Forest,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.109829
8,Gradient Boosting,0.988571,0.988571,0.988571,0.988571,0.5,0.0,0.095292
9,Extra Trees,0.994286,0.994286,0.994286,0.994286,0.75,0.705072,0.053025


In [67]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 3 potential label errors

Results after removing label errors from the entire dataset:
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.988506,0.988506,0.988506,0.988506,0.99422,0.574003,0.004468
1,K-Nearest Neighbours,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.006953
2,Support Vector Machine,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.015752
3,Naive Bayes Classifier,0.977011,0.977011,0.977011,0.977011,0.988439,0.442013,0.003742
4,Logistic Regression,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.005697
5,Multi Layer Perceptron,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.020133
6,AdaBoost Classifier,0.982759,0.982759,0.982759,0.982759,0.49422,-0.008198,0.047244
7,Random Forest,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.117903
8,Gradient Boosting,0.994253,0.994253,0.994253,0.994253,0.5,0.0,0.103472
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.049315



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.982857,0.982857,0.982857,0.982857,0.991379,0.495671,0.004332
1,K-Nearest Neighbours,0.994286,0.994286,0.994286,0.994286,0.5,0.0,0.007159
2,Support Vector Machine,0.994286,0.994286,0.994286,0.994286,0.5,0.0,0.017398
3,Naive Bayes Classifier,0.977143,0.977143,0.977143,0.977143,0.988506,0.442043,0.003938
4,Logistic Regression,0.994286,0.994286,0.994286,0.994286,0.5,0.0,0.005819
5,Multi Layer Perceptron,1.0,1.0,1.0,1.0,1.0,1.0,0.418557
6,AdaBoost Classifier,0.988571,0.988571,0.988571,0.988571,0.497126,-0.005747,0.0576
7,Random Forest,0.994286,0.994286,0.994286,0.994286,0.5,0.0,0.098088
8,Gradient Boosting,0.994286,0.994286,0.994286,0.994286,0.5,0.0,0.098406
9,Extra Trees,1.0,1.0,1.0,1.0,1.0,1.0,0.049473


# 22. Android Malware Detection

In [68]:
# Loading the dataset
df = pd.read_csv("datasets/TUANDROMD.csv")

In [69]:
# Displaying the first five records of the dataset
df.head()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,malware
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,malware
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,malware
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware


In [70]:
# Displaying the shape of the dataset i.e. (no. of records, no. of attributes)
df.shape

(4465, 242)

In [71]:
# Converting '?' to NaN incase it is present in the dataset
df.replace('?', np.nan, inplace=True)

In [72]:
# Checking the instance counts of the target attribute
df['Label'].value_counts()

malware     3565
goodware     899
Name: Label, dtype: int64

In [73]:
# Encoding the attributes of the dataset
df = encoding(df)

In [74]:
# Imputing the missing values if any by using KNN Imputer
df = imputation(df)

In [75]:
# Separating dependent and independent variables
X, y = separating(df, 'Label')

In [76]:
# Handing class imbalance problem using SMOTEN
X, y = oversampling(X,y)

In [77]:
# Splitting the dataset in a 80:20 train-test split ratio
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Classification and model evaluation
classification_evaluation(X_train, X_test, y_train, y_test)[0]

XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.992992,0.992992,0.992992,0.992992,0.99307,0.98599,0.028129
1,K-Nearest Neighbours,0.991591,0.991591,0.991591,0.991591,0.991752,0.983234,0.100481
2,Support Vector Machine,0.992292,0.992292,0.992292,0.992292,0.99239,0.984597,1.950933
3,Naive Bayes Classifier,0.976174,0.976174,0.976174,0.976174,0.97649,0.952576,0.03454
4,Logistic Regression,0.990189,0.990189,0.990189,0.990189,0.990223,0.980366,0.058084
5,Multi Layer Perceptron,0.992292,0.992292,0.992292,0.992292,0.992348,0.98458,2.478873
6,AdaBoost Classifier,0.989488,0.989488,0.989488,0.989488,0.989585,0.978986,0.464181
7,Random Forest,0.993693,0.993693,0.993693,0.993693,0.993793,0.987402,0.214337
8,Gradient Boosting,0.991591,0.991591,0.991591,0.991591,0.99171,0.983205,0.802745
9,Extra Trees,0.994394,0.994394,0.994394,0.994394,0.994516,0.988815,0.263153


In [79]:
# Removing dataset label errors and evaluating classifiers' performance
dataset_error(X, y, X_train, X_test, y_train, y_test)

********* Count-Based Classifier *********

Cleanlab found 25 potential label errors

Results after removing label errors from the entire dataset:
Naive Bayes Classifier : index 1 is out of bounds for axis 1 with size 1
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.990877,0.990877,0.990877,0.990877,0.99093,0.98175,0.026441
1,K-Nearest Neighbours,0.991579,0.991579,0.991579,0.991579,0.991734,0.983211,0.293223
2,Support Vector Machine,0.991579,0.991579,0.991579,0.991579,0.991653,0.983162,1.801097
3,Logistic Regression,0.991579,0.991579,0.991579,0.991579,0.991612,0.983149,0.08401
4,Multi Layer Perceptron,0.991579,0.991579,0.991579,0.991579,0.991612,0.983149,2.802242
5,AdaBoost Classifier,0.990877,0.990877,0.990877,0.990877,0.99093,0.98175,0.469574
6,Random Forest,0.994386,0.994386,0.994386,0.994386,0.994543,0.988829,0.208211
7,Gradient Boosting,0.990877,0.990877,0.990877,0.990877,0.990971,0.981767,0.794795
8,Extra Trees,0.993684,0.993684,0.993684,0.993684,0.99382,0.987409,0.251188
9,Custom Naive Bayes Classifier,0.976842,0.976842,0.976842,0.976842,0.977126,0.953885,9.441609



Results after correcting label errors from the entire dataset (applicable for datasets with binary classes only):
XGBoost : value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.


Unnamed: 0,Model,Accu,Reca,Prec,F1,AUC,MCC,TT
0,Decision Tree,0.989488,0.989488,0.989488,0.989488,0.989576,0.978989,0.025897
1,K-Nearest Neighbours,0.990189,0.990189,0.990189,0.990189,0.990374,0.980468,0.109738
2,Support Vector Machine,0.99089,0.99089,0.99089,0.99089,0.990979,0.981794,1.855897
3,Naive Bayes Classifier,0.976875,0.976875,0.976875,0.976875,0.977145,0.953952,0.033985
4,Logistic Regression,0.99089,0.99089,0.99089,0.99089,0.990941,0.981777,0.065086
5,Multi Layer Perceptron,0.990189,0.990189,0.990189,0.990189,0.99022,0.980369,3.984336
6,AdaBoost Classifier,0.99089,0.99089,0.99089,0.99089,0.990979,0.981794,0.39664
7,Random Forest,0.993693,0.993693,0.993693,0.993693,0.993823,0.987428,0.208959
8,Gradient Boosting,0.990189,0.990189,0.990189,0.990189,0.990297,0.980403,0.79556
9,Extra Trees,0.994394,0.994394,0.994394,0.994394,0.994543,0.988845,0.255583
