**Import Libraries**

In [223]:
import random
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

**Import Data**

In [224]:
# Read from data files
df_mushroom = pd.read_csv(r'agaricus-lepiota.data', delimiter=',', index_col=None, header=None)
df_bank = pd.read_csv(r'bank.csv', delimiter=';', index_col=None, header=0)
df_car = pd.read_csv(r'car.data', delimiter=',', index_col=None, header=None)
df_zoo = pd.read_csv(r'zoo.data', delimiter=',', index_col=0, header=None)

df_list = [df_mushroom, df_bank, df_car, df_zoo]

**Preprocess Data**

In [225]:
## Remove rows with missing values
df_mushroom.drop(df_mushroom[df_mushroom == '?'], axis=0, inplace=True)

## Remove columns with constant values
for df in df_list:  
  df.drop(df.columns[df.nunique() == 1], axis=1, inplace=True) 

## Remove nan
for df in df_list:
  df.dropna(inplace = True, how = 'any')

## Move output variable to front of dataframe and label it y
df_mushroom.rename(columns={0: "y"}, inplace=True)

y_bank = df_bank.iloc[:,-1]
df_bank.drop(df_bank.columns[[-1]], axis=1, inplace=True)
df_bank.insert(0, "y",y_bank)

y_car = df_car.iloc[:,-1]
df_car.drop(df_car.columns[[-1]], axis=1, inplace=True)
df_car.insert(0, "y",y_car)

y_zoo = df_zoo.iloc[:,-1]
df_zoo.drop(df_zoo.columns[[-1]], axis=1, inplace=True)
df_zoo.insert(0, "y",y_zoo)

In [226]:
df_bank

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,no,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,no,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,no,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,no,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,no,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,no,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown
4517,no,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown
4518,no,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown
4519,no,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other


In [227]:
## One Hot Encode Categorical Attributes

# Create one hot encoder instance
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Define categorical columns
mushroom_cat = df_mushroom.columns[1:]
bank_cat = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]
car_cat = df_car.columns[1:]
zoo_cat = 13

# Encode categorical columns of dataframes
ohe_mushroom = pd.DataFrame(ohe.fit_transform(df_mushroom[mushroom_cat]))
ohe_mushroom.index = df_mushroom.index

ohe_bank = pd.DataFrame(ohe.fit_transform(df_bank[bank_cat]))
ohe_bank.index = df_bank.index
ohe_bank.columns = ohe_bank.columns.astype(str)

ohe_car = pd.DataFrame(ohe.fit_transform(df_car[car_cat]))
ohe_car.index = df_car.index

ohe_zoo = pd.DataFrame(ohe.fit_transform(df_zoo[[zoo_cat]]))
ohe_zoo.index = df_zoo.index

# Drop categorical columns from dataframe
df_bank.drop(labels=bank_cat,axis=1, inplace=True)
df_car.drop(labels=car_cat,axis=1, inplace=True)
df_zoo.drop(df_zoo.columns[zoo_cat], axis=1, inplace=True)

# Add encoded columns to dataframe
df_mushroom = pd.concat([df_mushroom['y'], ohe_mushroom], axis=1)
df_bank = pd.concat([df_bank, ohe_bank], axis=1)
df_car = pd.concat([df_car, ohe_car], axis=1)

ohe_zoo_col = list(range(df_zoo.shape[1]+1, ohe_zoo.shape[1]+df_zoo.shape[1]+1))
ohe_zoo.columns = ohe_zoo_col
df_zoo = df_zoo.join(ohe_zoo, how="left")




In [228]:
df_mushroom

Unnamed: 0,y,0,1,2,3,4,5,6,7,8,...,106,107,108,109,110,111,112,113,114,115
23,e,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
24,e,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25,p,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26,e,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27,e,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8120,e,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8121,e,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8122,p,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [229]:
## Label Encode Categorical Output Variables

# Create label encoder instance
le = LabelEncoder()

#Encode output variables
df_mushroom['y'] = le.fit_transform(df_mushroom['y']) # 2 categories: edible and poisonous
df_bank['y'] = le.fit_transform(df_bank['y'])         # 2 categories: yes and no
df_car['y'] = le.fit_transform(df_car['y'])           # 4 categories: unacceptable, acceptable, good, very good

**Import Models**

In [230]:
## Import classifiers with default hyperparameters 
models = {}

# KNN (A1)
from sklearn.neighbors import KNeighborsClassifier
models['KNN'] = KNeighborsClassifier()

# Support Vector Machine (A2)
from sklearn.svm import SVC
models['SVM'] = SVC() 

# Multi-layered perceptron (A3)
from sklearn.neural_network import MLPClassifier
models['MLP'] = MLPClassifier()

**Experiments and Predicitions**

In [231]:
# Function to find test accuracy for each experiment where the test size and classifier are varied
def accuracy_predictions (models, reps, dataset):
  accuracy_dataset = [{} for r in range(reps * test_percentage_levels)]

  #for each test size
  for t in range(len(TEST_SIZE)): 
    #do test train split
    Xtrain, Xtest, ytrain, ytest = train_test_split(dataset[dataset.columns[1:]],\
                                                   dataset.iloc[:,0], test_size=TEST_SIZE[t])
    
    #for each repetition
    for r in range(reps): 

      #print(TEST_SIZE[t], r, t, (t*reps)+r)
      #for each model
      for i, key in enumerate(models.keys()):
            
            # Fit the classifier model
            models[key].fit(Xtrain, ytrain)

            # Prediction
            predictions = models[key].predict(Xtest)

            # Calculate Accuracy, Precision and Recall Metrics
            accuracy_dataset[(t*reps)+r][key] = accuracy_score(predictions, ytest)
  return accuracy_dataset

In [232]:
# Repetitions of each experiment
repetitions = 3 
        
# Datasets split for training and testing
TEST_SIZE = [0.20, 0.50, 0.80]
test_percentage_levels = len(TEST_SIZE)

# Predictions For Datasets
accuracy_mushroom = accuracy_predictions(models, repetitions, df_mushroom)

accuracy_bank = accuracy_predictions(models, repetitions, df_bank)

accuracy_car = accuracy_predictions(models, repetitions, df_car)

accuracy_zoo = accuracy_predictions(models, repetitions, df_zoo)



**Print Accuracy Results for Test-Train Split**

In [234]:
def format_accuracy (accuracy_dataset, reps):
  #format = [ [] for r in range(repetitions * test_percentage_levels)]

  for r in range(repetitions * test_percentage_levels):
     print(list(accuracy_dataset[r].values()))

In [235]:
print_formatted = True

if (print_formatted):
  print("----------------ACCURACY DATA FORMAT ----------------")
  print("| TEST SPLIT %  |   repetition    | KNN | SVM | MLP |")
  print("|---------------------------------------------------|")

  for i in range(len(TEST_SIZE)):
    for j in range(3):
      print("|     "+str((TEST_SIZE[i]*100))+"      |       ", j+1 ,"       | [   ,     ,   ] |")


  print("\n\n-------------- RESULTS FOR TEST SIZES = ",TEST_SIZE," and REPETITIONS = "+str(repetitions)+" --------------------\n")
  print(" Accuracy for mushroom data [KNN | SVM | MLP]: \n")
  format_accuracy(accuracy_mushroom, repetitions)

  print("\n\n\n Accuracy for bank data [KNN | SVM | MLP]: \n")
  format_accuracy(accuracy_bank, repetitions)

  print("\n\n\n Accuracy for car data [KNN | SVM | MLP]: \n")
  format_accuracy(accuracy_car, repetitions)

  print("\n\n\n Accuracy for zoo data [KNN | SVM | MLP]: \n")
  format_accuracy(accuracy_zoo, repetitions)
else:
  print("\n\n-------------- RESULTS FOR TEST SIZES = ",TEST_SIZE," and REPETITIONS = "+str(repetitions)+" --------------------\n")
  format_accuracy(accuracy_mushroom, repetitions)
  format_accuracy(accuracy_bank, repetitions)
  format_accuracy(accuracy_car, repetitions)
  format_accuracy(accuracy_zoo, repetitions)


----------------ACCURACY DATA FORMAT ----------------
| TEST SPLIT %  |   repetition    | KNN | SVM | MLP |
|---------------------------------------------------|
|     20.0      |        1        | [   ,     ,   ] |
|     20.0      |        2        | [   ,     ,   ] |
|     20.0      |        3        | [   ,     ,   ] |
|     50.0      |        1        | [   ,     ,   ] |
|     50.0      |        2        | [   ,     ,   ] |
|     50.0      |        3        | [   ,     ,   ] |
|     80.0      |        1        | [   ,     ,   ] |
|     80.0      |        2        | [   ,     ,   ] |
|     80.0      |        3        | [   ,     ,   ] |


-------------- RESULTS FOR TEST SIZES =  [0.2, 0.5, 0.8]  and REPETITIONS = 3 --------------------

 Accuracy for mushroom data [KNN | SVM | MLP]: 

[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[0.9993828112945533, 0.9989199197654683, 0.99907421694183]
[0.9993828112945533, 0.9989199197654683, 0.999