# Eucalyptus data set

In [45]:
# imports
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from naivebayes import NaiveBayes
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder  

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline

In [46]:
# Load learn data
raw_data = loadarff("data/dataset_194_eucalyptus.arff");
df = pd.DataFrame(raw_data[0])

In [47]:
df.head()

Unnamed: 0,Abbrev,Rep,Locality,Map_Ref,Latitude,Altitude,Rainfall,Frosts,Year,Sp,PMCno,DBH,Ht,Surv,Vig,Ins_res,Stem_Fm,Crown_Fm,Brnch_Fm,Utility
0,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'co',1520.0,18.45,9.96,40.0,4.0,3.0,3.5,4.0,3.5,b'good'
1,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'fr',1487.0,13.15,9.65,90.0,4.5,4.0,3.5,3.5,3.0,b'best'
2,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'ma',1362.0,10.32,6.5,50.0,2.3,2.5,3.0,3.5,3.0,b'low'
3,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'nd',1596.0,14.8,9.48,70.0,3.7,3.0,3.3,4.0,3.5,b'good'
4,b'Cra',1.0,b'Central_Hawkes_Bay',b'N135_382/137',b'39__38',100.0,850.0,-2.0,1980.0,b'ni',2088.0,14.5,10.78,90.0,4.0,2.7,3.3,3.0,3.0,b'good'


In [48]:
df = df.dropna()

In [49]:
df.shape

(641, 20)

In [50]:
df.dtypes

Abbrev       object
Rep         float64
Locality     object
Map_Ref      object
Latitude     object
Altitude    float64
Rainfall    float64
Frosts      float64
Year        float64
Sp           object
PMCno       float64
DBH         float64
Ht          float64
Surv        float64
Vig         float64
Ins_res     float64
Stem_Fm     float64
Crown_Fm    float64
Brnch_Fm    float64
Utility      object
dtype: object

## Custom Implementation

In [51]:
metrics_our_model = {'time':{'Holdout': {'fit': 0, 'predict': 0, 'total-time': 0}, 'CV': 0},
                     'metrics': {'Holdout': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0},
                                'CV': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0}}}

In [52]:
train, test = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)
X_test = test.drop(['Utility'], axis=1)
y_test = list(test['Utility'])

In [53]:
# Holdout on our model

# fitting the model
st1 = time.time()
nb = NaiveBayes()
nb.fit(train, 'Utility')
et1 = time.time()
elapsed_time1 = et1 - st1
metrics_our_model['time']['Holdout']['fit'] = elapsed_time1

# prediction
st2 = time.time()
predictions = nb.predict(X_test)
et2 = time.time()
elapsed_time2 = et2 - st2
total_time = et2-st1

metrics_our_model['time']['Holdout']['predict'] = elapsed_time2
metrics_our_model['time']['Holdout']['total-time'] = total_time
metrics_our_model['metrics']['Holdout']['accuracy'] = accuracy_score(y_test, predictions)
metrics_our_model['metrics']['Holdout']['precision'] = precision_score(y_test, predictions, average='macro', zero_division=0)
metrics_our_model['metrics']['Holdout']['recall'] = recall_score(y_test, predictions, average='macro', zero_division=0)
metrics_our_model['metrics']['Holdout']['f1'] = f1_score(y_test, predictions, average='macro', zero_division=0)

In [54]:
# Perform the CV on our own model

# create the folds
kf5 = KFold(n_splits=5, shuffle=False)

accuracy = 0
precision = 0
recall = 0
f1 = 0

st = time.time()

for train_index, test_index in kf5.split(df):
    
    # create the train/test split properly
    train = df.iloc[train_index]
    test = df.iloc[test_index]
    X_test = test.drop(['Utility'], axis=1)
    y_test = list(test['Utility'])

    # fitting the model
    nb = NaiveBayes()
    nb.fit(train, 'Utility')
    
    # prediction
    predictions = nb.predict(X_test)
    
    accuracy += accuracy_score(y_test, predictions)
    precision += precision_score(y_test, predictions, average='macro', zero_division=0)
    recall += recall_score(y_test, predictions, average='macro', zero_division=0)
    f1 += f1_score(y_test, predictions, average='macro', zero_division=0)
    
et = time.time()    
total_time = et - st

In [55]:
accuracy = accuracy / 5
precision = precision / 5
recall = recall / 5
f1 = f1 / 5

metrics_our_model['time']['CV'] = total_time
metrics_our_model['metrics']['CV']['accuracy'] = accuracy
metrics_our_model['metrics']['CV']['precision'] = precision
metrics_our_model['metrics']['CV']['recall'] = recall
metrics_our_model['metrics']['CV']['f1'] = f1

In [56]:
metrics_our_model

{'time': {'Holdout': {'fit': 0.30000925064086914,
   'predict': 0.4779543876647949,
   'total-time': 0.7779636383056641},
  'CV': 3.027630567550659},
 'metrics': {'Holdout': {'accuracy': 0.5492227979274611,
   'precision': 0.43068013468013466,
   'recall': 0.46889427557531,
   'f1': 0.4380979818190801},
  'CV': {'accuracy': 0.36677567829457364,
   'precision': 0.320500896125995,
   'recall': 0.28900319350826736,
   'f1': 0.23228184103460703}}}

## Scikit-learn NB

In [57]:
metrics_sklearn_NB = {'time':{'Holdout': {'fit': 0, 'predict': 0, 'total-time': 0}, 'CV': 0},
                     'metrics': {'Holdout': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0},
                                'CV': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0}}}

In [58]:
# holdout

le = LabelEncoder()
nominal_features = ["Abbrev", "Locality", "Map_Ref", "Latitude", "Sp"]

X = df.drop(columns=['Utility'])
y = df['Utility']

for feature in nominal_features:
    X[feature] = le.fit_transform(X[feature])

y = le.fit_transform (y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

st1 = time.time()
gnb = GaussianNB()
gnb.fit(X_train, y_train)
et1 = time.time()

st2 = time.time()
predictions = gnb.predict(X_test)
et2 = time.time()

accuracy_score(y_test, predictions)

metrics_sklearn_NB['time']['Holdout']['fit'] = et1 - st1
metrics_sklearn_NB['time']['Holdout']['predict'] = et2 - st2
metrics_sklearn_NB['time']['Holdout']['total-time'] = et2 - st1
metrics_sklearn_NB['metrics']['Holdout']['accuracy'] = accuracy_score(y_test, predictions)
metrics_sklearn_NB['metrics']['Holdout']['precision'] = precision_score(y_test, predictions, average='macro', zero_division=0)
metrics_sklearn_NB['metrics']['Holdout']['recall'] = recall_score(y_test, predictions, average='macro', zero_division=0)
metrics_sklearn_NB['metrics']['Holdout']['f1'] = f1_score(y_test, predictions, average='macro', zero_division=0)

In [59]:
# Cross validation on the sklearn NB

# ovdje mi bude error, i treba preprocessing nekakav odraditi

X = df.drop(columns=['Utility'])
y = df['Utility']

for feature in nominal_features:
    X[feature] = le.fit_transform(X[feature])

y = le.fit_transform (y)

gnb = GaussianNB()

st = time.time()

accuracy_scores = cross_val_score(gnb, X, y, cv=5, scoring="accuracy")

et = time.time()

precision_scores = cross_val_score(gnb, X, y, cv=5, scoring="precision_macro")
recall_scores = cross_val_score(gnb, X, y, cv=5, scoring="recall_macro")
f1_scores = cross_val_score(gnb, X, y, cv=5, scoring="f1_macro")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [60]:
metrics_sklearn_NB['time']['CV'] = et - st
metrics_sklearn_NB['metrics']['CV']['accuracy'] = accuracy_scores.mean()
metrics_sklearn_NB['metrics']['CV']['precision'] = precision_scores.mean()
metrics_sklearn_NB['metrics']['CV']['recall'] = recall_scores.mean()
metrics_sklearn_NB['metrics']['CV']['f1'] = f1_scores.mean()

In [61]:
metrics_sklearn_NB

{'time': {'Holdout': {'fit': 0.00500035285949707,
   'predict': 0.003999233245849609,
   'total-time': 0.00899958610534668},
  'CV': 0.047003746032714844},
 'metrics': {'Holdout': {'accuracy': 0.5595854922279793,
   'precision': 0.46063829787234045,
   'recall': 0.5089690121155639,
   'f1': 0.4782535074987905},
  'CV': {'accuracy': 0.4353076550387597,
   'precision': 0.401011363414508,
   'recall': 0.3961396148566881,
   'f1': 0.3749093482995372}}}

## Scikit-learn MLP

In [62]:
metrics_sklearn_mlp = {'time':{'Holdout': {'fit': 0, 'predict': 0, 'total-time': 0}, 'CV': 0},
                     'metrics': {'Holdout': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0},
                                'CV': {'accuracy': 0, 'precision':0, 'recall':0, 'f1':0}}}

In [63]:
# holdout

le = LabelEncoder()
nominal_features = ["Abbrev", "Locality", "Map_Ref", "Latitude", "Sp"]

X = df.drop(columns=['Utility'])
y = df['Utility']

for feature in nominal_features:
    X[feature] = le.fit_transform(X[feature])

y = le.fit_transform (y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

st1 = time.time()
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
et1 = time.time()

st2 = time.time()
predictions = mlp.predict(X_test)
et2 = time.time()

accuracy_score(y_test, predictions)

metrics_sklearn_mlp['time']['Holdout']['fit'] = et1 - st1
metrics_sklearn_mlp['time']['Holdout']['predict'] = et2 - st2
metrics_sklearn_mlp['time']['Holdout']['total-time'] = et2 - st1
metrics_sklearn_mlp['metrics']['Holdout']['accuracy'] = accuracy_score(y_test, predictions)
metrics_sklearn_mlp['metrics']['Holdout']['precision'] = precision_score(y_test, predictions, average='macro', zero_division=0)
metrics_sklearn_mlp['metrics']['Holdout']['recall'] = recall_score(y_test, predictions, average='macro', zero_division=0)
metrics_sklearn_mlp['metrics']['Holdout']['f1'] = f1_score(y_test, predictions, average='macro', zero_division=0)

In [64]:
# Cross validation on the sklearn NB

# ovdje mi bude error, i treba preprocessing nekakav odraditi

X = df.drop(columns=['Utility'])
y = df['Utility']

for feature in nominal_features:
    X[feature] = le.fit_transform(X[feature])

y = le.fit_transform (y)

mlp = MLPClassifier()

st = time.time()

accuracy_scores = cross_val_score(mlp, X, y, cv=5, scoring="accuracy")

et = time.time()

precision_scores = cross_val_score(mlp, X, y, cv=5, scoring="precision_macro")
recall_scores = cross_val_score(mlp, X, y, cv=5, scoring="recall_macro")
f1_scores = cross_val_score(mlp, X, y, cv=5, scoring="f1_macro")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
metrics_sklearn_mlp['time']['CV'] = et - st
metrics_sklearn_mlp['metrics']['CV']['accuracy'] = accuracy_scores.mean()
metrics_sklearn_mlp['metrics']['CV']['precision'] = precision_scores.mean()
metrics_sklearn_mlp['metrics']['CV']['recall'] = recall_scores.mean()
metrics_sklearn_mlp['metrics']['CV']['f1'] = f1_scores.mean()

In [66]:
metrics_sklearn_mlp

{'time': {'Holdout': {'fit': 0.23549365997314453,
   'predict': 0.0,
   'total-time': 0.23549365997314453},
  'CV': 1.6499574184417725},
 'metrics': {'Holdout': {'accuracy': 0.3626943005181347,
   'precision': 0.20912482065997132,
   'recall': 0.3155769230769231,
   'f1': 0.2513223187243211},
  'CV': {'accuracy': 0.3293725775193798,
   'precision': 0.38728014150367623,
   'recall': 0.25081573223524445,
   'f1': 0.2507372333525697}}}