# Purpose:
Instantiate, train, and evaluate multiple classification models, establishing baseline performance metrics for each.
## Implementation:
The machine learning libraries used are scikit-learn, scikit-image, and TensorFlow's Keras API. Each classifier listed below is trained and evaluated on normalized arrays with its default, untuned ("out-of-the-box") settings.
* Multi-layer Perceptron
* Convolutional Neural Network
* Random Forest
* K-Means
* K-Nearest Neighbors
* Support Vector Machine
* Logistic Regression
* AdaBoost
* Stochastic Gradient Descent
* Naive Bayes

### Import Required Libraries

In [None]:
import os
import time as t
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from skimage import io, transform
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import tensorflow.keras as k
from keras.losses import categorical_crossentropy
from keras.utils import np_utils

# Report which Keras version is in use
print(f'Keras version {k.__version__}')

# Notebook start time, used as the reference for elapsed-time reports
TI = t.time()

In [None]:
# Useful functions and shortcuts
def sp(int):
    """Return a string consisting of ``int`` space characters."""
    blank = ' '
    return blank * int


def timer(ti, tf, rnd = 2):
    """Print and return the elapsed time between two timestamps.

    ti, tf -- start and end times (e.g. values returned by time.time())
    rnd    -- number of decimal places kept in the result

    Returns ``tf - ti`` rounded to ``rnd`` places; the same value is
    printed followed by ' sec'.
    """
    sc = round(tf - ti, rnd)
    print(f'{sc} sec')
    return sc


def norm_pic(pic):
    """Mean-center ``pic`` and scale it by its value range, as float32.

    Computes ``(pic - mean) / (max - min)`` and casts the whole result to
    float32. The statistics come from ``pic.mean()``, ``pic.max()`` and
    ``pic.min()``: for a NumPy array these are global scalars, while for a
    pandas DataFrame they are computed per column, so each column is
    normalized independently.

    NOTE: an input whose max equals its min divides by zero and yields
    NaN/inf values.
    """
    # Cast the quotient, not just the denominator, so the returned data
    # really is float32
    return ((pic - pic.mean()) / (pic.max() - pic.min())).astype('float32')


def batcher(num_pics):
    """Return the batch sizes that split ``num_pics`` evenly for CNN input.

    Only batch sizes from 11 through 150 that divide ``num_pics`` exactly
    are kept.

    Returns a dict with a single key, 'batch_size', holding a list of
    ``(batch_size, iterations)`` tuples in ascending batch-size order,
    where ``iterations`` is ``num_pics // batch_size``.
    """
    # Sizes above 150 are never accepted, so stop the search there instead
    # of scanning all the way up to num_pics
    largest = min(num_pics, 150)
    # Floor division keeps the iteration count exact for any integer size
    sizes = [(size, num_pics // size)
             for size in range(11, largest + 1)
             if num_pics % size == 0]
    return {'batch_size': sizes}

def reshaper(xtrain, xtest, ytrain, ytest, img_shape = (28, 28, 1), num_classes = 10):
    """Reshape feature sets into image arrays and one-hot encode the labels.

    xtrain, xtest -- array-likes with one sample per row; each row must
                     hold exactly as many values as ``img_shape`` describes
    ytrain, ytest -- class labels, passed to ``np_utils.to_categorical``
    img_shape     -- per-sample output shape (default 28 x 28 x 1)
    num_classes   -- width of the one-hot label encoding (default 10)

    Returns the tuple ``(xtr, xtst, ytr, ytst)``.
    """
    # Convert first so inputs without a .shape attribute (e.g. lists) work
    xtr = np.asarray(xtrain)
    xtst = np.asarray(xtest)
    xtr = xtr.reshape(xtr.shape[0], *img_shape)
    xtst = xtst.reshape(xtst.shape[0], *img_shape)
    ytr = np_utils.to_categorical(ytrain, num_classes)
    ytst = np_utils.to_categorical(ytest, num_classes)
    return (xtr, xtst, ytr, ytst)

# Shared random seed: passed as `random_state` when classifiers are
# instantiated and when the data is split, so runs are repeatable
rnd_st = 42

# 2) Prepare Overhead-MNIST

In [None]:
# Dataset locations
path = '../input/overheadmnist/overhead/'
path_tr = path + 'training/'

# Load the CSV files as dataframes
train = pd.read_csv(path + 'train.csv')
labels = pd.read_csv(path + 'labels.csv')
classes = pd.read_csv(path + 'classes.csv')

# Split the label table by dataset, keeping only the descriptive columns
lbl_cols = ['image', 'class', 'label']
tr_labels = labels.loc[labels['dataset'] == 'train', lbl_cols]
ts_labels = labels.loc[labels['dataset'] == 'test', lbl_cols]

# Create master DFs to export: label info joined (by index) to the pixel columns
# NOTE(review): master_ts joins the *test* labels to the *train* pixel
# columns; no test pixel data is loaded in this cell -- confirm this is intended
pixels = train.drop('label', axis = 1)
master_tr = tr_labels.join(pixels)
master_ts = ts_labels.join(pixels)

# Save master DFs for future notebooks
master_tr.to_csv('master_tr.csv')
master_ts.to_csv('master_ts.csv')

# Reference list of class names
clss_lst = classes['class'].values

# Store useful values
tot_pics, num_classes = len(train), len(classes)
results_dict = dict()

# Report the time elapsed since the notebook started
_ = timer(TI, t.time())

### Create training and validation sets

In [None]:
# Normalize the feature columns (everything except the label)
X = norm_pic(train.drop(columns = 'label'))

# Target vector of class labels
y = train['label']

# Hold out a stratified 20% of the training data for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size = .2,
    stratify = y,
    random_state = rnd_st,
)

# 3) Model Creation & Training
## Multi-Layer Perceptron (Neural Network)

In [None]:
from sklearn.neural_network import MLPClassifier

# One-hot encode the labels and rename the columns to the class names
# NOTE(review): assumes the order of clss_lst matches the sorted label
# values produced by get_dummies -- confirm against classes.csv
y_dum, y_val_dum = pd.get_dummies(y_train), pd.get_dummies(y_val)
y_dum.columns = y_val_dum.columns = clss_lst

# Untuned classifier with the shared seed
mlp = MLPClassifier(random_state = rnd_st)

# Time fitting on the training data plus probability prediction on the
# validation data (fit returns the estimator, so the calls can be chained)
st = t.time()
mlp_pred = mlp.fit(X_train, y_dum).predict_proba(X_val)
_ = timer(st, t.time())

In [None]:
# Predict label indicators for the validation set
y_pred = mlp.predict(X_val)

# Score the validation predictions
acc = metrics.accuracy_score(y_val_dum, y_pred)
mlp_rpt = metrics.classification_report(
    y_val_dum, y_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_val_dum, y_pred))}

# Tabular view of the report, and the stored results entry
mlp_results = pd.DataFrame(mlp_rpt).transpose()

results_dict['Multi-Layer Perceptron'] = {
    'accuracy' : acc,
    'classification report' : mlp_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{mlp_results}')

## Convolutional Neural Network

In [None]:
# Build the CNN inputs: reshaped feature arrays and one-hot encoded labels
X_tr, X_vl, y_tr, y_vl = reshaper(xtrain = X_train, xtest = X_val,
                                  ytrain = y_train, ytest = y_val)

In [None]:
# Instantiate classifier
cnn = k.Sequential()

# Add layers & compile: one 3x3 convolution (32 filters), flatten, a
# 100-unit dense layer and a 10-way softmax output. No activation is given
# for the first two layers and no optimizer is passed to compile, so the
# Keras defaults apply (this is the untuned baseline).
cnn.add(k.layers.Conv2D(filters = 32, kernel_size = (3, 3)))
cnn.add(k.layers.Flatten())
cnn.add(k.layers.Dense(100))
cnn.add(k.layers.Dense(10, activation = 'softmax'))

cnn.compile(loss = 'categorical_crossentropy', 
            metrics = ['accuracy', k.metrics.CategoricalAccuracy()])

# Fit and evaluate
st = t.time()

cnn.fit(X_tr, y_tr)

# Make predictions on evaluation data.
# Pick exactly one class per sample via argmax: rounding the softmax output
# instead produces an all-zero row whenever no probability exceeds 0.5.
cnn_prob = cnn.predict(X_vl)
cnn_pred = np.eye(cnn_prob.shape[1], dtype = 'float32')[cnn_prob.argmax(axis = 1)]

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, cnn_pred)
cnn_rpt = metrics.classification_report(
    y_vl, cnn_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, cnn_pred))}

# Tabular view of the report, and the stored results entry
cnn_results = pd.DataFrame(cnn_rpt).transpose()

results_dict['Convolutional Neural Network'] = {
    'accuracy' : acc,
    'classification report' : cnn_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{cnn_results}')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Untuned classifier with the shared seed
fst = RandomForestClassifier(random_state = rnd_st)

# Time fitting on the one-hot training labels plus prediction on the
# validation data (fit returns the estimator, so the calls can be chained)
st = t.time()
fst_pred = fst.fit(X_train, y_tr).predict(X_val)
_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, fst_pred)
fst_rpt = metrics.classification_report(
    y_vl, fst_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, fst_pred))}

# Tabular view of the report, and the stored results entry
fst_results = pd.DataFrame(fst_rpt).transpose()

results_dict['Random Forest'] = {
    'accuracy' : acc,
    'classification report' : fst_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{fst_results}')

## K-Means

In [None]:
from sklearn.cluster import KMeans

# One cluster per class, seeded for repeatability
kmn = KMeans(n_clusters = 10, random_state = rnd_st)

st = t.time()
# KMeans is unsupervised: y_train is accepted by fit but not used
kmn.fit(X_train, y_train)

# Cluster ids are arbitrary and carry no class meaning, so translate each
# cluster to the most frequent training label among its members before the
# predictions are compared with the true labels
cluster_label = pd.crosstab(kmn.labels_, np.asarray(y_train)).idxmax(axis = 1)
kmn_lbl = cluster_label.reindex(kmn.predict(X_val)).to_numpy()

# One-hot encode with a fixed column per training class so the width does
# not shrink when some class is never predicted
# NOTE(review): column order matches y_vl only if the labels are the
# integers 0..9 -- confirm against the label encoding
kmn_pred = np.asarray(pd.get_dummies(
    pd.Categorical(kmn_lbl, categories = np.unique(y_train))))

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, kmn_pred)
kmn_rpt = metrics.classification_report(
    y_vl, kmn_pred,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, kmn_pred))}

# Tabular view of the report, and the stored results entry
kmn_results = pd.DataFrame(kmn_rpt).transpose()

results_dict['K-Means'] = {
    'accuracy' : acc,
    'classification report' : kmn_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{kmn_results}')

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Untuned classifier
knn = KNeighborsClassifier()

# Time fitting on the one-hot training labels plus prediction on the
# validation data (fit returns the estimator, so the calls can be chained)
st = t.time()
knn_pred = knn.fit(X_train, y_dum).predict(X_val)
_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, knn_pred)
knn_rpt = metrics.classification_report(
    y_vl, knn_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, knn_pred))}

# Tabular view of the report, and the stored results entry
knn_results = pd.DataFrame(knn_rpt).transpose()

results_dict['K-Nearest Neighbors'] = {
    'accuracy' : acc,
    'classification report' : knn_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{knn_results}')

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Untuned classifier; the shared seed is passed for consistency with the
# other seeded classifiers
svm = SVC(random_state = rnd_st)

st = t.time()
svm.fit(X_train, y_train)

# One-hot encode the predictions with one column per class seen in
# training, so the width stays fixed even if a class is never predicted
# NOTE(review): column order matches y_vl only if the labels are the
# integers 0..9 -- confirm against the label encoding
svm_pred = np.asarray(pd.get_dummies(
    pd.Categorical(svm.predict(X_val), categories = svm.classes_)))

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, svm_pred)
svm_rpt = metrics.classification_report(
    y_vl, svm_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, svm_pred))}

# Tabular view of the report, and the stored results entry
svm_results = pd.DataFrame(svm_rpt).transpose()

results_dict['Support Vector Machine'] = {
    'accuracy' : acc,
    'classification report' : svm_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{svm_results}')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Untuned classifier; the shared seed is passed for consistency with the
# other seeded classifiers
lgc = LogisticRegression(random_state = rnd_st)

st = t.time()
lgc.fit(X_train, y_train)

# One-hot encode the predictions with one column per class seen in
# training, so the width stays fixed even if a class is never predicted
# NOTE(review): column order matches y_vl only if the labels are the
# integers 0..9 -- confirm against the label encoding
lgc_pred = np.asarray(pd.get_dummies(
    pd.Categorical(lgc.predict(X_val), categories = lgc.classes_)))

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, lgc_pred)
lgc_rpt = metrics.classification_report(
    y_vl, lgc_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, lgc_pred))}

# Tabular view of the report, and the stored results entry
lgc_results = pd.DataFrame(lgc_rpt).transpose()

results_dict['Logistic Regression'] = {
    'accuracy' : acc,
    'classification report' : lgc_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{lgc_results}')

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Untuned classifier, seeded with the shared random state for repeatability
adb = AdaBoostClassifier(random_state = rnd_st)

# Start the clock before fitting so the timing covers fit + predict, as in
# the other model cells
st = t.time()
adb.fit(X_train, y_train)

# One-hot encode the predictions with one column per class seen in
# training, so the width stays fixed even if a class is never predicted
# NOTE(review): column order matches y_vl only if the labels are the
# integers 0..9 -- confirm against the label encoding
adb_pred = np.asarray(pd.get_dummies(
    pd.Categorical(adb.predict(X_val), categories = adb.classes_)))

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_vl, adb_pred)
adb_rpt = metrics.classification_report(
    y_vl, adb_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_vl, adb_pred))}

# Tabular view of the report, and the stored results entry
adb_results = pd.DataFrame(adb_rpt).transpose()

results_dict['AdaBoost'] = {
    'accuracy' : acc,
    'classification report' : adb_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{adb_results}')

## Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

# Untuned classifier, seeded with the shared random state so repeated runs
# give the same baseline
sgd = SGDClassifier(random_state = rnd_st)

# Time fitting on the training data plus prediction on the validation data
st = t.time()
sgd.fit(X_train, y_train)

sgd_pred = sgd.predict(X_val)

_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_val, sgd_pred)
sgd_rpt = metrics.classification_report(
    y_val, sgd_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_val, sgd_pred))}

# Tabular view of the report, and the stored results entry
sgd_results = pd.DataFrame(sgd_rpt).transpose()

results_dict['Stochastic Gradient Descent'] = {
    'accuracy' : acc,
    'classification report' : sgd_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{sgd_results}')

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Untuned classifier
gnb = GaussianNB()

# Time fitting on the training data plus prediction on the validation data
# (fit returns the estimator, so the calls can be chained)
st = t.time()
gnb_pred = gnb.fit(X_train, y_train).predict(X_val)
_ = timer(st, t.time())

In [None]:
# Score the validation predictions
acc = metrics.accuracy_score(y_val, gnb_pred)
gnb_rpt = metrics.classification_report(
    y_val, gnb_pred,
    target_names = clss_lst,
    zero_division = 1,
    output_dict = True,
)

# One confusion matrix per class, keyed by class name
conf = {name: cm for name, cm
        in zip(clss_lst, metrics.multilabel_confusion_matrix(y_val, gnb_pred))}

# Tabular view of the report, and the stored results entry
gnb_results = pd.DataFrame(gnb_rpt).transpose()

results_dict['Naive Bayes'] = {
    'accuracy' : acc,
    'classification report' : gnb_rpt,
    'confusion matrix' : conf,
}

print(f'accuracy: {acc}\n\n{gnb_results}')

In [None]:
import json

# Gather every model's results (one column per model) and export them
resultsDF = pd.DataFrame.from_dict(results_dict)
resultsDF.to_json(path_or_buf = 'baselines.json')

# Total elapsed run time since the notebook started
TF = t.time()
timer(TI, TF)

In [None]:
# Accuracy of every model, side by side (the 'accuracy' row, indexed by model)
resultsDF.loc['accuracy']

# Summary
The Support Vector Machine shows the best 'out-of-the-box' performance. The Gaussian Naive Bayes and Multi-Layer Perceptron models also achieve more than 50% accuracy. All other models fall below this threshold, averaging 42%. The least accurate model is K-Means, at 9.6%. Two models, the Multi-Layer Perceptron (MLP) and Logistic Regression (LR), failed to converge.

Final scores and confusion matrices are exported in json format as 'baselines.json'.
## Next Steps
* Complete EDA
* Image Processing
* Hyperparameter optimization