In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils import class_weight
from xgboost import XGBClassifier

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # scikit-learn >= 1.2 removed plot_confusion_matrix; keep the name usable
    # by pointing it at its documented replacement.
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Silence every warning category for the rest of the session. Note that this
# also hides any convergence or deprecation warnings raised by the libraries.
warnings.simplefilter('ignore')

## Read the Data

The Kreuz-Kreis-Plus-Minus-Gartenhag.csv dataset contains 28 x 28 pixel black-and-white images of the handwritten characters #, +, -, o and x.
The first column of the dataset contains the label of the image as an integer:

- 0 equals #
- 1 equals +
- 2 equals -
- 3 equals o
- 4 equals x



In [None]:
# Load the comma-separated dataset; the first line of the file is the header row.
csv_path = "../input/images/Kreuz-Kreis-Plus-Minus-Gartenhag.csv"
df = pd.read_csv(csv_path, header=0, sep=',')


After reading, check if there are any null values in the dataset:

In [None]:
# Count missing values per column. Printing the full per-column Series gets
# truncated by pandas for a wide frame, which can hide affected columns, so
# report the grand total and list only the columns that actually contain nulls.
null_counts = df.isnull().sum()
print("Total missing values:", int(null_counts.sum()))
print(null_counts[null_counts > 0])

## Preparation

In order to train different models, let's split the dataset into x (all columns which contain pixel data) and y (the label).

After that, split the data into a train and a test set:

In [None]:
# The label lives in the first column; every remaining column is pixel data.
# Both selections are positional so they cannot disagree: picking the features
# by column *name* would leak the label into x whenever the label column is
# not literally called '0'.
x = df.iloc[:, 1:]
y = df.iloc[:, 0]  # 1-D Series: estimators expect a flat target, not an (n, 1) frame

# Default split sizes are kept (no train_size/test_size given). The fixed seed
# makes the split repeatable and stratify keeps the class proportions equal in
# both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=90210
)

## Compare different algorithms

Next up, I'm going to compare the following algorithms:

- LogisticRegression
- RandomForestClassifier
- KNeighborsClassifier
- Support Vector Machine
- GaussianNB
- XGBClassifier

I'm going to use 5-fold cross-validation on the training set.

In addition, I'm going to plot a confusion matrix for every model:

In [None]:
# Benchmark every candidate with the same cross-validation folds, then refit it
# on the whole training split and draw its confusion matrix on the test split.
SEED = 90210

models = [
    # NOTE(review): LogisticRegression's default max_iter may stop before
    # convergence on raw pixel features - confirm and raise it if needed.
    ('LogReg', LogisticRegression()),
    ('RF', RandomForestClassifier(random_state=SEED)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('GNB', GaussianNB()),
    ('XGB', XGBClassifier(random_state=SEED))
]

scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] # , 'roc_auc'

# The splitter is identical for every model, so build it once outside the loop.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=SEED)

# Use plain arrays everywhere (CV, fit and predict) and flatten the labels to
# 1-D; np.ravel works whether y is a Series or a single-column DataFrame.
x_train_values = x_train.values
x_test_values = x_test.values
y_train_values = np.ravel(y_train)
y_test_values = np.ravel(y_test)

dfs = []
results = []
names = []

for name, model in models:
    cv_results = model_selection.cross_validate(
        model, x_train_values, y_train_values,
        cv=kfold, scoring=scoring, error_score="raise",
    )
    results.append(cv_results)
    names.append(name)
    # One row per fold, tagged with the model name for the later comparison.
    dfs.append(pd.DataFrame(cv_results).assign(model=name))

    # Refit on the full training split and predict the test split exactly once;
    # the same predictions feed the confusion matrix below.
    clf = model.fit(x_train_values, y_train_values)
    y_pred = clf.predict(x_test_values)

    # ConfusionMatrixDisplay replaces plot_confusion_matrix, which was removed
    # in scikit-learn 1.2.
    cm = confusion_matrix(y_test_values, y_pred, labels=clf.classes_)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    cm_display.plot()
    cm_display.ax_.set_title(name + ':')

final = pd.concat(dfs, ignore_index=True)
final

## Collect metrics

Next up I'm going to collect metrics by using the bootstrap method.
The bootstrap method is a resampling technique used to estimate statistics on a population by sampling a dataset with replacement.

In [None]:
# Bootstrap: for every model, draw 30 rows with replacement from its per-fold
# cross-validation results.
# A single seeded generator is shared by all draws so the resampling is
# repeatable while each model still receives different draws.
rng = np.random.RandomState(90210)

# unique() keeps first-appearance order, so the model order is the same on
# every run (iterating a set of strings is not).
bootstraps = [
    final.loc[final['model'] == model_name].sample(n=30, replace=True, random_state=rng)
    for model_name in final['model'].unique()
]

bootstrap_df = pd.concat(bootstraps, ignore_index=True)
# Long format: one row per (model, metric, value) for plotting.
results_long = pd.melt(bootstrap_df, id_vars=['model'], var_name='metrics', value_name='values')

time_metrics = ['fit_time', 'score_time'] # fit time metrics
is_time_metric = results_long['metrics'].isin(time_metrics)

## PERFORMANCE METRICS
results_long_nofit = results_long.loc[~is_time_metric].sort_values(by='values') # df without fit data
## TIME METRICS
results_long_fit = results_long.loc[is_time_metric].sort_values(by='values') # df with fit data

In [None]:
# Box plot of the bootstrapped classification metrics, grouped by model.
sns.set(font_scale=2.5)  # apply the theme before the figure is created
fig, ax = plt.subplots(figsize=(20, 12))
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_nofit, palette="Set3", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Comparison of Model by Classification Metric')
# The legend is anchored outside the axes; bbox_inches='tight' expands the
# saved area so it is not cut off in the PNG.
fig.savefig('./benchmark_models_performance.png', dpi=300, bbox_inches='tight')

In [None]:
# Box plot of the bootstrapped fit and score times, grouped by model.
sns.set(font_scale=2.5)  # apply the theme before the figure is created
fig, ax = plt.subplots(figsize=(20, 12))
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_fit, palette="Set3", ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Comparison of Model by Fit and Score Time')
# The legend is anchored outside the axes; bbox_inches='tight' expands the
# saved area so it is not cut off in the PNG.
fig.savefig('./benchmark_models_time.png', dpi=300, bbox_inches='tight')

In [None]:
# Standard deviation and mean of every classification metric, per model.
# sorted() gives a stable column order; iterating a set of strings does not.
metrics = sorted(results_long_nofit['metrics'].unique())
# String aggregator names select the pandas implementations explicitly;
# passing np.std / np.mean to agg is deprecated in recent pandas releases.
bootstrap_df.groupby('model')[metrics].agg(['std', 'mean'])

In [None]:
# Standard deviation and mean of the fit/score timings, per model.
# sorted() gives a stable column order; iterating a set of strings does not.
time_metrics = sorted(results_long_fit['metrics'].unique())
# String aggregator names select the pandas implementations explicitly;
# passing np.std / np.mean to agg is deprecated in recent pandas releases.
bootstrap_df.groupby('model')[time_metrics].agg(['std', 'mean'])

## Conclusion

Based on the experiment above, the Random Forest Classifier provides the best results.