## Preprocess the data

In [None]:
# Imports

import os
import numpy as np
import cv2

In [None]:
# Constants

# Side length, in pixels, of the square every image is resized to.
IMG_SIZE = 50

# Root folder of the unpacked archive; the trailing slash lets the
# sub-paths below be derived by plain string formatting.
BASE_PATH = 'the-simpsons-characters-dataset/'
DATASET_PATH = '{}simpsons_dataset/'.format(BASE_PATH)
TESTSET_PATH = '{}kaggle_simpson_testset/kaggle_simpson_testset/'.format(BASE_PATH)

### Download Dataset (if necessary)

In [None]:
# Fetch the dataset from Kaggle only when it has not been unpacked yet.
if not os.path.isdir(BASE_PATH):
    import zipfile

    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files("alexattia/the-simpsons-characters-dataset")

    # Unpack with the standard library instead of shelling out to `unzip`,
    # so the cell is plain Python and does not need the unzip binary.
    with zipfile.ZipFile("the-simpsons-characters-dataset.zip") as archive:
        archive.extractall("the-simpsons-characters-dataset")
    

In [None]:
# One label per character folder. Sorting makes the label -> index mapping
# reproducible (os.listdir returns entries in arbitrary order), and skipping
# non-directories keeps stray files out of the label list.
LABELS = sorted(
    entry for entry in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, entry))
)

### Load Dataset

In [None]:
# Function that loads the training images and their labels

def load_data():
    """Load the training images and their integer labels.

    Labels whose folder holds fewer than 100 files are reported and removed
    from the global ``LABELS`` list (in place). Every remaining image is read
    as grayscale and resized to ``IMG_SIZE`` x ``IMG_SIZE``. Files that
    OpenCV cannot decode are reported and skipped.

    Returns:
        A tuple ``(images, labels)``: ``images`` is a float64 array of the
        resized images and ``labels`` holds, for each image, the index of
        its label in the pruned ``LABELS`` list.
    """
    # Decide which labels to keep BEFORE touching LABELS: removing entries
    # from a list while iterating over it makes the loop skip the element
    # that follows each removal.
    kept = []
    for label in LABELS:
        label_folder = os.path.join(DATASET_PATH, label)
        file_names = os.listdir(label_folder)

        if len(file_names) < 100:
            print("[WARNING]: {} has only {} images".format(label, len(file_names)))
            continue

        kept.append((label, label_folder, file_names))

    # Slice assignment updates the list in place, so every other reference
    # to LABELS sees the pruned version.
    LABELS[:] = [label for label, _, _ in kept]

    train_images = []
    train_labels = []

    for label_index, (_, label_folder, file_names) in enumerate(kept):
        for _image in file_names:
            _image_path = os.path.join(label_folder, _image)
            img = cv2.imread(_image_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None for files it cannot decode.
                print("[WARNING]: could not read {}".format(_image_path))
                continue

            train_images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
            train_labels.append(label_index)

    return np.array(train_images, dtype=np.float64), np.array(train_labels)

In [None]:
# Load data

# load_data() also removes under-populated labels from LABELS in place,
# which is why LABELS is printed only after the call.
train_images, train_labels = load_data()
        
print(train_images.shape)
print(LABELS)

In [None]:
# Random Shuffle Train Images and LABELS (in unison)

# A single random permutation of the sample indices, applied to both arrays,
# keeps every image paired with its label. np.random.permutation already
# returns a shuffled index array, so no extra np.random.shuffle is needed.
s = np.random.permutation(len(train_labels))

train_images = train_images[s]
train_labels = train_labels[s]

In [None]:
# Sanity check: reordering the samples must leave the array shape unchanged.
print(train_images.shape)

### Test Loaded Dataset

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot Image Function

def plot_img_label(img, label, index=None):
    """Draw ``img`` in one cell of a 5x5 subplot grid, captioned with ``label``.

    Args:
        img: Image to display (rendered with the binary colormap).
        label: Text shown below the image as the x-axis label.
        index: Zero-based grid cell (0-24) to draw into. When omitted, the
            module-level loop variable ``i`` is used, which is how the
            existing callers select the cell.
    """
    if index is None:
        # Backward compatibility: callers loop with a global ``i`` and do
        # not pass the position explicitly.
        index = i

    plt.subplot(5, 5, index + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(img, cmap=plt.cm.binary)
    plt.xlabel(label)
    
def plot_img_colorbar(img):
    """Show ``img`` on the current axes with a colorbar and the grid off."""
    rendered = plt.imshow(img)
    # Pass the image explicitly; it is the same mappable that an
    # argument-less colorbar() would pick up as the current image.
    plt.colorbar(rendered)
    plt.grid(False)


In [None]:
# Preview the first (post-shuffle) training image together with a colorbar.
plt.figure(figsize = (3, 3))
plot_img_colorbar(train_images[0])
plt.show()

In [None]:
# Show the first 25 training images in a 5x5 grid. The loop variable must be
# named `i`: plot_img_label reads it as a global to pick the subplot cell.
plt.figure(figsize = (10, 10))
for i in range(25):
    plot_img_label(train_images[i], LABELS[train_labels[i]])
plt.show()

### Load TestSet

In [None]:

def load_testset(path):
    """Load the test images found in ``path`` with their integer labels.

    The label is taken from the file name: the part before the first dot,
    minus its last underscore-separated piece (``homer_simpson_12.jpg`` ->
    ``homer_simpson``). Files that cannot be read or resized, and files
    whose label is not in ``LABELS``, are reported and skipped.

    Args:
        path: Directory containing the test image files.

    Returns:
        A tuple ``(images, labels)``: a float64 array of grayscale images
        resized to ``IMG_SIZE`` x ``IMG_SIZE`` and an array with the index
        of each image's label in ``LABELS``.
    """
    test_images, test_labels = [], []

    for _file in os.listdir(path):
        # get label
        label = _file.split('.')[0].rpartition('_')[0]

        # get image
        _image_path = os.path.join(path, _file)
        img = cv2.imread(_image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an undecodable file by returning None;
            # report it the same way a failed resize is reported.
            print("Error while resizing image")
            continue

        try:
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        except cv2.error:
            # Only OpenCV failures are expected here; anything else
            # (e.g. KeyboardInterrupt) should propagate.
            print("Error while resizing image")
            continue

        if label not in LABELS:
            print("Error - label not found")
            continue

        test_images.append(img)
        test_labels.append(LABELS.index(label))

    return np.array(test_images, dtype=np.float64), np.array(test_labels)

In [None]:

# Test labels are indices into LABELS, so this must run after load_data()
# has pruned LABELS; otherwise train and test indices would not line up.
test_images, test_labels = load_testset(TESTSET_PATH)

print(test_images.shape)

## Build the model

In [None]:
# TensorFlow and tf.keras
import tensorflow
from tensorflow import keras

In [None]:
# Fully connected classifier over the grayscale images.
# `input_shape` is declared on the FIRST layer so Keras knows the input
# size when the model is created. The BatchNormalization keyword arguments
# that merely restated the Keras defaults were dropped.
model = keras.Sequential([
    keras.layers.BatchNormalization(input_shape=(IMG_SIZE, IMG_SIZE)),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    # One output unit per label; softmax turns them into class probabilities.
    keras.layers.Dense(len(LABELS), activation='softmax'),
])

In [None]:
# The labels are integer class indices (positions in LABELS), not one-hot
# vectors, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

In [None]:
# Train for 10 passes over the shuffled training set.
model.fit(
    x=train_images,
    y=train_labels,
    epochs=10,
)

## Test the model

### Predict TestSet

In [None]:
# One row of class probabilities (softmax output) per test image.
predictions = model.predict(x=test_images)

In [None]:

# Show the first predictions next to the expected labels, one figure each.
# The count is capped at the number of predictions so a small test set
# cannot cause an IndexError. The loop variable must stay `i`:
# plot_img_label reads it as a global to choose the subplot cell.
for i in range(min(25, len(predictions))):
    pred = LABELS[np.argmax(predictions[i])]
    exp = LABELS[test_labels[i]]
    plt.figure(figsize=(12, 12))
    plot_img_label(test_images[i], "pred: {} - exp: {}".format(pred, exp))
    plt.show()

In [None]:
# Compare the most probable class of every prediction with the expected
# class in one vectorized step. Comparing indices is equivalent to comparing
# the label names as long as LABELS has no duplicate entries.
predicted_labels = np.argmax(predictions, axis=1)
wrong = int(np.count_nonzero(predicted_labels != test_labels))

print("Wrong predictions -> {}".format(wrong))

# Guard against an empty test set, which would otherwise divide by zero.
if len(predictions) > 0:
    print("Test accuracy -> {}".format(1 - wrong/len(predictions)))
else:
    print("Test accuracy -> n/a (no test images)")

## Test error analysis

### Check which labels had the worst results

In [None]:
# Analyze errors per label

# One error counter per label, in LABELS order.
# NOTE(review): a misclassification is tallied under the PREDICTED label,
# not the expected one — confirm this is the intended breakdown.
label_error = [0] * len(LABELS)

for probabilities, expected_index in zip(predictions, test_labels):
    predicted_index = np.argmax(probabilities)
    if LABELS[predicted_index] != LABELS[expected_index]:
        label_error[predicted_index] += 1

print(label_error)

In [None]:
# Convert label_error into pandas dataframe

import pandas as pd

# Errors per label as a share of ALL test images, scaled to a real
# percentage (0-100) so the values match the column name.
df = pd.DataFrame(label_error, columns=['Error percentage'])
df.index = LABELS
df['Error percentage'] = 100 * df['Error percentage'] / len(test_images)

# Clear if Error is 0
df = df.replace(0, np.nan).dropna()

df.plot(kind='bar')
df