In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras import callbacks

import glob
import os

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
files = glob.glob('./data/**/*.npy')
im_files = glob.glob('./data/**/*.png')
labels = [f.split('/')[2] for f in files]

u_labels = set(labels)
label_dict = dict(zip(u_labels, range(len(u_labels))))
reverse_dict = dict(zip(label_dict.values(), label_dict.keys()))
num_labels = [label_dict[l] for l in labels]

num_labels = np.array(num_labels)
labels = np.array(labels)
len_data = len(labels)

Label imbalance:

In [3]:
top10 = Counter(labels).most_common()[:10]
top10

[('George_W_Bush', 530),
 ('Colin_Powell', 236),
 ('Tony_Blair', 144),
 ('Donald_Rumsfeld', 121),
 ('Gerhard_Schroeder', 109),
 ('Ariel_Sharon', 77),
 ('Hugo_Chavez', 71),
 ('Junichiro_Koizumi', 60),
 ('Jean_Chretien', 55),
 ('John_Ashcroft', 53)]

In [4]:
embeds = []
for f in files:
    embeds.append(np.load(f))
embeds = np.array(embeds)
embeds.shape

(4324, 128)

In [5]:
train_len = int(len(labels)*0.8)
missing_in_test = set([1])
# while loop is to ensure that test set has all its labels in train set
while len(missing_in_test) != 0:
    idx = np.random.permutation(len(labels))
    train_idx = idx[:train_len]
    test_idx = idx[train_len:]
    
    train_labels = num_labels[train_idx]
    test_labels = num_labels[test_idx]
    
    train_embeds = embeds[train_idx]
    test_embeds = embeds[test_idx]
    
    missing_in_test = set(test_labels) - set(train_labels)

Create a balanced dataset by resampling all classes 50 times.

In [6]:
n_repeat = 50
u_train_labels = np.unique(train_labels)

train_embeds2 = np.zeros((n_repeat*len(u_train_labels), train_embeds.shape[1]))
train_labels2 = np.zeros(n_repeat*len(u_train_labels), dtype=np.int32)
for i,l in enumerate(u_train_labels):
    idx = np.random.choice(np.where(train_labels==l)[0], n_repeat)
    train_embeds2[i*n_repeat:(i+1)*n_repeat,:] = train_embeds[idx]
    train_labels2[i*n_repeat:(i+1)*n_repeat] = train_labels[idx]
idx = np.random.permutation(len(train_embeds2))
train_embeds = train_embeds2[idx]
train_labels = train_labels2[idx]

## Keras Model

In [7]:
model = Sequential()
model.add(Dense(units=10, activation='relu', input_dim=train_embeds.shape[1]))
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=len(u_train_labels), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_3 (Dense)              (None, 158)               1738      
Total params: 3,138
Trainable params: 3,138
Non-trainable params: 0
_________________________________________________________________


In [8]:
epochs = 100
batch_size = 128
cb_list = [
            callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=1,
                min_lr=1e-9
            )
    ]
model.fit(train_embeds, train_labels, batch_size=batch_size, epochs=epochs, verbose=0, validation_split=0.05)

<keras.callbacks.History at 0x18188ab240>

In [9]:
print(model.evaluate(train_embeds, train_labels))
print(model.evaluate(test_embeds, test_labels))

[3.7107421336596524, 0.1303797468411017]
[4.219648952153377, 0.06127167630057803]


The accuracy is still better than random (0.04 > 1/158). The other reason its doing much better than test set is most likely due to the fact that its learning to classify one class better than the other. There is a class imbalance in test set since I did not resample the test_labels. As seen below the resampled version has an improved accuracy.

In [10]:
n_repeat = 50
u_test_labels = np.unique(test_labels)

test_embeds2 = np.zeros((n_repeat*len(u_test_labels), test_embeds.shape[1]))
test_labels2 = np.zeros(n_repeat*len(u_test_labels), dtype=np.int32)
for i,l in enumerate(u_test_labels):
    idx = np.random.choice(np.where(test_labels==l)[0], n_repeat)
    test_embeds2[i*n_repeat:(i+1)*n_repeat,:] = test_embeds[idx]
    test_labels2[i*n_repeat:(i+1)*n_repeat] = test_labels[idx]

In [11]:
print(model.evaluate(test_embeds2, test_labels2))

[4.368647848430433, 0.06394736842105263]
