In [13]:
import os
import pandas as pd
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adagrad
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
from keras.src.layers import Dropout

In [14]:
train_data =  "./data/train"
test_data = "./data/test"

In [15]:
def create_train_dataframe(directory):
    filepaths = []
    labels = []

    for class_label in os.listdir(directory):
        class_path = os.path.join(directory, class_label)

        if os.path.isdir(class_path):
            for filename in os.listdir(class_path):
                filepath = os.path.join(class_path, filename)
                filepaths.append(filepath)
                labels.append(class_label)

    return pd.DataFrame({'filepath': filepaths, 'label': labels})

In [16]:
def create_test_dataframe(folder_path, label = 'test'):
    filepaths = []
    labels = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg"):
            filepath = os.path.join(folder_path, filename)
            filepaths.append(filepath)
            labels.append(label)
            
    return pd.DataFrame({"filepath": filepaths, "label": labels})

In [17]:
print("Train df:")
train_df = create_train_dataframe(train_data)
train_df.head()

Train df:


Unnamed: 0,filepath,label
0,./data/train/Paper/image_811.jpg,Paper
1,./data/train/Paper/image_805.jpg,Paper
2,./data/train/Paper/image_193.jpg,Paper
3,./data/train/Paper/image_187.jpg,Paper
4,./data/train/Paper/image_839.jpg,Paper


In [18]:
print("Test df:")
test_df = create_test_dataframe(test_data)
test_df.head()

Test df:


Unnamed: 0,filepath,label
0,./data/test/test_image_1159.jpg,test
1,./data/test/test_image_588.jpg,test
2,./data/test/test_image_1165.jpg,test
3,./data/test/test_image_1171.jpg,test
4,./data/test/test_image_239.jpg,test


In [19]:
IMG_SIZE = 100

def cnn_model():
    model = Sequential()
    model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Dropout(rate=0.5))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [20]:
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=1./255)

In [9]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=1234)

In [21]:
results_base = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": [],
    "history": []
}

In [22]:
epochs = 100

In [12]:
batch_size = 32

for train_index, test_index in k_fold.split(train_df):
    train_data = train_df.iloc[train_index]
    test_data = train_df.iloc[test_index]

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_data,
        x_col="filepath",
        y_col="label",
        subset="training",
        batch_size=batch_size,
        target_size=(IMG_SIZE, IMG_SIZE),
        shuffle=True,
        seed=1234,
        color_mode='grayscale'
    )

    validation_generator = train_datagen.flow_from_dataframe(
      dataframe=train_data,
      x_col="filepath",
      y_col="label",
      subset="validation",
      batch_size=batch_size,
      target_size=(IMG_SIZE, IMG_SIZE),
      shuffle=True,
      seed=1234,
      color_mode='grayscale'
    )

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_data,
        x_col="filepath",
        y_col="label",
        batch_size=1,
        target_size=(IMG_SIZE, IMG_SIZE),
        shuffle=False,
        color_mode='grayscale'
    )

    model = cnn_model()

    history = model.fit(
        train_generator,
        epochs=epochs,
        validation_data=validation_generator
    )

    results_base['history'].append(history)

    y_true = test_data['label']
    y_pred = model.predict(test_generator)

    y_pred_classes = np.argmax(y_pred, axis=1)
    class_indices_list = list(test_generator.class_indices.keys())
    y_pred_classes = [class_indices_list[i] for i in y_pred_classes]

    report = classification_report(y_true, y_pred_classes, output_dict=True)

    results_base['accuracy'].append(report['accuracy'])
    results_base['precision'].append(report['macro avg']['precision'])
    results_base['recall'].append(report['macro avg']['recall'])
    results_base['f1'].append(report['macro avg']['f1-score'])

    print(report)
    print()

Found 2016 validated image filenames belonging to 3 classes.
Found 504 validated image filenames belonging to 3 classes.
Found 630 validated image filenames belonging to 3 classes.
Epoch 1/100


KeyboardInterrupt: 

In [None]:
epochs_fig = range(1, epochs + 1)
fig, axs = plt.subplots(5, 2, figsize=(15, 20))

for i, history in enumerate(results_base['history']):
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    axs[i, 0].plot(epochs_fig, train_acc, 'b', label=f'Fold {i+1} Training accuracy')
    axs[i, 0].plot(epochs_fig, val_acc, 'r', label=f'Fold {i+1} Validation accuracy')
    axs[i, 0].set_title(f'Fold {i+1} Accuracy')
    axs[i, 0].set_xlabel('Epochs')
    axs[i, 0].set_ylabel('Accuracy')
    axs[i, 0].legend()

    axs[i, 1].plot(epochs_fig, train_loss, 'b', label=f'Fold {i+1} Training loss')
    axs[i, 1].plot(epochs_fig, val_loss, 'r', label=f'Fold {i+1} Validation loss')
    axs[i, 1].set_title(f'Fold {i+1} Loss')
    axs[i, 1].set_xlabel('Epochs')
    axs[i, 1].set_ylabel('Loss')
    axs[i, 1].legend()

plt.tight_layout()
plt.show()

In [None]:
def advanced_cnn_model():
    model = Sequential()
    model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(16, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    optimizer = Adagrad(learning_rate=0.01)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [None]:
results_advanced = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": [],
    "history": []
}

In [None]:
batch_size = 128

for train_index, test_index in k_fold.split(train_df):
    train_data = train_df.iloc[train_index]
    test_data = train_df.iloc[test_index]
    
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_data,
        x_col="filepath",
        y_col="label",
        subset="training",
        batch_size=batch_size,
        target_size=(IMG_SIZE, IMG_SIZE),
        shuffle=True,
        seed=1234,
        color_mode='grayscale'
    )

    validation_generator = train_datagen.flow_from_dataframe(
      dataframe=train_data,
      x_col="filepath",
      y_col="label",
      subset="validation",
      batch_size=batch_size,
      target_size=(IMG_SIZE, IMG_SIZE),
      shuffle=True,
      seed=1234,
      color_mode='grayscale'
    )
    
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_data,
        x_col="filepath",
        y_col="label",
        batch_size=1,
        target_size=(IMG_SIZE, IMG_SIZE),
        shuffle=False,
        color_mode='grayscale'
    )

    model = advanced_cnn_model()

    history = model.fit(
        train_generator,
        epochs=epochs,
        validation_data=validation_generator
    )

    results_advanced['history'].append(history)
    
    y_true = test_data['label']
    y_pred = model.predict(test_generator)

    y_pred_classes = np.argmax(y_pred, axis=1)
    class_indices_list = list(test_generator.class_indices.keys())
    y_pred_classes = [class_indices_list[i] for i in y_pred_classes]

    report = classification_report(y_true, y_pred_classes, output_dict=True)
    
    results_advanced['accuracy'].append(report['accuracy'])
    results_advanced['precision'].append(report['macro avg']['precision'])
    results_advanced['recall'].append(report['macro avg']['recall'])
    results_advanced['f1'].append(report['macro avg']['f1-score'])
    
    print(report)
    print()

In [None]:
results = {
    "base": results_base,
    "advanced": results_advanced
}

In [None]:
metrics = ["accuracy", "precision", "recall", "f1"]
models = ["base", "advanced"]

for metric in metrics:
    plt.figure(figsize=(10, 5))
    plt.boxplot([results["base"][metric], results["advanced"][metric]], labels=models)
    plt.title(metric)
    plt.show()

In [None]:
fig, axs = plt.subplots(1, 4, figsize=(15, 4))

for i, metric in enumerate(metrics):
      ax = axs[i]
      max = -float('inf')
      
      for j, model in enumerate(models):
          values = results[model][metric]
          
          if len(values) == 0:
            values = [0]
          
          avg = np.mean(values)
       
          if avg > max:
            max = avg
              
          ax.bar(j, avg, 0.8, label=model)
          ax.text(j, avg + 0.01, str(round(avg, 3)), ha='center', va='bottom')


      ax.set_ylim([0, max * 1.09])
      ax.set_title(f'{metric}')
      ax.set_xticks([], [])
      ax.legend(loc='lower right')

plt.tight_layout()
plt.show()

In [23]:
batch_size = 32

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col="filepath",
    y_col="label",
    subset="training",
    batch_size=batch_size,
    target_size=(IMG_SIZE, IMG_SIZE),
    shuffle=True,
    seed=1234,
    color_mode='grayscale'
)

validation_generator = train_datagen.flow_from_dataframe(
  dataframe=train_df,
  x_col="filepath",
  y_col="label",
  subset="validation",
  batch_size=batch_size,
  target_size=(IMG_SIZE, IMG_SIZE),
  shuffle=True,
  seed=1234,
  color_mode='grayscale'
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col="filepath",
    y_col="label",
    batch_size=1, 
    target_size=(IMG_SIZE, IMG_SIZE),
    shuffle=False,
    color_mode='grayscale'
)

Found 2520 validated image filenames belonging to 3 classes.
Found 630 validated image filenames belonging to 3 classes.
Found 1350 validated image filenames belonging to 1 classes.


In [24]:
model = cnn_model()

In [25]:
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [26]:
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

print("Average train accuracy: ", np.mean(train_acc))
print("Average train loss: ", np.mean(train_loss))
print("Average val accuracy: ", np.mean(val_acc))
print("Average val loss: ", np.mean(val_loss))

Average train accuracy:  0.9899999976158143
Average train loss:  0.025697984551879927
Average val accuracy:  0.9684761762619019
Average val loss:  0.12292823862284422


In [27]:
y_pred = model.predict(test_generator)



In [28]:
y_pred_classes = np.argmax(y_pred, axis=1)
class_indices_list = list(train_generator.class_indices.keys())
print(class_indices_list)
y_pred_classes = [class_indices_list[i] for i in y_pred_classes]

['Paper', 'Rock', 'Scissors']


In [29]:
data = {'image': test_df['filepath'].apply(lambda x: os.path.basename(x)), 'predicted_class': y_pred_classes}
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,image,predicted_class
0,test_image_1159.jpg,Rock
1,test_image_588.jpg,Rock
2,test_image_1165.jpg,Paper
3,test_image_1171.jpg,Scissors
4,test_image_239.jpg,Paper


In [30]:
df.shape

(1350, 2)

In [32]:
df.to_csv('out/submission.csv', index=False)