In [1]:
import pandas as pd
import numpy as np
import os
import gdown
import math

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_addons as tfa


import pandas as pd
import urllib.request
from urllib.parse import urlparse

import warnings
warnings.simplefilter(action='ignore')
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
# Expose no CUDA device to this process ("-1" matches no device id), so the
# TensorFlow work in this notebook runs on the CPU.
# NOTE(review): this is set after `import tensorflow`; it presumably still takes
# effect because no GPU has been initialised yet — confirm on the target setup.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [3]:
# Directory holding the poster image files; passed as `directory=` to every
# flow_from_dataframe call in this notebook. The commented-out lines are
# earlier candidate locations (Kaggle input paths).
#BASE_DIR ="./"
#IMAGES_DIR = "/kaggle/input/movie-poster-genre-2021/images/"
#IMAGES_DIR = "/kaggle/input/moviepostergenre20212/images/"
#IMAGES_DIR = "/kaggle/input/moviepostergenre20212/images/images/"
# NOTE(review): this relative path depends on the notebook's working directory.
IMAGES_DIR = "../../../../../rainerenglisch-AIDA_movie_genre_cdn2/images/"

In [4]:
# Holdout list shared via Google Drive:
# https://drive.google.com/file/d/1VGMVj1keT6FWxlC0SWr0BHpvYcmaQgrV/view?usp=sharing
#url = 'https://drive.google.com/uc?id=1MlXZKtRUP7pOehDR9j5MMvBNXF4UNNUx'
url = 'https://drive.google.com/uc?id=1VGMVj1keT6FWxlC0SWr0BHpvYcmaQgrV'
holdout_fname = "df_holdout.csv"

# Download only when no local copy exists yet.
if not os.path.exists(holdout_fname):
    gdown.download(url, holdout_fname, quiet=False)

# Always load the CSV. This used to sit inside the `if` above, which left
# `df_holdout` undefined on every run where the file was already on disk.
df_holdout = pd.read_csv(holdout_fname, sep=";")

# Please exclude these 1000 movie IDs from training and from all model
# engineering; use them only for the leaderboard entries.

# Preview as the cell's last expression so the notebook actually renders it.
df_holdout.head()

In [5]:
parquet_fname = "df.parquet.gzip"

# Load the metadata table and prepare it in a single chain.
df = (
    pd.read_parquet(parquet_fname)
    # keep only rows where the file exists in the data set
    .loc[lambda frame: frame["file_exists"]]
    # turn every genre_id entry into a plain Python list
    # (the value read from parquet is presumably an array — TODO confirm)
    .assign(genre_id=lambda frame: frame["genre_id"].apply(list))
)

In [6]:

%%time
datagen = ImageDataGenerator(rescale=1 / 255.)#, validation_split=0.1)
BATCH_SIZE = 64
train_generator = datagen.flow_from_dataframe(
    dataframe=df.loc[~df.is_holdout],
    directory=IMAGES_DIR,
    x_col="filename",
    y_col="genre_id",
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=False,
    class_mode="categorical",    
    target_size=(299, 299),
    subset='training',
    validate_filenames=False
)

Found 266957 non-validated image filenames belonging to 19 classes.
CPU times: user 976 ms, sys: 90.9 ms, total: 1.07 s
Wall time: 1.03 s


In [7]:
%%time
#datagen = ImageDataGenerator(rescale=1 / 255.)#, validation_split=0.1)
valid_generator = datagen.flow_from_dataframe(
    dataframe=df.loc[df.is_holdout],
    directory=IMAGES_DIR,
    x_col="filename",
    y_col="genre_id",
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=False,
    class_mode="categorical",
    target_size=(299, 299),
    subset='training',
    validate_filenames=False,
)

Found 980 non-validated image filenames belonging to 19 classes.
CPU times: user 8.56 ms, sys: 110 µs, total: 8.67 ms
Wall time: 7.67 ms


# load model

In [8]:
# Location of the exported final model (a directory, not a single file).
final_model_dir = "./final_model/DenseNet169_1/"
model = keras.models.load_model(final_model_dir)

In [9]:
# Replace the loaded model's weights with those stored in this checkpoint file.
checkpoint_path = "./model_checkpoints/saved-model-06-0.46.hdf5"
model.load_weights(filepath=checkpoint_path)

# Threshold optimization with test (holdout) data

In [10]:
from keras import metrics

# Decision threshold handed to each F1 metric below.
threshold = 0.35

# F1 metrics over the 19 genre classes, one per averaging mode.
# Fix: the `f1_micro` line used to end with a stray comma, which bound the name
# to a one-element tuple instead of the metric object itself.
f1_micro = tfa.metrics.F1Score(num_classes=19, average='micro', name='f1_micro', threshold=threshold)
f1_macro = tfa.metrics.F1Score(num_classes=19, average='macro', name='f1_macro', threshold=threshold)
f1_weighted = tfa.metrics.F1Score(num_classes=19, average='weighted', name='f1_score_weighted', threshold=threshold)


In [11]:
# Model scores for every holdout image, in generator order (shuffle is off).
y_pred_test = model.predict(valid_generator)

# Multi-hot encode the holdout labels: one row per sample, one 0/1 column for
# each of the 19 class indices.
y_true_test = np.array([
    [int(class_idx in sample_labels) for class_idx in range(19)]
    for sample_labels in valid_generator.labels
])

In [12]:
from sklearn.metrics import f1_score

# Ten evenly spaced global thresholds between 0.1 and 0.5.
ths = np.linspace(0.1, 0.5, 10)

# One record per threshold; predictions are binarised once and reused for both
# averaging modes.
sweep_rows = []
for th in ths:
    hard_preds = (y_pred_test > th) * 1.
    sweep_rows.append({
        'threshold': th,
        'f1-micro': f1_score(y_true_test, hard_preds, average="micro"),
        'f1-weighted': f1_score(y_true_test, hard_preds, average="weighted"),
        'class': "all",
    })

pd.DataFrame(sweep_rows)

Unnamed: 0,threshold,f1-micro,f1-weighted,class
0,0.1,0.467956,0.475777,all
1,0.144444,0.503044,0.489854,all
2,0.188889,0.517184,0.488811,all
3,0.233333,0.520643,0.483615,all
4,0.277778,0.511047,0.4666,all
5,0.322222,0.494681,0.443013,all
6,0.366667,0.481085,0.427526,all
7,0.411111,0.450195,0.395599,all
8,0.455556,0.404662,0.355617,all
9,0.5,0.363463,0.317105,all


In [13]:
from sklearn.metrics import f1_score

# Candidate thresholds, evaluated independently for each class.
ths = np.linspace(0.1, 0.5, 9)

# One column of binary F1 scores per class (rows follow `ths`). All columns are
# collected first and the frame is built once, instead of growing it with
# pd.concat inside the loop.
f1_by_class = {
    f'f1-class_{cl}': [f1_score(y_true_test[:, cl], (y_pred_test[:, cl] > th) * 1.) for th in ths]
    for cl in range(19)
}
df_ths = pd.DataFrame({'threshold': ths, **f1_by_class})

# `df_ths` itself stays a plain DataFrame for the following cells. The Styler
# has to be the cell's last expression to be rendered; it used to be created and
# thrown away, so no highlighting was ever shown. Only the F1 columns are
# highlighted, not the threshold column.
df_ths.style.highlight_max(color='lightgreen', axis=0, subset=list(f1_by_class))

Unnamed: 0,threshold,f1-class_0,f1-class_1,f1-class_2,f1-class_3,f1-class_4,f1-class_5,f1-class_6,f1-class_7,f1-class_8,f1-class_9,f1-class_10,f1-class_11,f1-class_12,f1-class_13,f1-class_14,f1-class_15,f1-class_16,f1-class_17,f1-class_18
0,0.1,0.406877,0.352381,0.580311,0.627393,0.482143,0.456395,0.630189,0.121212,0.4,0.511696,0.318725,0.18125,0.421053,0.190476,0.168675,0.430177,0.448179,0.303797,0.081081
1,0.15,0.359375,0.362416,0.650307,0.634294,0.527132,0.487633,0.67884,0.036364,0.4,0.537906,0.329545,0.233333,0.422535,0.054054,0.161616,0.422175,0.538182,0.285714,0.112676
2,0.2,0.317949,0.26087,0.662252,0.649206,0.529148,0.495902,0.694935,0.0,0.363636,0.502128,0.335766,0.282486,0.422222,0.0,0.136986,0.435028,0.578723,0.313725,0.156863
3,0.25,0.231707,0.20202,0.661972,0.663851,0.463054,0.508314,0.706179,0.0,0.380952,0.459658,0.346667,0.302158,0.346154,0.0,0.147059,0.388158,0.606635,0.26087,0.2
4,0.3,0.225166,0.152174,0.671642,0.675651,0.417582,0.502762,0.700549,0.0,0.4,0.425068,0.297436,0.317757,0.321678,0.0,0.098361,0.332046,0.571429,0.1,0.242424
5,0.35,0.142857,0.046512,0.666667,0.681382,0.4,0.510769,0.672646,0.0,0.4,0.409496,0.2,0.333333,0.287879,0.0,0.101695,0.294372,0.563218,0.054054,0.206897
6,0.4,0.122137,0.023529,0.689076,0.660317,0.3875,0.478689,0.662441,0.0,0.333333,0.343234,0.133333,0.268657,0.274809,0.0,0.071429,0.214634,0.545455,0.0,0.230769
7,0.45,0.109375,0.023529,0.689076,0.611046,0.337838,0.427046,0.602041,0.0,0.352941,0.249084,0.083916,0.275862,0.198347,0.0,0.072727,0.149733,0.52,0.0,0.16
8,0.5,0.065041,0.023529,0.695652,0.575916,0.3,0.393822,0.555354,0.0,0.25,0.158103,0.057971,0.25,0.121739,0.0,0.074074,0.069364,0.468085,0.0,0.173913


In [14]:
# For every F1 column (everything after `threshold`), the row label with the best score.
f1_columns = df_ths.iloc[:, 1:]
argmax_index = f1_columns.idxmax(axis=0)

# Look up the threshold stored in each of those rows -> one threshold per class.
class_thresholds = df_ths.loc[argmax_index.to_numpy(), "threshold"].to_numpy()
class_thresholds

array([0.1 , 0.15, 0.5 , 0.35, 0.2 , 0.35, 0.25, 0.1 , 0.15, 0.15, 0.25,
       0.35, 0.15, 0.1 , 0.1 , 0.2 , 0.25, 0.2 , 0.3 ])

In [15]:
# Binarise once with the per-class thresholds (broadcast across the class axis).
# NOTE(review): as the printed header says, these thresholds were chosen on the
# same test data they are scored on here, so the numbers are optimistic.
test_hard_preds = (y_pred_test > class_thresholds) * 1.

f1_micro_opt_th = f1_score(y_true_test, test_hard_preds, average="micro")
f1_weighted_opt_th = f1_score(y_true_test, test_hard_preds, average="weighted")

print("Class thresholds optimized on test set:")
print(f"f1_micro_opt_th: {f1_micro_opt_th:.3f}, f1_weighted_opt_th: {f1_weighted_opt_th:.3f}")

Class thresholds optimized on test set:
f1_micro_opt_th: 0.529, f1_weighted_opt_th: 0.523


In [16]:
# Threshold optimization with training data (random sample of 20,000 rows)

In [17]:
#datagen = ImageDataGenerator(rescale=1 / 255.)#, validation_split=0.1)
BATCH_SIZE = 64

# Random 20,000-row subset of the non-holdout rows, used to tune thresholds.
# Fix: `sample` now gets a fixed `random_state`; without it a different subset
# was drawn on every run, so the tuned thresholds could not be reproduced.
train_sample = df.loc[~df.is_holdout].sample(20000, random_state=42)

train2_generator = datagen.flow_from_dataframe(
    dataframe=train_sample,
    directory=IMAGES_DIR,
    x_col="filename",
    y_col="genre_id",
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=False,  # keep predictions aligned with `train2_generator.labels`
    class_mode="categorical",
    target_size=(299, 299),
    subset='training',
    validate_filenames=False
)

Found 20000 non-validated image filenames belonging to 19 classes.


In [None]:
# Model scores for the sampled training rows, in generator order
# (train2_generator is built with shuffle=False).
y_pred_train = model.predict(train2_generator)

In [None]:
# Multi-hot encode the sampled training labels: one row per sample, one 0/1
# column for each of the 19 class indices.
y_true_train = np.array([
    [int(class_idx in sample_labels) for class_idx in range(19)]
    for sample_labels in train2_generator.labels
])

In [None]:
from sklearn.metrics import f1_score

# Candidate thresholds, evaluated independently for each class.
ths = np.linspace(0.1, 0.5, 9)

# One column of binary F1 scores per class, computed on the training sample
# (rows follow `ths`). All columns are collected first and the frame is built
# once, instead of growing it with pd.concat inside the loop.
f1_by_class = {
    f'f1-class_{cl}': [f1_score(y_true_train[:, cl], (y_pred_train[:, cl] > th) * 1.) for th in ths]
    for cl in range(19)
}
df_ths = pd.DataFrame({'threshold': ths, **f1_by_class})

# `df_ths` itself stays a plain DataFrame for the following cells. The Styler
# has to be the cell's last expression to be rendered; it used to be created and
# thrown away, so no highlighting was ever shown. Only the F1 columns are
# highlighted, not the threshold column.
df_ths.style.highlight_max(color='lightgreen', axis=0, subset=list(f1_by_class))

In [None]:
# For every F1 column (everything after `threshold`), the row label with the best score.
f1_columns = df_ths.iloc[:, 1:]
argmax_index = f1_columns.idxmax(axis=0)

# Look up the threshold stored in each of those rows -> one threshold per class.
class_thresholds = df_ths.loc[argmax_index.to_numpy(), "threshold"].to_numpy()
class_thresholds

In [None]:
# Score the per-class thresholds (tuned on the training sample) against the
# holdout predictions, so the result is comparable with the test-tuned run.
# Fix: this cell referenced `y_true` / `y_pred`, which are not defined anywhere
# in the visible notebook and raised a NameError.
# NOTE(review): evaluation on the holdout arrays is assumed to be the intent;
# switch to y_true_train / y_pred_train if an in-sample score was wanted.
holdout_hard_preds = (y_pred_test > class_thresholds) * 1.

f1_micro_opt_th = f1_score(y_true_test, holdout_hard_preds, average="micro")
f1_weighted_opt_th = f1_score(y_true_test, holdout_hard_preds, average="weighted")
print("Class thresholds optimized on training set:",
        f"f1_micro_opt_th: {f1_micro_opt_th:.3f}, f1_weighted_opt_th: {f1_weighted_opt_th:.3f}",
      sep="\n")