> # AIDA with Transfer Learning

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import math
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_addons as tfa

#https://machinelearningmastery.com/how-to-use-transfer-learning-when-developing-convolutional-neural-network-models/
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.applications.densenet import DenseNet169
from keras.models import Model
from keras import metrics
from keras.callbacks import ModelCheckpoint, TensorBoard
from numba import cuda

import sklearn.model_selection as skms
from sklearn.utils import class_weight

#from wcs.google import google_drive_share
import urllib.request
from urllib.parse import urlparse

#from google.colab import drive
import src.helper.helper as hlp
import src.helper.const as const
import datetime as dt
import time

import warnings
warnings.simplefilter(action='ignore')
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Configuration

In [2]:
# Config
DP_TFDATA = "Data pipeline using tf.data"
DP_IMGGEN = "Data pipeline using tf.keras.ImageGenerator"
DP = DP_IMGGEN

LR = 1e-5 # Keep it small when transfer learning
BATCH_SIZE = 64
EPOCHS = 2
AUTOTUNE = tf.data.experimental.AUTOTUNE  # Adapt preprocessing and prefetching dynamically to reduce GPU and CPU idle time
SHUFFLE_BUFFER_SIZE = 1024 # Shuffle the training data by a chunck of 1024 observations

IMG_DIMS = [299, 299]
IMG_CHANNELS = 3  # Keep RGB color channels to match the input format of the model
LABEL_COLS = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
              'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
              'Thriller', 'War', 'Western']

DIR = './'
DATA_DIR_POSTER = DIR + '../data/raw/posters_v3/'
DATA_DIR_INTERIM = DIR + "../data/interim/"
DATA_DIR_RAW = DIR + "../data/raw/"
MODEL_DIR = DIR + "../models/"
BASE_DIR = DIR
IMAGES_DIR = DATA_DIR_POSTER
SEED = const.SEED
TENSORBOARD_LOGDIR = DIR + "tensorboard_logs/scalars/"

In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Create virtual GPUs
    try:
        tf.config.experimental.set_virtual_device_configuration(
            #OK, but solwer: 
            #gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024)],
            gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024),
                      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024)],
            #Error: gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10*1024)],
        )
        
        tf.config.experimental.set_virtual_device_configuration(
            #OK, but solwer: 
            #gpus[1], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024),
            #      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2.5*1024)],
            gpus[1], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024),
                      tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024)],
            #Error: gpus[1], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10*1024)],            
        )
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")

    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)

2 Physical GPU, 4 Logical GPUs


# Helper

In [4]:
def init_devices():
    # Check GPUs
    num_gpu = len(tf.config.list_physical_devices('GPU'))
    print("Num GPUs Available: ", num_gpu)

    gpus = tf.config.list_physical_devices('GPU')
    print(f"{f'Physical GPU Device: {gpus}' if gpus else 'No GPU available'}")

    if gpus:
        # Restrict TensorFlow to only allocate 6GB of memory on the first GPU
        try:
            """
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"Set memory growth for {len(gpus)} physical GPU(s)")
            """
            mem_lim = 10*1024  # 6GB
            tf.config.experimental.set_virtual_device_configuration(
                gpus[0],
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=mem_lim)])        
            #logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            #print(f"Set memory usage to {mem_lim/1000} GB for {len(gpus)} physical GPU(s) -> {len(logical_gpus)} logical GPU(s)")
            print(f"Set memory usage to {mem_lim/1000} GB for {len(gpus)} physical GPU(s)")

        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
        print("GPU(s) will be automatically choosen for model calculations below.")
    else:
        print("CPUs will be automatically choosen for model calculations below.")

# Datapipeline based on tf.data

In [5]:
def parse_function(filename, label):
    """Function that returns a tuple of normalized image array and labels array.
    Args:
        filename: string representing path to image
        label: 0/1 one-dimensional array of size N_LABELS
    """
    # Read an image from a file
    image_string = tf.io.read_file(DATA_DIR_POSTER + filename)
    # Decode it into a dense vector
    image_decoded = tf.image.decode_jpeg(image_string, channels=IMG_CHANNELS)
    # Resize it to fixed shape
    image_resized = tf.image.resize(image_decoded, [IMG_DIMS[0], IMG_DIMS[1]])
    # Normalize it from [0, 255] to [0.0, 1.0]
    image_normalized = image_resized / 255.0
    return image_normalized, label


def create_dataset(filenames, labels, cache=True):
    """Load and parse dataset.
    Args:
        filenames: list of image paths
        labels: numpy array of shape (BATCH_SIZE, N_LABELS)
        is_training: boolean to indicate training mode
    """
    # Create a first dataset of file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_function, num_parallel_calls=AUTOTUNE)
    
    if cache == True:
        # This is a small dataset, only load it once, and keep it in memory.
        dataset = dataset.cache()
        # Shuffle the data each buffer size
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
        
    # Batch the data for multiple steps
    dataset = dataset.batch(BATCH_SIZE)
    # Fetch batches in the background while the model is training.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    
    return dataset

# Preproc

In [6]:
#parquet_fname = DATA_DIR_INTERIM + "df_train_unbalanced_v3.gzip"
parquet_fname = DATA_DIR_INTERIM + "df_train_balanced_v3.gzip"
#!rm $parquet_fname

In [7]:
df = pd.read_parquet(parquet_fname)
#df['genre_id'] = df['genre_id'].apply(lambda x: list(x))

df.head()

Unnamed: 0,adult,id,original_title,popularity,video,url,poster_url,Action,Adventure,Animation,...,Romance,Science Fiction,TV Movie,Thriller,War,Western,filename,genre_id,genre_ids2,genre_ids2_list
0,False,52826,Nothing Lasts Forever,5.658,False,https://www.themoviedb.org/movie/52826,https://www.themoviedb.org/t/p/w500//ph2L3Rp3X...,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,ph2L3Rp3XbMzxuLTTQBxvtNgF13.jpg,"[878, 35, 14]","[Science Fiction,Comedy,Fantasy]","[Science Fiction, Comedy, Fantasy]"
1,False,460059,Burn Out,32.045,False,https://www.themoviedb.org/movie/460059,https://www.themoviedb.org/t/p/w500//3LeFOvzjZ...,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,3LeFOvzjZuIC7cQiXDeSIy1ym7a.jpg,"[28, 53]","[Action,Thriller]","[Action, Thriller]"
2,False,86674,パーク アンド ラブホテル,1.677,False,https://www.themoviedb.org/movie/86674,https://www.themoviedb.org/t/p/w500//8KAgoOwi3...,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,8KAgoOwi3KVtp8pwnmtsfoQSkEh.jpg,"[18, 10749]","[Drama,Romance]","[Drama, Romance]"
3,False,169298,Bullet,11.017,False,https://www.themoviedb.org/movie/169298,https://www.themoviedb.org/t/p/w500//oSYnKLSl1...,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,oSYnKLSl11aoZqJQIK9zoV63l3D.jpg,"[28, 80, 53]","[Action,Crime,Thriller]","[Action, Crime, Thriller]"
4,False,550654,Every Other Holiday,6.878,False,https://www.themoviedb.org/movie/550654,https://www.themoviedb.org/t/p/w500//8DetMslOB...,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,8DetMslOBOBKsT2cXBDBHP0ayVF.jpg,"[10770, 10751, 10749, 18]","[TV Movie,Family,Romance,Drama]","[TV Movie, Family, Romance, Drama]"


In [8]:
type(df['genre_id'].head().iloc[1])


list

In [9]:
df.head()

Unnamed: 0,adult,id,original_title,popularity,video,url,poster_url,Action,Adventure,Animation,...,Romance,Science Fiction,TV Movie,Thriller,War,Western,filename,genre_id,genre_ids2,genre_ids2_list
0,False,52826,Nothing Lasts Forever,5.658,False,https://www.themoviedb.org/movie/52826,https://www.themoviedb.org/t/p/w500//ph2L3Rp3X...,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,ph2L3Rp3XbMzxuLTTQBxvtNgF13.jpg,"[878, 35, 14]","[Science Fiction,Comedy,Fantasy]","[Science Fiction, Comedy, Fantasy]"
1,False,460059,Burn Out,32.045,False,https://www.themoviedb.org/movie/460059,https://www.themoviedb.org/t/p/w500//3LeFOvzjZ...,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,3LeFOvzjZuIC7cQiXDeSIy1ym7a.jpg,"[28, 53]","[Action,Thriller]","[Action, Thriller]"
2,False,86674,パーク アンド ラブホテル,1.677,False,https://www.themoviedb.org/movie/86674,https://www.themoviedb.org/t/p/w500//8KAgoOwi3...,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,8KAgoOwi3KVtp8pwnmtsfoQSkEh.jpg,"[18, 10749]","[Drama,Romance]","[Drama, Romance]"
3,False,169298,Bullet,11.017,False,https://www.themoviedb.org/movie/169298,https://www.themoviedb.org/t/p/w500//oSYnKLSl1...,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,oSYnKLSl11aoZqJQIK9zoV63l3D.jpg,"[28, 80, 53]","[Action,Crime,Thriller]","[Action, Crime, Thriller]"
4,False,550654,Every Other Holiday,6.878,False,https://www.themoviedb.org/movie/550654,https://www.themoviedb.org/t/p/w500//8DetMslOB...,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,8DetMslOBOBKsT2cXBDBHP0ayVF.jpg,"[10770, 10751, 10749, 18]","[TV Movie,Family,Romance,Drama]","[TV Movie, Family, Romance, Drama]"


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15015 entries, 0 to 15440
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   adult            15015 non-null  bool   
 1   id               15015 non-null  int64  
 2   original_title   15015 non-null  object 
 3   popularity       15015 non-null  float64
 4   video            15015 non-null  bool   
 5   url              15015 non-null  object 
 6   poster_url       15015 non-null  object 
 7   Action           15015 non-null  float64
 8   Adventure        15015 non-null  float64
 9   Animation        15015 non-null  float64
 10  Comedy           15015 non-null  float64
 11  Crime            15015 non-null  float64
 12  Documentary      15015 non-null  float64
 13  Drama            15015 non-null  float64
 14  Family           15015 non-null  float64
 15  Fantasy          15015 non-null  float64
 16  History          15015 non-null  float64
 17  Horror      

 Create ImageGenerators

# Simple Model

In [11]:
fname_model_densnet169 = MODEL_DIR + "densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5"
fname_model_vgg16 = MODEL_DIR + "vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5"

In [12]:
def model_create():
    # load model
    model = VGG16(include_top=False,  
                  input_shape=(299,299, 3),
                  weights=fname_model_vgg16)
    model_name = "VGG16"
    
    #model = DenseNet169(include_top=False,  
    #                    input_shape=(299,299, 3),
    #                    weights=fname_model_densnet169)

   
    #x= layers.Flatten()(model.layers[-1].output)
    x=layers.GlobalMaxPool2D()(model.layers[-1].output)
    
    #x= layers.Dense(128, activation='relu')(x)
    x= layers.Dense(1024, activation='relu')(x)
    x= layers.Dense(128, activation='relu')(x)
    
    #x= layers.Dropout(0.5)(x)
    #classifications = layers.Dense(len(train_generator.class_indices), activation='sigmoid')(x)
    classifications = layers.Dense(number_of_classes, activation='sigmoid')(x)
    
    model = Model(inputs=model.inputs, 
                  outputs=classifications,
                  name=model_name)

    #display(model.summary())
    return model, model_name

Finally, we implemented a standard DenseNet-169 architecture with similar modifications. The final
fully-connected layer of 1000 units was once again replaced by 3 sequential fully-connected layers of
3
1024, 128, and 7 units with ReLU, ReLU, and sigmoid activations respectively. The entire model
consists of 14,479,943 parameters, out of which, 14,321,543 were trainable.

In [13]:
!mkdir model_checkpoints

A subdirectory or file model_checkpoints already exists.


In [14]:
#tf.debugging.set_log_device_placement(True)
l_rtc_names = [            
    #"2-GPU_MirroredStrategy",
    #"2-GPU_CentralStorageStrategy",        
    #"1-GPU",    
    "56_CPU"
    #"2-GPU_MirroredStrategy_NCCL-All-Reduced",
]
l_rtc = [        
    #tf.distribute.MirroredStrategy().scope(),        
    #tf.distribute.experimental.CentralStorageStrategy().scope(),        
    #tf.device("/GPU:0"),       
    tf.device("/CPU:0"),
    #tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.NcclAllReduce()).scope(),                
]
for dp in [DP_TFDATA]:
    for i, runtime_context in enumerate(l_rtc):   
        print(f"Runtime Context: {l_rtc_names[i]}")

        # Start time measurement
        tic = time.perf_counter()

        # Create data pipeline
        print(dp)
        df_train, df_valid = skms.train_test_split(df, test_size=0.2, random_state=SEED)

        if dp == DP_IMGGEN:
            datagen = ImageDataGenerator(rescale=1 / 255.)#, validation_split=0.1)

            train_generator = datagen.flow_from_dataframe(
                dataframe=df_train,
                directory=IMAGES_DIR,
                x_col="filename",
                y_col="genre_id",
                batch_size=BATCH_SIZE,
                seed=SEED,
                shuffle=True,
                class_mode="categorical",    
                target_size=(299, 299),
                subset='training',
                validate_filenames=True
            )

            valid_generator = datagen.flow_from_dataframe(
                dataframe=df_valid,
                directory=IMAGES_DIR,
                x_col="filename",
                y_col="genre_id",
                batch_size=BATCH_SIZE,
                seed=SEED,
                shuffle=False,
                class_mode="categorical",
                target_size=(299, 299),
                subset='training',
                validate_filenames=True
            )
        else:
            X_train = df_train.filename.to_numpy()
            y_train = df_train[LABEL_COLS].to_numpy()
            X_valid = df_valid.filename.to_numpy()
            y_valid = df_valid[LABEL_COLS].to_numpy()

            train_generator = create_dataset(X_train, y_train, cache=True)
            valid_generator = create_dataset(X_valid, y_valid, cache=True)

            print(f"{len(X_train)} training datasets, using {y_train.shape[1]} classes")
            print(f"{len(X_valid)} validation datasets, unsing {y_valid.shape[1]} classes")

        if dp == DP_IMGGEN:
            # show class indicies
            print(train_generator.class_indices)
            print('length:', len(train_generator.class_indices))

            list(train_generator.class_indices.keys())

            #https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
            #In order to calculate the class weight do the following
            class_weights = class_weight.compute_class_weight('balanced',
                                                          np.array(list(train_generator.class_indices.keys()),dtype="int"), 
                                                          np.array(df.genre_id.explode(),dtype="int"))

            class_weights_genre_id = dict(zip(list(train_generator.class_indices), class_weights))
            display(class_weights_genre_id)
            class_weights = dict(zip(list(range(len(class_weights))), class_weights))
            print(class_weights)

            map_gender={"28":"Action",
            "12":"Adventure",
            "16":"Animation",
            "35":"Comedy",
            "80":"Crime",
            "99":"Documentary",
            "18":"Drama",
            "10751":"Family",
            "14":"Fantasy",
            "36":	"History",
            "27":"Horror",
            "10402"	:"Music",
            "9648":"Mystery",
            "10749":"Romance",
            "878"	:"Science Fiction",
            "10770":"TV Movie",
            "53":"Thriller",
            "10752":"War",
            "37":"Western"}

            series_genre_id_counts = df.genre_id.explode().value_counts()
            series_genre_id_counts
            df_genre = pd.DataFrame(series_genre_id_counts)
            df_genre["id"] = df_genre.index
            df_genre.rename(columns={"genre_id" : "count"},inplace=True)
            df_genre["name"] = df_genre["id"].apply(lambda x : map_gender[str(x)])
            df_genre["weight"] = df_genre["id"].apply(lambda x : class_weights_genre_id[x])
            df_genre.sort_values(by="count")

            number_of_classes = len(train_generator.class_indices)
        else:
            class_weights = None
            number_of_classes = len(LABEL_COLS)            

        # Create and train model
        with runtime_context:    
            model, model_name = model_create()

            # Define Tensorflow callback log-entry
            model_name_full = f"{model.name}_{l_rtc_names[i]}_{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}"
            tb_logdir = f"{TENSORBOARD_LOGDIR}{model_name_full}"
            #checkpoint_path = "model_checkpoints/saved-model-06-0.46.hdf5"
            #model.load_weights(checkpoint_path)

            # mark loaded layers as not trainable
            # except last layer
            leng = len(model.layers)
            print(leng)
            for i,layer in enumerate(model.layers):
                if leng-i == 5:
                  print("stopping at",i)
                  break
                layer.trainable = False

            # Def metrics
            threshold = 0.35
            f1_micro = tfa.metrics.F1Score(num_classes=19, average='micro', name='f1_micro',threshold=threshold), 
            f1_macro = tfa.metrics.F1Score(num_classes=19, average='macro', name='f1_macro',threshold=threshold)
            f1_weighted = tfa.metrics.F1Score(num_classes=19,  average='weighted', name='f1_score_weighted',threshold=threshold)

            # Compile model
            model.compile(
                optimizer='adam', 
                loss="binary_crossentropy", 
                metrics=["categorical_accuracy",
                         tf.keras.metrics.AUC(multi_label = True),#,label_weights=class_weights),
                         f1_micro,
                         f1_macro,
                         f1_weighted,
                        ])

            print("create callbacks")
            #filepath = "model_checkpoints/{model_name}_saved-model-{epoch:02d}-{val_f1_score_weighted:.2f}.hdf5"
            #cb_checkpoint = ModelCheckpoint(filepath, monitor='val_f1_score_weighted', verbose=1, save_best_only=True, mode='max')

            cb_tensorboard = TensorBoard(
                log_dir = tb_logdir,
                histogram_freq=0, 
                update_freq='epoch',
                write_graph=True, 
                write_images=False)
            #callbacks_list = [cb_checkpoint, cb_tensorboard]
            #callbacks_list = [cb_checkpoint]
            callbacks_list = [cb_tensorboard]

            # Train model
            print("model fit")
            history = model.fit(
                train_generator,
                validation_data=valid_generator,
                epochs=EPOCHS,
                # reduce steps per epochs for faster epochs
                #steps_per_epoch = math.ceil(266957 / BATCH_SIZE /8),
                #class_weight = class_weights,
                callbacks=callbacks_list,
                use_multiprocessing=False
            )

            print("Saving final model")
            #model.save(MODEL_DIR + model_name_full)

            print("Saving final model weights")
            #model.save_weights(MODEL_DIR + model_name_full + ".ckpt")

            # Measure time of loop
            toc = time.perf_counter()
            secs_all = toc - tic
            mins = int(secs_all / 60)
            secs = int((secs_all - mins*60))
            print(f"Time spend for current run: {secs_all:0.4f} seconds => {mins}m {secs}s")

            #print("Reset GUPs")
            #cuda.select_device(0)
            #cuda.reset()
            #device = cuda.get_current_device()
            #device.reset()

Runtime Context: 56_CPU
Data pipeline using tf.data
12012 training datasets, using 19 classes
3003 validation datasets, unsing 19 classes
23
stopping at 18
create callbacks
model fit
Epoch 1/2
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/2
Saving final model
Saving final model weights
Time spend for current run: 6275.7764 seconds => 104m 35s


In [15]:
y_pred = model.predict(valid_generator)

In [27]:
y_pred

array([[0.04333003, 0.06330013, 0.01017892, ..., 0.05667298, 0.10052919,
        0.03578316],
       [0.05460956, 0.0506499 , 0.00554478, ..., 0.04023005, 0.12983356,
        0.05958107],
       [0.03774279, 0.06244115, 0.14584701, ..., 0.03503251, 0.04352151,
        0.01338559],
       ...,
       [0.06977029, 0.04619539, 0.01952854, ..., 0.07278735, 0.07186844,
        0.03285542],
       [0.04890267, 0.11080675, 0.32461283, ..., 0.01426101, 0.03933338,
        0.05369532],
       [0.05003528, 0.13841233, 0.01661889, ..., 0.01224749, 0.04455657,
        0.12601565]], dtype=float32)

In [25]:
y_pred.min()

0.0012278266

In [17]:
y_true = [ [1 if i in e else 0 for i in range(19)] for e in valid_generator.labels]
y_true = np.array(y_true)

AttributeError: 'PrefetchDataset' object has no attribute 'labels'

In [None]:
from sklearn.metrics import f1_score

ths = np.linspace(0.1, 0.5, 10)

pd.DataFrame({
    'threshold': ths, 
    'f1-micro': [f1_score(y_true, (y_pred > th)*1., average="micro") for th in ths],
    'f1-weighted': [f1_score(y_true, (y_pred > th)*1., average="weighted") for th in ths],
    'class' : "all"
    }
)

In [None]:
from sklearn.metrics import f1_score

ths = np.linspace(0.1, 0.5, 9)

df_ths = pd.DataFrame({'threshold' : ths}
)

for cl in range(19):
    col = pd.DataFrame({f'f1-class_{cl}': [f1_score(y_true[:,cl], (y_pred[:,cl] > th)*1.) for th in ths]          
                       })
    df_ths=pd.concat([df_ths,col],axis="columns")

df_ths.style.highlight_max(color = 'lightgreen', axis = 0)
df_ths

In [None]:
argmax_index=df_ths.iloc[:,1:].idxmax(axis=0)
class_thresholds = df_ths.threshold[argmax_index].values
class_thresholds

In [None]:
f1_score(y_true, (y_pred > class_thresholds)*1., average="micro")

In [None]:
f1_score(y_true, (y_pred > class_thresholds)*1., average="weighted")

In [None]:
y_true = np.array(y_true)

In [None]:
y_true[:,3].shape

In [None]:
y_pred[:,3].shape

In [None]:
#df_genre.sort_values(by="count",ascending=False)[:7]

In [None]:
#valid_generator.class_indices.keys()

In [None]:
top_n_genre_ids = df_genre.sort_values(by="count",ascending=False)[:7]
display(top_n_genre_ids)
top_n_genre_col_pos = {i:map_gender[str(e)] for i,e in enumerate(list(valid_generator.class_indices.keys())) if e in top_n_genre_ids.values}
display(top_n_genre_col_pos)
#mask_top_n_genre_ids = [(e in top_n_genre_ids.values) for e in list(valid_generator.class_indices.keys())]
#mask_top_n_genre_ids

In [None]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes=19
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], 
                                  y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
lw = 2
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(15,15))
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'b', 'g', 'r', 'c', 'm', 'y', 'k'])

for i, color in zip(top_n_genre_col_pos.keys(), colors):#zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             #''.format(map_gender[str(i)], roc_auc[i]))
             ''.format(top_n_genre_col_pos[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

In [None]:
# f1 

In [None]:
df[df.filename=='wdju5uQUMy2jjeqdKroI6VklYbY.jpg']

In [None]:
!pip install -U tensorboard-plugin-profile