In [None]:
import os
import ast
import re
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import tensorflow_hub as hub
from textwrap import wrap
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from hentai import Utils, Hentai, Option
from pathlib import Path

## Data Collection

We will be using the doujin dataset obtained from nhentai.

In [None]:
N_SAMPLE = 0 # Number of random galleries to fetch and add to data/metadata.csv; 0 fetches nothing.

## Download dataset

In [None]:
# Fetch N_SAMPLE random galleries and collect their metadata, one row per gallery.
hentais = [Utils.get_random_hentai() for _ in range(N_SAMPLE)]
hentais = pd.DataFrame([hentai.dictionary(Option.all()) for hentai in hentais])

# Skip the write when nothing was fetched: an empty frame has no columns, so
# writing it would create a metadata file without the expected header.
if not hentais.empty:
    metadata_path = Path("data") / "metadata.csv"
    # The data directory may not exist yet on a fresh checkout.
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    is_new_file = not metadata_path.is_file()
    # Write the header only when creating the file; later runs append rows below it.
    hentais.to_csv(
        metadata_path,
        index=False,
        mode="w" if is_new_file else "a",
        header=is_new_file,
    )
print("Number of resampled hentais: ", len(hentais))


## Read dataset file

In [None]:
# These columns are parsed with ast.literal_eval when the CSV is read
# (presumably they hold list literals; "tag" and "image_urls" are iterated later).
literal_columns = ("tag", "group", "parody", "character", "artist", "category", "image_urls")
converters = dict.fromkeys(literal_columns, ast.literal_eval)

hentais = pd.read_csv("data/metadata.csv", converters=converters)
hentais

## Download images

In [None]:
# Download the images of every gallery that does not have a local folder yet.
for _, row in hentais.iterrows():
    gallery_dir = Path("data", str(row.id))
    if gallery_dir.is_dir():
        # A folder for this gallery already exists; do not fetch it again.
        continue
    Hentai(row.id).download(gallery_dir, progressbar=True)

## Data preparation

In [None]:
# Count how often each tag occurs across all galleries and keep the 50 most frequent.
tag_counts = hentais["tag"].explode().value_counts()
label_freq = tag_counts.sort_values(ascending=False).head(50)

# Horizontal bar chart: one bar per tag, ordered as in label_freq.
style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(12, 20))
sns.barplot(x=label_freq, y=label_freq.index.values, order=label_freq.index, ax=ax)
ax.set_title("Label frequency", fontsize=14)
ax.set_xlabel("")
ax.tick_params(axis="both", labelsize=12)
plt.show()

## Data sparsity

In [None]:

nobjs = 2  # Number of panels to draw
ncols = 2  # Number of columns in the panel grid
nrows = nobjs // ncols  # Number of rows in the panel grid

# One line plot per metadata column, laid out side by side.
fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4 * nrows), squeeze=False)
panels = (("num_favorites", "Favorites"), ("num_pages", "Pages"))
for ax, (column, title) in zip(axes.flat, panels):
    hentais[column].plot(ax=ax, title=title)
plt.show()

## Train / val split

We need to turn each image URL into its local path, relative to the current working directory, so that the training and validation images can be located on disk.


In [None]:
# Matches the remote gallery prefix of an image URL; the dots are escaped so
# they only match literal dots.
image_url_re = re.compile(r"https://i\.nhentai\.net/galleries/\d+")


def _local_image_path(gallery_id, image_url):
    """Replace the remote gallery prefix of image_url with the local data/<gallery_id> directory."""
    local_dir = os.path.join("data", str(gallery_id))
    # A callable replacement is inserted verbatim. A plain replacement string has
    # its backslashes processed as escapes, which breaks with Windows separators.
    return image_url_re.sub(lambda _match: local_dir, image_url)


# One list of local image paths per gallery, aligned with the rows of hentais.
filenames = hentais.apply(
    lambda row: [_local_image_path(row["id"], url) for url in row["image_urls"]],
    axis=1,
).rename("filenames")
labels = hentais["tag"].rename("labels")
pd.concat([filenames, labels], axis=1)

Splitting the modeling data into training and validation is common in machine learning practice.
We will be allocating 80% of the galleries (together with all of their images) for training and 20% for validation.
Usually, we keep a final test set to communicate performance results but we will not really need it in this notebook.

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split_options = {"test_size": 0.2, "random_state": 44}
X_train, X_val, y_train, y_val = train_test_split(filenames, labels, **split_options)

n_train, n_val = len(X_train), len(X_val)
print("Number of posters for training: ", n_train)
print("Number of posters for validation: ", n_val)

## Image examples

In [None]:

nobjs = 8  # Maximum number of images to display
ncols = 4  # Number of columns in display
nrows = nobjs // ncols  # Number of rows in display

# Flatten the per-gallery path lists and draw a random handful of image paths.
samples = X_train.explode().sample(nrows * ncols)

# Show one sampled image per grid cell.
fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4 * nrows), squeeze=False)
for ax, img in zip(axes.flat, samples):
    ax.imshow(plt.imread(img, format="jpeg"))

## Label encoding

The original targets are lists of strings that can be easily understood by humans.
But, if we want to build and train a neural network we need to create binary labels (multi-hot encoding).
This is critical for multi-label classification.

In order to binarize our labels, we will be using scikit-learn's MultiLabelBinarizer.

In [None]:
# Learn the tag vocabulary from the tags of every gallery in the dataset.
mlb = MultiLabelBinarizer().fit(hentais["tag"])

# Number of distinct tags; used later as the size of the model's output layer.
nlabels = len(mlb.classes_)

# Display the learned vocabulary.
pd.DataFrame(mlb.classes_, columns=["labels"])

In [None]:
# Encode the tag lists of the training and validation splits with the fitted binarizer.
y_train_bin, y_val_bin = (mlb.transform(tags) for tags in (y_train, y_val))

Let's check that everything worked correctly (we should obtain binary targets instead of lists of strings).

In [None]:
# Pair the image paths of each training gallery with its binarized label row.
train_preview = list(zip(X_train, y_train_bin))
pd.DataFrame(train_preview, columns=["filename", "labels"])

## Tensorflow DataSet

In [None]:
IMG_FRAMES = 15 # Number of images per sequence; not referenced by the code visible here (only hinted at by the commented-out sequence layers) - TODO confirm it is still needed
IMG_SIZE = 224 # Height and width, in pixels, that parse_function resizes every image to
CHANNELS = 3 # Number of color channels requested when decoding an image (3 = RGB)

In [None]:
def parse_function(filename, label):
    """Load one image from disk and return it, preprocessed, together with its label.

    The image is decoded with CHANNELS color channels, resized to
    IMG_SIZE x IMG_SIZE and divided by 255.

    Args:
        filename: string representing path to image
        label: label value for the image; returned unchanged
    Returns:
        Tuple of (preprocessed image tensor, label).
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    target_shape = [IMG_SIZE, IMG_SIZE]
    resized = tf.image.resize(pixels, target_shape)
    # Scale pixel values from the [0, 255] range down to [0.0, 1.0].
    return resized / 255.0, label

In [None]:
BATCH_SIZE = 256 # Number of examples per batch produced by create_dataset
AUTOTUNE = tf.data.experimental.AUTOTUNE # Let tf.data choose the parallel-map and prefetch settings dynamically
SHUFFLE_BUFFER_SIZE = 1024 # Size of the shuffle buffer applied to the training data in create_dataset

In [None]:
def create_dataset(filenames, labels, is_training=True):
    """Build a batched, prefetched tf.data pipeline of (image, label) pairs.

    One image is used per gallery: the second page when the gallery has at
    least two pages, otherwise its only page.

    Args:
        filenames: pandas Series in which each element is the list of image
            paths of one gallery
        labels: 0/1 array with one row per gallery, aligned with filenames
        is_training: when truthy, cache the parsed images in memory and
            shuffle them before batching
    Returns:
        A tf.data.Dataset yielding batches of up to BATCH_SIZE parsed images
        with their label rows.
    Raises:
        IndexError: if a gallery has an empty list of image paths.
    """
    # Fall back to the first page so single-page galleries do not raise IndexError.
    image_paths = filenames.apply(
        lambda pages: pages[1] if len(pages) > 1 else pages[0]
    )
    # Create a first dataset of file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_function, num_parallel_calls=AUTOTUNE)

    if is_training:
        # Cache after parsing so images are read and decoded only once.
        dataset = dataset.cache()
        # Shuffle within a buffer of SHUFFLE_BUFFER_SIZE examples.
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)

    # Group examples into batches of BATCH_SIZE.
    dataset = dataset.batch(BATCH_SIZE)
    # Fetch batches in the background while the model is training.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:

# Training pipeline: create_dataset caches and shuffles the parsed images.
train_ds = create_dataset(X_train, y_train_bin)
# Validation pipeline: evaluation does not need shuffling, so turn training mode off.
val_ds = create_dataset(X_val, y_val_bin, is_training=False)

## Transfer learning feature extractor

In [None]:
# ImageNet-pretrained InceptionV3 without its classification head; global max
# pooling reduces the final feature maps to one feature vector per image.
# NOTE(review): Keras documents inception_v3.preprocess_input as scaling pixels
# to [-1, 1], while parse_function scales them to [0, 1] - confirm this is intended.
cnn_model = keras.applications.InceptionV3(
    weights="imagenet",
    include_top=False,
    pooling="max",
)
cnn_model.summary()

In [None]:
# Classifier: pretrained CNN features followed by a fully connected head with
# dropout, ending in one output unit per tag.
model = tf.keras.Sequential([
    layers.InputLayer(input_shape=(IMG_SIZE, IMG_SIZE, CHANNELS)),
    cnn_model,
    # Sequence variant (not active):
    # layers.TimeDistributed(cnn_model),
    # layers.GRU(64),
    layers.Dense(1024, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(512, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(128, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(64, activation="relu"),
    # The targets are multi-hot (several tags can be active at once), so each tag
    # gets an independent sigmoid score. Softmax would force the scores to sum
    # to 1, which only fits single-label targets. This output should be paired
    # with a binary cross-entropy loss when compiling.
    layers.Dense(nlabels, activation="sigmoid")
])
model.summary()

## Train the model
Specify the learning rate and the number of training epochs (number of loops over the whole dataset).

In [None]:
LR = 1e-5 # Learning rate passed to the Adam optimizer; kept small for transfer learning
EPOCHS = 30 # Number of passes over the training dataset

In [None]:
# The targets are multi-hot rows (several tags can be active per example), so
# every tag is treated as its own binary problem: binary cross-entropy is the
# matching loss, whereas categorical cross-entropy assumes one class per example.
# binary_accuracy is requested by name because the generic "accuracy" alias may
# resolve to categorical (arg-max) accuracy, which is not meaningful for
# multi-hot targets.
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    metrics=["binary_accuracy"],
)

In [None]:
# Train for EPOCHS epochs on the training pipeline, validating on val_ds.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)

In [None]:
# Plot the training loss recorded at the end of every epoch.
training_loss = history.history["loss"]
fig, ax = plt.subplots()
ax.plot(training_loss)
plt.show()