In [None]:
import ast
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from hentai import Utils, Hentai, Option
from pathlib import Path

## Data Collection

We will be using the doujin dataset obtained from nhentai.

In [None]:
N_SAMPLE = 0 # Number of random galleries to fetch and add to the metadata file (0 fetches nothing).

## Download dataset

In [None]:
# Fetch N_SAMPLE random galleries and keep each one's metadata as a dictionary.
# The conversion is done per gallery: DataFrame.apply hands whole columns
# (Series) to its callback, and a Series has no `dictionary` method.
samples_df = pd.DataFrame(
    [Utils.get_random_hentai().dictionary(Option.all()) for _ in range(N_SAMPLE)]
)

data_path = Path("data")
metadata_path = data_path / "metadata.csv"
# Make sure the target directory exists before anything is written into it.
data_path.mkdir(parents=True, exist_ok=True)

# Only touch the metadata file when something was actually fetched, so an
# empty run leaves an existing file exactly as it was.
if not samples_df.empty:
    if metadata_path.is_file():
        # Append to the existing file without repeating the header row.
        samples_df.to_csv(metadata_path, index=False, mode="a", header=False)
    else:
        # First write: include the column names as the header row.
        samples_df.to_csv(metadata_path, index=False, header=True)
print("Number of resampled samples: ", len(samples_df))


## Read dataset file

In [None]:
# Columns that are parsed with ast.literal_eval while the CSV is read, so they
# come back as Python objects instead of their string representation.
literal_columns = ["tag", "group", "parody", "character", "artist", "category", "image_urls"]
converters = dict.fromkeys(literal_columns, ast.literal_eval)
hentais_df = pd.read_csv(metadata_path, converters=converters)
hentais_df

## Download images

In [None]:
# Download the pages of every gallery that has no folder under data_path yet.
for _, row in hentais_df.iterrows():
    gallery_dir = data_path / str(row.id)
    if gallery_dir.is_dir():
        # An existing folder is taken to mean the gallery was already fetched.
        continue
    Hentai(row.id).download(gallery_dir, progressbar=True)

## Data preparation

In [None]:
# Count how often each tag occurs over all rows and keep the 50 most frequent.
tag_counts = hentais_df["tag"].explode().value_counts()
label_freq = tag_counts.sort_values(ascending=False).head(50)

# Horizontal bar chart: one bar per tag, ordered as in label_freq.
style.use("fivethirtyeight")
plt.figure(figsize=(12, 20))
sns.barplot(x=label_freq, y=label_freq.index.values, order=label_freq.index)
plt.title("Label frequency", fontsize=14)
plt.xlabel("")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

## Data sparsity

In [None]:

nobjs = 2 # Number of histograms to draw
ncols = 2 # Panels per row
nrows = nobjs // ncols # Rows needed to fit nobjs panels
plt.figure(figsize=(14, 4 * nrows))
# One 100-bin histogram per column, placed left to right on the grid.
panels = [("num_favorites", "Favorites"), ("num_pages", "Pages")]
for position, (column, title) in enumerate(panels, start=1):
    hentais_df[column].plot.hist(ax=plt.subplot(nrows, ncols, position), bins=100, title=title)
plt.show()

## Train / val split

For every image we build its path relative to the data directory (`<gallery id>/<image file name>`), so that the training and validation images can be located on disk.

In [None]:
# One row per image: explode gives every element of "image_urls" its own row.
hentais_df = hentais_df.explode("image_urls").reset_index()


def _relative_image_path(row):
    """Return "<gallery id>/<image file name>" for one exploded row."""
    return str(Path(str(row["id"])) / Path(row["image_urls"]).name)


filenames_df = hentais_df.apply(_relative_image_path, axis=1).rename("filename")
labels_df = hentais_df["tag"].rename("labels")
# Keep only two columns: the relative image path and the tags of its gallery.
hentais_df = pd.concat([filenames_df, labels_df], axis=1)
hentais_df

Splitting the modeling data into training and validation is common in machine learning practice.
We will be allocating 80% of the images for training and 20% for validation.
Usually, we keep a final test set to communicate performance results but we will not really need it in this notebook.

In [None]:
# 80/20 split with a fixed seed so the partition is the same on every run.
# NOTE(review): rows are individual images at this point, so pages of the same
# gallery can end up in both splits — confirm this is acceptable.
train_df, val_df = train_test_split(
    hentais_df,
    test_size=0.2,
    random_state=44,
)
print("Number of hentais for training: ", len(train_df))
print("Number of hentais for validation: ", len(val_df))

## Image examples

In [None]:

nobjs = 8 # Number of images to show
ncols = 4 # Images per row
nrows = nobjs // ncols # Rows in the grid
# Turn the relative file names into paths under data_path, then pick a random
# handful of them to display.
image_paths = hentais_df["filename"].explode().apply(lambda name: str(data_path / name))
samples = image_paths.sample(nrows * ncols)
plt.figure(figsize=(14, 4 * nrows))
for position, image_file in enumerate(samples, start=1):
    ax = plt.subplot(nrows, ncols, position)
    ax.imshow(plt.imread(image_file, format="jpeg"))

## Label encoding

The original targets are lists of strings that can be easily understood by humans.
But, if we want to build and train a neural network we need to create binary labels (multi-hot encoding).
This is critical for multi-label classification.

In order to binarize our labels, we will be using scikit-learn's MultiLabelBinarizer.

In [None]:
# Fit the multi-label binarizer on the training set
# Learn the label vocabulary from every row of hentais_df (training and
# validation rows alike), so each tag seen anywhere gets its own column.
mlb = MultiLabelBinarizer().fit(hentais_df["labels"])

# Size of the vocabulary; reused as the width of the model's output layer.
nlabels = len(mlb.classes_)

# Show the vocabulary as a table.
pd.DataFrame({"labels": mlb.classes_})

In [None]:
# transform the targets of the training and test sets
# Encode the tag lists of the training and validation splits as multi-hot rows
# using the fitted binarizer.
y_train_bin, y_val_bin = (
    mlb.transform(split_df["labels"]) for split_df in (train_df, val_df)
)

Let's check that everything worked correctly (we should obtain binary targets instead of lists of strings).

In [None]:
# Print example of hentai tags and their binary targets
# Pair each training file name with its encoded target for a visual check.
encoded_preview = zip(train_df["filename"], y_train_bin)
pd.DataFrame(encoded_preview, columns=["filename", "labels"])

## Tensorflow DataSet

In [None]:
# The ImageNet weights of InceptionV3 expect inputs scaled the way
# inception_v3.preprocess_input does it; without a preprocessing function the
# generator would yield raw 0-255 pixel values.
train_gen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=keras.applications.inception_v3.preprocess_input
)

In [None]:
BATCH_SIZE = 256 # Images per batch; big enough to measure an F1-score
IMG_SIZE = 224 # Height and width (in pixels) images are resized to, matching the model's input shape

In [None]:
# Arguments shared by both iterators. `classes` pins the label vocabulary and
# its column order to the fitted binarizer, so both splits produce targets of
# width nlabels even when a tag occurs in only one of them.
flow_kwargs = dict(
    directory=str(data_path),
    x_col="filename",
    y_col="labels",
    class_mode="categorical",  # yields multi-hot targets when y_col holds lists
    classes=mlb.classes_.tolist(),
    batch_size=BATCH_SIZE,
    target_size=(IMG_SIZE, IMG_SIZE),
    seed=44,
)

# Each iterator reads its own split. The generator is created without a
# validation_split, so selecting subset="training"/"validation" on the full
# frame would give the training iterator every image and the validation
# iterator none.
train_ds = train_gen.flow_from_dataframe(dataframe=train_df, shuffle=True, **flow_kwargs)
# Validation order does not need to be random; keep it stable across epochs.
val_ds = train_gen.flow_from_dataframe(dataframe=val_df, shuffle=False, **flow_kwargs)

## Transfer learning feature extractor

In [None]:
# Pre-trained InceptionV3 without its classification head; global max pooling
# is applied to the last convolutional output.
cnn_model = keras.applications.InceptionV3(
    weights="imagenet",
    include_top=False,
    pooling="max",
)
cnn_model.summary()

In [None]:
# Classification head stacked on top of the pre-trained feature extractor.
model = Sequential([
    layers.InputLayer(input_shape=(IMG_SIZE, IMG_SIZE, 3)),
    cnn_model,
    layers.Dense(1024, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(512, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(128, activation="relu"),
    layers.Dropout(.5),
    layers.Dense(64, activation="relu"),
    # Multi-label output: every tag gets its own independent probability.
    # A softmax would force the probabilities to sum to 1 and make tags
    # compete, although one image can carry several tags at once.
    # NOTE(review): a sigmoid output is meant to be trained with a per-label
    # binary cross-entropy loss — check the loss passed to model.compile.
    layers.Dense(nlabels, activation="sigmoid")
])
model.summary()

## Train the model
Specify the learning rate and the number of training epochs (number of loops over the whole dataset).

In [None]:
LR = 1e-5 # Learning rate for the Adam optimizer; keep it small when transfer learning
EPOCHS = 30 # Number of training epochs passed to model.fit

In [None]:
# The targets are multi-hot (several tags can be active for one image), so
# each label is scored as its own yes/no problem with binary cross-entropy
# instead of the single-choice categorical cross-entropy.
# NOTE(review): this loss is normally paired with a sigmoid output layer —
# check the activation of the model's last Dense layer.
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    metrics=["accuracy"],
)

In [None]:
# Train for EPOCHS epochs on train_ds, evaluating on val_ds, and keep the
# returned history object for plotting.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)

In [None]:
# Training loss per epoch, plus the validation loss when one was recorded.
plt.plot(history.history["loss"], label="training")
if "val_loss" in history.history:
    plt.plot(history.history["val_loss"], label="validation")
plt.title("Loss per epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()