In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from Tokenizer import Tokenizer
from ImageCaption import BaseModel

# Dataset

In [2]:
# Load the preprocessed table and keep only the two image-id columns plus the
# cleaned caption text, exposing the latter under the shorter name "Caption".
df = (
    pd.read_csv("data/processed.csv")
    .loc[:, ["Image1", "Image2", "Clean_capt"]]
    .rename(columns={"Clean_capt": "Caption"})
)
print(df.shape)
df.head()

(4017, 3)


Unnamed: 0,Image1,Image2,Caption
0,CXR162_IM-0401-1001,CXR162_IM-0401-2001,normal chest
1,CXR1390_IM-0249-1001,CXR1390_IM-0249-2001,no evidence active disease
2,CXR604_IM-2193-1001,CXR604_IM-2193-2001,no evidence active disease
3,CXR2699_IM-1167-1001,CXR2699_IM-1167-2001,no acute cardiopulmonary disease
4,CXR2841_IM-1253-2001,CXR2841_IM-1253-2001,no acute cardiopulmonary disease


## Tokenizer

In [3]:
# Fit the vocabulary on every whitespace-separated word of every caption.
# The words are flattened in a single pass; the earlier reduce(a + b) fold
# produced the same list but re-copied the growing accumulator on every step
# (quadratic in corpus size) and raised TypeError on an empty caption column.
tokenizer = Tokenizer()
tokenizer.fit([word for words in df.Caption.str.split() for word in words])

# Encode each caption with the fitted tokenizer.
# NOTE(review): Tokenizer is project code; the sequences displayed further down
# suggest transform() returns a list of ids with start/end markers — confirm.
cap = df.Caption.apply(lambda text: tokenizer.transform(text.split()))

# Number of captions per encoded length, ordered by length.
cap_len = cap.apply(len).value_counts().sort_index()
cap_len

3       10
4      109
5      469
6     1443
7      292
      ... 
65       1
66       1
86       1
89       1
90       2
Name: Caption, Length: 63, dtype: int64

In [4]:
# Fixed sequence length (in token ids) that captions are padded/trimmed to.
LENGTH = 64

# Length buckets strictly above the cap, i.e. captions that would lose tokens.
seq_trim = cap_len.loc[cap_len.index > LENGTH]
n_trimmed = seq_trim.sum()
pct_trimmed = n_trimmed * 100 / len(df)

print("If trimmed by length %d, %d captions will be trimmed" % (LENGTH, n_trimmed))
print("If trimmed by length %d, %.2f%% of captions will be trimmed" % (LENGTH, pct_trimmed))

If trimmed by length 64, 6 captions will be trimmed
If trimmed by length 64, 0.15% of captions will be trimmed


In [5]:
def _fit_to_length(seq):
    """Return `seq` right-padded with id 1 up to LENGTH, or cut to its first LENGTH ids."""
    missing = LENGTH - len(seq)
    if missing > 0:
        # NOTE(review): in the displayed sequences id 1 also directly follows the
        # last word, so it presumably doubles as the end/pad id — confirm in Tokenizer.
        return seq + [1] * missing
    # Sequences at or above LENGTH are cut; whatever id sat at the tail is dropped.
    return seq[:LENGTH]


# Store the fixed-length id sequences alongside the raw caption text.
df["CaptionSeq"] = cap.apply(_fit_to_length)
df.head()

Unnamed: 0,Image1,Image2,Caption,CaptionSeq
0,CXR162_IM-0401-1001,CXR162_IM-0401-2001,normal chest,"[0, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,CXR1390_IM-0249-1001,CXR1390_IM-0249-2001,no evidence active disease,"[0, 4, 5, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,CXR604_IM-2193-1001,CXR604_IM-2193-2001,no evidence active disease,"[0, 4, 5, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,CXR2699_IM-1167-1001,CXR2699_IM-1167-2001,no acute cardiopulmonary disease,"[0, 4, 8, 9, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,CXR2841_IM-1253-2001,CXR2841_IM-1253-2001,no acute cardiopulmonary disease,"[0, 4, 8, 9, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Model

In [7]:
# Seed the NumPy and PyTorch global RNGs before the model is built so that
# anything drawing from them (e.g. weight initialisation, shuffling) repeats
# across Restart & Run All.
# NOTE(review): BaseModel is project code; if it uses other randomness sources
# (Python's `random`, an sklearn `random_state`, non-deterministic CUDA kernels)
# those need to be pinned there as well — verify.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the captioning model from the caption table, the image directory, the
# fixed sequence length and the tokenizer's vocabulary size.
model = BaseModel(df, "data/Images", LENGTH, tokenizer.vocab_size)

# Here train() runs the training loop and logs per-epoch train/validation loss
# (see the output below).
model.train()

Epoch 0 Train Loss 3.9115920618398867 Validation Loss 1.8696891209658455
Epoch 1 Train Loss 1.390457800964811 Validation Loss 1.15913185301949
Epoch 2 Train Loss 1.0212021280283952 Validation Loss 0.9521868322409835
Epoch 3 Train Loss 0.8608942331366278 Validation Loss 0.8162363037174823
Epoch 4 Train Loss 0.7433702782611942 Validation Loss 0.7225157945763832
Epoch 5 Train Loss 0.6524087444170198 Validation Loss 0.6373912937500897
Epoch 6 Train Loss 0.5792106266935073 Validation Loss 0.5662876706497342
Epoch 7 Train Loss 0.5163621846864472 Validation Loss 0.5095310567640791
Epoch 8 Train Loss 0.4604043906452644 Validation Loss 0.4594923102972554
Epoch 9 Train Loss 0.41096538920604175 Validation Loss 0.4148065532539405
