In [1]:
import numpy as np
import os
import joblib
import torch
from sklearn import preprocessing
from word2vec_util import TextCorpusProcess
from skip_gram_neg import SkipGramNeg
import time
import copy
from torch import Tensor, nn
import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch.optim import lr_scheduler
import os
import datetime

In [2]:
# Extract the 'label' / 'desc' fields from the prepared train/val pickles and
# re-save each one as its own small file, so later cells can load just these
# fields instead of the whole prepared pickle.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point it at
# files you trust.
DATA_DIR = '/notebooks/data/'

# save train label data for easy load
train_pklname = os.path.join(DATA_DIR, 'img_prepared_train.pkl')
train_data = joblib.load(train_pklname)
train_text_label = train_data['label']
joblib.dump(train_text_label, os.path.join(DATA_DIR, 'train_txt_labels.pkl'))

# save val label data for easy load
val_pklname = os.path.join(DATA_DIR, 'img_prepared_val.pkl')
val_data = joblib.load(val_pklname)
val_text_label = val_data['label']
joblib.dump(val_text_label, os.path.join(DATA_DIR, 'val_txt_labels.pkl'))

# save val desc data for easy load
# (kept as the last expression so the cell still displays the written path)
val_text_descs = val_data['desc']
joblib.dump(val_text_descs, os.path.join(DATA_DIR, 'val_txt_descs.pkl'))

['/notebooks/data/val_txt_descs.pkl']

In [3]:
import nltk

# Fetch the NLTK corpora used by this notebook.
# NOTE(review): presumably required by the text preprocessing in
# word2vec_util.TextCorpusProcess -- confirm against that module.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
# last download stays a bare expression so the cell output is unchanged
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [4]:
# Locations of the artifacts cached by earlier steps.
train_pklname = os.path.join('/notebooks/data', 'train_txt_labels.pkl')
val_pklname = os.path.join('/notebooks/data', 'val_txt_labels.pkl')
text_path = os.path.join('/notebooks/data', 'text_processed.pkl')

# Class labels for both splits.
train_txt_labels = joblib.load(train_pklname)
val_txt_labels = joblib.load(val_pklname)

# Processed training corpus; its per-description word-index lists are the
# training inputs.
traintextCorpus = joblib.load(text_path)
train_descs_idx = traintextCorpus.prod_word_idx_lists

# Raw validation descriptions are run through TextCorpusProcess here; only
# the resulting word lists are used below.
val_text_descs = joblib.load(
    os.path.join('/notebooks/data/', 'val_txt_descs.pkl'))
valtextCorpus = TextCorpusProcess(prod_descs=val_text_descs)
val_descs = valtextCorpus.prod_word_lists

# load the word2vec model
# hyperparams
# NOTE(review): presumably these must match the values the checkpoint was
# trained with, otherwise load_state_dict will reject the weights -- confirm.
EMBED_DIM = 128
BATCH_SIZE = 16
NEG_SAMPLE_SIZE = 5

# instantiating the model
model = SkipGramNeg(
    vocab_size=traintextCorpus.vocab_size,
    embed_dim=EMBED_DIM,
    neg_sample_size=NEG_SAMPLE_SIZE,
    batch_size=BATCH_SIZE,
)
# load saved word2vec model
# (the file name really is spelled 'ebedding' -- it must match the file on disk)
embedding_file = os.path.join(
    '/notebooks/models', 'best_word2vec_ebedding_model.pt')
# map_location='cpu': the model is never moved off the CPU in this cell, and
# without it a checkpoint saved from a GPU fails to load on a CPU-only host.
model.load_state_dict(torch.load(embedding_file, map_location='cpu'))

# Only the input embedding table is used from here on; freeze it so the
# lookups below do not track gradients.
embeddings = model.in_embed
embeddings.requires_grad_(False)

def _embed_descriptions(embedding_layer, descs_idx):
    """Turn word-index lists into flat 1-D embedding tensors.

    For each description the word vectors are looked up and laid end to end,
    giving one tensor of length ``n_words * embed_dim`` per description.

    Args:
        embedding_layer: callable mapping a 1-D long tensor of word indices
            to a 2-D tensor with one row per word (assumed to behave like
            ``nn.Embedding`` -- TODO confirm for ``SkipGramNeg.in_embed``).
        descs_idx: iterable of word-index sequences, one per description.

    Returns:
        list of 1-D tensors, in the same order as ``descs_idx``.
    """
    flat_embs = []
    for idxs in descs_idx:
        words_idx = torch.as_tensor(idxs, dtype=torch.long)
        # reshape(-1) concatenates the rows in order. Unlike squeeze() +
        # concat over rows, it also works for a one-word description
        # (squeeze() collapsed that case to 1-D and the concat then failed).
        flat_embs.append(embedding_layer(words_idx).reshape(-1))
    return flat_embs


# prepare embedded train data
train_words_emb = _embed_descriptions(embeddings, train_descs_idx)

# map val words to training-vocabulary indices
# ignores words not in training data
val_descs_idx = []
for desc in val_descs:
    desc_idxs = []
    for word in desc:
        word_idx = traintextCorpus.word_to_idx.get(word)
        if word_idx is not None:
            desc_idxs.append(word_idx)
    # a description with no known word cannot be embedded at all
    if not desc_idxs:
        raise ValueError('Empty description!')
    val_descs_idx.append(desc_idxs)

# prepare embedded val data
val_words_emb = _embed_descriptions(embeddings, val_descs_idx)

# Ordinal-encode the class labels. The encoder is fitted on the training
# labels only and then reused for validation, so both splits share one
# label -> number mapping.
target_enc = preprocessing.OrdinalEncoder()

# train targets: column vector in, flat 1-D array out
label_train = np.array(train_txt_labels).reshape(-1, 1)
target_train = target_enc.fit(label_train).transform(label_train).ravel()

# val targets, encoded with the already-fitted encoder
label_val = np.array(val_txt_labels).reshape(-1, 1)
target_val = target_enc.transform(label_val).ravel()

In [5]:
class TextEmbedDataset(torch.utils.data.Dataset):
    """Dataset of flattened word-embedding sequences, zero-padded to a fixed length.

    Every item is padded on both sides so that its length is
    ``max_word_len * embd_dim``; when the padding is odd the extra zero goes
    on the left.
    """

    def __init__(self, inputs, labels, max_word_len, embd_dim):
        # inputs: one flat 1-D embedding tensor per sample
        self.inputs = inputs
        # labels: ordinal encoded classes, indexable like inputs
        self.labels = labels
        self.max_word_len = max_word_len
        self.embd_dim = embd_dim

    def __len__(self):
        """Number of samples (taken from the labels)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(padded_embedding, label)`` for the sample at ``index``."""
        flat_words = self.inputs[index]
        target_len = self.max_word_len * self.embd_dim
        slack = target_len - len(flat_words)
        # a sample longer than the configured maximum cannot be padded
        assert(slack >= 0)
        right = slack // 2
        left = slack - right
        padded = torch.concat(
            [torch.zeros(left), flat_words, torch.zeros(right)])
        return padded, self.labels[index]

def train_model(model, criterion, optimizer, scheduler, writer, num_epochs=50, save_path=None):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs a 'train' phase followed by a 'val' phase. Loss and
    accuracy of every phase are printed and logged to ``writer``. Whenever
    validation accuracy improves, the weights are snapshotted (and written
    to disk if ``save_path`` is given).

    Note: this function reads three module-level globals that must exist
    before it is called: ``dataloaders`` and ``dataset_sizes`` (dicts keyed
    by 'train' / 'val') and ``device``.

    Args:
        model: network to train; called on inputs with an extra channel
            axis inserted at dim 1.
        criterion: loss called as ``criterion(outputs, labels)`` with
            integer (long) labels.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: LR scheduler, stepped once per epoch after training.
        writer: object with ``add_scalar(tag, value, step)`` and ``flush()``.
        num_epochs: number of epochs to run.
        save_path: directory for 'best_text_cnn_model.pt'; ``None`` keeps
            the best weights in memory only.

    Returns:
        ``model`` with the best-validation-accuracy weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(False) is equivalent to eval()
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                # add the channel axis the model's first layer expects
                inputs = inputs.unsqueeze(1).to(device)
                # the loss needs integer class ids
                labels = labels.to(device=device, dtype=torch.long)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics (loss is a batch mean, so re-weight by batch size)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            writer.add_scalar(f'Loss/{phase}', epoch_loss, epoch)
            writer.add_scalar(f'Accuracy/{phase}', epoch_acc, epoch)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # save_path defaults to None; joining None with a file name
                # would raise, so only persist when a directory was given
                if save_path is not None:
                    torch.save(best_model_wts, os.path.join(
                        save_path, 'best_text_cnn_model.pt'))

        print()
        writer.flush()

    time_elapsed = time.time() - since
    print(
        f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [6]:
# Build the train / val datasets.

# Longest description (in words) in each split; the larger of the two fixes
# the padded length so that both splits yield tensors of the same size.
max_train_desc_len = max(len(idxs) for idxs in train_descs_idx)
max_val_desc_len = max(len(idxs) for idxs in val_descs_idx)
max_desc_len = max(max_train_desc_len, max_val_desc_len)

trainEmbedDataset = TextEmbedDataset(
    train_words_emb, target_train, max_desc_len, embd_dim=EMBED_DIM)
valEmbedDataset = TextEmbedDataset(
    val_words_emb, target_val, max_desc_len, embd_dim=EMBED_DIM)

# keyed by phase name, as train_model expects
text_datasets = {'train': trainEmbedDataset, 'val': valEmbedDataset}
dataset_sizes = {
    split: len(dataset) for split, dataset in text_datasets.items()}

In [7]:
# Output locations: TensorBoard event files and saved model weights.
log_dir_path = os.path.join('/notebooks', 'runs')
save_dir_path = os.path.join('/notebooks', 'models')

# exist_ok=True makes re-running the cell a no-op when the directories are
# already there, while real failures (e.g. permissions) still surface here
# instead of being swallowed by a bare except.
os.makedirs(log_dir_path, exist_ok=True)
os.makedirs(save_dir_path, exist_ok=True)
# %load_ext tensorboard
# %tensorboard --logdir {log_dir_path}


In [27]:
class TextClassification(nn.Module):
    """1-D CNN text classifier over flattened word embeddings.

    The input is a single-channel sequence in which each word occupies
    ``word_embd_dim`` consecutive values. One Conv1d layer slides a window
    of ``word_kernel_size`` words, one word at a time (stride =
    ``word_embd_dim``), followed by Tanh, flatten and one linear layer.

    Args:
        word_embd_dim: size of one word embedding.
        word_kernel_size: number of words covered by the convolution window.
        max_word_len: padded description length in words. When given, the
            linear layer's input size is derived from it. When ``None`` the
            historical hard-coded size 42272 is used, which only fits the
            original experiment (32 channels, kernel of 3 words, 1319-word
            inputs).
        num_classes: number of output classes.
        out_channels: number of convolution filters.
    """

    def __init__(self, word_embd_dim, word_kernel_size=2, max_word_len=None,
                 num_classes=13, out_channels=32):
        super().__init__()
        self.word_embd_dim = word_embd_dim
        self.word_kernel_size = word_kernel_size
        # Geometry is expressed in whole words; it uses the constructor
        # argument rather than a notebook-level global so the class works
        # for any embedding size.
        self.cnn_layers = nn.Sequential(
            nn.Conv1d(in_channels=1,
                      out_channels=out_channels,
                      kernel_size=word_kernel_size * word_embd_dim,
                      stride=word_embd_dim,
                      padding=(word_kernel_size - 1) * word_embd_dim),
            nn.Tanh(),
            nn.Flatten(),
        )
        if max_word_len is None:
            in_features = 42272
        else:
            # With this kernel/stride/padding the conv emits
            # (max_word_len + word_kernel_size - 1) positions per channel.
            in_features = out_channels * (max_word_len + word_kernel_size - 1)
        # kept as a Sequential so state-dict keys stay 'linear_layer.0.*'
        self.linear_layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=num_classes),
        )

    def forward(self, inputs):
        """Return class scores for ``inputs`` of shape (batch, 1, length)."""
        x = self.cnn_layers(inputs)
        return self.linear_layer(x)

In [28]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Each run logs to its own timestamped TensorBoard directory.
log_dir = os.path.join(
    log_dir_path, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
writer = SummaryWriter(log_dir=log_dir)

# Classifier under test, placed on the selected device.
model_ft = TextClassification(
    word_embd_dim=EMBED_DIM, word_kernel_size=3).to(device)

criterion = nn.CrossEntropyLoss()

# SGD with momentum (noted by the author as the best optimizer)
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

# batch size (noted by the author as the best value)
BATCH_SIZE = 16

# One shuffled loader per phase; train_model reads this dict as a global.
dataloaders = {
    'train': torch.utils.data.DataLoader(
        text_datasets['train'], batch_size=BATCH_SIZE,
        shuffle=True, num_workers=4),
    'val': torch.utils.data.DataLoader(
        text_datasets['val'], batch_size=BATCH_SIZE,
        shuffle=True, num_workers=4),
}

# Run training; model_ft comes back with the best validation weights loaded.
model_ft = train_model(
    model_ft, criterion, optimizer_ft, exp_lr_scheduler, writer,
    num_epochs=25, save_path=save_dir_path)

# experiments
# 2022-06-19_18-44-01: 
# nn.Conv1d(in_channels=1,
#                       out_channels=8,
#                       kernel_size=self.word_kernel_size * EMBED_DIM,
#                       stride=EMBED_DIM,
#                       padding=(self.word_kernel_size - 1) * EMBED_DIM),
#             nn.ReLU(),
#             nn.Flatten()
#         )
# nn.Linear(in_features=10560, out_features=13)
# optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# BATCH_SIZE = 16

# 2022-06-19_19-18-03:
# + nn.MaxPool1d(3, 3),

#2022-06-19_19-42-43:
# - nn.MaxPool1d(3, 3),
# + out_channels=16

#2022-06-19_19-49-02:
# + out_channels=32

#2022-06-19_20-04-31:
# + word_kernel_size = 3

#2022-06-19_20-10-39:
# + nn.Conv1d(in_channels=32,
        # out_channels = 8,
        # kernel_size = 2,
        # stride = 1,
        # padding = 1),
    # nn.ReLU(),

#2022-06-19_20-18-00:
# - nn.Conv1d(in_channels=32,
# out_channels = 8,
# kernel_size = 2,
# stride = 1,
# padding = 1),
# nn.ReLU(),
# - nn.ReLU(), (first CNN layer)
# + nn.Tanh(), (first CNN layer)


Epoch 0/24
----------
train Loss: 2.4190 Acc: 0.2043
val Loss: 2.2811 Acc: 0.2688

Epoch 1/24
----------
train Loss: 1.9478 Acc: 0.4329
val Loss: 1.9449 Acc: 0.3696

Epoch 2/24
----------
train Loss: 1.5796 Acc: 0.5625
val Loss: 1.7521 Acc: 0.4279

Epoch 3/24
----------
train Loss: 1.3235 Acc: 0.6419
val Loss: 1.6449 Acc: 0.4708

Epoch 4/24
----------
train Loss: 1.1349 Acc: 0.7013
val Loss: 1.5857 Acc: 0.4740

Epoch 5/24
----------
train Loss: 0.9850 Acc: 0.7425
val Loss: 1.5661 Acc: 0.4899

Epoch 6/24
----------
train Loss: 0.8603 Acc: 0.7853
val Loss: 1.5484 Acc: 0.4919

Epoch 7/24
----------
train Loss: 0.7666 Acc: 0.8221
val Loss: 1.5431 Acc: 0.4938

Epoch 8/24
----------
train Loss: 0.7554 Acc: 0.8252
val Loss: 1.5419 Acc: 0.4978

Epoch 9/24
----------
train Loss: 0.7455 Acc: 0.8296
val Loss: 1.5417 Acc: 0.5002

Epoch 10/24
----------
train Loss: 0.7355 Acc: 0.8353
val Loss: 1.5410 Acc: 0.5006

Epoch 11/24
----------
train Loss: 0.7261 Acc: 0.8375
val Loss: 1.5419 Acc: 0.4978

Ep