In [38]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    jaccard_score
)

from transformers import (
    T5Tokenizer,
    T5Model,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

In [59]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df["tags"] = train_df["tags"].apply(lambda x: " ".join(x.split("_")))

labels = ['love','life','inspirational','philosophy','humor']
label_string = ",".join(labels)

In [40]:

seed = 9
val_split = 0.1
batch_size = 8
epochs = 10
np.random.seed(seed)

dataset_size = len(train_df)
indices = list(range(dataset_size))
split = int(np.floor(val_split * dataset_size))
np.random.shuffle(indices)

train_indices, val_indices = indices[split:], indices[:split]

In [41]:
def generate_one_hot_encoding(labels, texts):
    one_hot_encodings = []

    for text in texts:
        one_hot_encoding = np.zeros(len(labels), dtype=int)

        for i, label in enumerate(labels):
            if label in text:
                one_hot_encoding[i] = 1

        one_hot_encodings.append(one_hot_encoding)

    return np.array(one_hot_encodings)

In [60]:
# created dataset class for classification

class ClassificationDataset(Dataset):
    def __init__(self, dataframe, indices, tokenizer):
        super(ClassificationDataset, self).__init__()

        df = dataframe.iloc[indices]
        df['quote'] = "classify the quote ("+label_string+"):" + df['quote']
        self.texts = df['quote'].tolist()
        self.targets = df['tags'].tolist()
        self.tokenizer = tokenizer
        self.src_max_length = 512 # based on longest quote
        self.tgt_max_length = 30

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = str(self.targets[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.src_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_enc = self.tokenizer.encode_plus(
            target,
            max_length=self.tgt_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "target_ids": target_enc["input_ids"].squeeze(),
            "target_attn_mask": target_enc["attention_mask"].squeeze()
        }

In [61]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [62]:

tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [63]:
training_data = ClassificationDataset(train_df, train_indices, tokenizer)
val_data = ClassificationDataset(train_df, val_indices, tokenizer)
train_dataloader = DataLoader(training_data, batch_size= batch_size)
validation_dataloader = DataLoader(val_data, batch_size= batch_size)

#batch = next(iter(train_dataloader))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quote'] = "classify the quote ("+label_string+"):" + df['quote']


In [64]:
def train(model, train_dl, val_dl ,criterion, optimizer, scheduler, epochs):
    # we validate config.N_VALIDATE_DUR_TRAIN times during the training loop
    nv = 10
    temp = len(train_dataloader) // nv
    temp = temp - (temp % 100)
    validate_at_steps = [temp * x for x in range(1, nv + 1)]

    train_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader,
                                      desc='Epoch ' + str(epochs))):
        # set model.eval() every time during training
        model.train()

        # unpack the batch contents and push them to the device (cuda or cpu).
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attention_mask = batch['attention_mask'].to(device)

        lm_labels = batch['target_ids'].to(device)
        lm_labels[lm_labels[:, :] == tokenizer.pad_token_id] = -100

        b_tgt_attention_mask = batch['target_attn_mask'].to(device)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=b_src_input_ids,
                        attention_mask=b_src_attention_mask,
                        labels=lm_labels,
                        decoder_attention_mask=b_tgt_attention_mask)
        loss = outputs[0]
        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

        # update scheduler
        scheduler.step()

        if step in validate_at_steps:
            print(f'-- Step: {step}')
            _ = val(model, val_dl, criterion)

    avg_train_loss = train_loss / len(train_dataloader)
    print('Training loss:', avg_train_loss)

In [65]:
def val(model, val_dataloader, criterion):

    val_loss = 0
    true, pred = [], []

    # set model.eval() every time during evaluation
    model.eval()

    for step, batch in enumerate(val_dataloader):
        # unpack the batch contents and push them to the device (cuda or cpu).
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attention_mask = batch['attention_mask'].to(device)

        b_tgt_input_ids = batch['target_ids'].to(device)
        lm_labels = b_tgt_input_ids
        lm_labels[lm_labels[:, :] == tokenizer.pad_token_id] = -100

        b_tgt_attention_mask = batch['target_attn_mask'].to(device)

        # using torch.no_grad() during validation/inference is faster -
        # - since it does not update gradients.
        with torch.no_grad():
            # forward pass
            outputs = model(
                input_ids=b_src_input_ids,
                attention_mask=b_src_attention_mask,
                labels=lm_labels,
                decoder_attention_mask=b_tgt_attention_mask)
            loss = outputs[0]

            val_loss += loss.item()

            # get true
            for true_id in b_tgt_input_ids:
                try:
                  true_decoded = tokenizer.decode(true_id)
                except:
                  # print(b_tgt_input_ids)
                  # print(true_id)
                  true_decoded = ""
                true.append(true_decoded)

            # get pred (decoder generated textual label ids)
            pred_ids = model.generate(
                input_ids=b_src_input_ids,
                attention_mask=b_src_attention_mask
            )
            pred_ids = pred_ids.cpu().numpy()
            for pred_id in pred_ids:
                pred_decoded = tokenizer.decode(pred_id)
                pred.append(pred_decoded)

    true_ohe = generate_one_hot_encoding(labels, true)
    pred_ohe = generate_one_hot_encoding(labels, pred)

    avg_val_loss = val_loss / len(val_dataloader)
    print('Val loss:', avg_val_loss)
    print('Val Jaccard Score:', jaccard_score(true_ohe, pred_ohe, average="samples"))

    val_micro_f1_score = f1_score(true_ohe, pred_ohe, average='macro')
    print('Val micro f1 score:', val_micro_f1_score)
    return val_micro_f1_score


In [66]:
def run():
    # setting a seed ensures reproducible results.
    # seed may affect the performance too.
    torch.manual_seed(seed)

    criterion = nn.BCEWithLogitsLoss()

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = optim.AdamW(optimizer_parameters, lr=3e-4)

    num_training_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    max_val_micro_f1_score = float('-inf')
    for epoch in range(epochs):
        train(model, train_dataloader, validation_dataloader, criterion, optimizer, scheduler, epoch)
        val_micro_f1_score = val(model, validation_dataloader, criterion)

        if True:
            if val_micro_f1_score > max_val_micro_f1_score:
                best_model = copy.deepcopy(model)
                best_val_micro_f1_score = val_micro_f1_score

                model_name = 't5_best_model'
                torch.save(best_model.state_dict(), model_name + '.pt')

                print(f'--- Best Model. Val loss: {max_val_micro_f1_score} -> {val_micro_f1_score}')
                max_val_micro_f1_score = val_micro_f1_score

    return best_model, best_val_micro_f1_score

In [67]:

torch.cuda.empty_cache()


torch.cuda.memory_summary(device=None, abbreviated=False)




In [68]:
torch.cuda.empty_cache()
best_model, best_val_micro_f1_score = run()


Epoch 0:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 7.737827893849966
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Training loss: 0.9753205355282899




Val loss: 0.7503350844254365
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
--- Best Model. Val loss: -inf -> 0.0


Epoch 1:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.7518877514579274
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.7696319382798087




Val loss: 0.7363223534178089
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


Epoch 2:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.7369727742027592
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.7047640935436741




Val loss: 0.7480050340160593
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


Epoch 3:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.7493513139921266
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.6315981273626065




Val loss: 0.7704142301200746
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


Epoch 4:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.7718260486249451
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.5517473799419833




Val loss: 0.8151876838491844
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


Epoch 5:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.8170675587009739
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.478772453859344




Val loss: 0.8622467262116639
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


Epoch 6:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.8646315781248582
Val Jaccard Score: 0.0
Val micro f1 score: 0.0
Training loss: 0.41762603741060866




Val loss: 0.9507568086589779
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 0.9511017481217513
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Training loss: 0.36051956756978304




Val loss: 1.0078196848432224
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 8:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 1.0092577809417569
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Training loss: 0.31950815968043933




Val loss: 1.0484820852005803
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 9:   0%|          | 0/999 [00:00<?, ?it/s]

-- Step: 0




Val loss: 1.0482434298138361
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Training loss: 0.2915868746364737




Val loss: 1.0705407723650202
Val Jaccard Score: 0.0
Val micro f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
torch.save(model.state_dict(), 't5-ft' + '.pt')

In [69]:
dataset_size = len(test_df)
test_indices = list(range(dataset_size))

test_data = ClassificationDataset(test_df, test_indices, tokenizer)
test_dataloader = DataLoader(test_data, batch_size=8)

In [70]:
pred = []
def predict(model):
    val_loss = 0

    model.eval()
    for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            # get pred
            pred_ids = model.generate(
                input_ids=b_src_input_ids,
                attention_mask=b_src_attention_mask
            )
            pred_ids = pred_ids.cpu().numpy()
            for pred_id in pred_ids:
                pred_decoded = tokenizer.decode(pred_id)
                pred.append(pred_decoded)
    print(pred)
    pred_ohe = generate_one_hot_encoding(labels,pred)
    return pred_ohe

pred_ohe = predict(model)

  0%|          | 0/278 [00:00<?, ?it/s]



['<pad> philosophy</s>', '<pad> life</s>', '<pad> inspirational</s>', '<pad> inspirational</s>', '<pad> love</s>', '<pad> love</s>', '<pad> life</s>', '<pad> life</s>', '<pad> humor</s>', '<pad> love</s>', '<pad> life</s>', '<pad> humor</s>', '<pad> love</s>', '<pad> inspirational</s>', '<pad> inspirational</s>', '<pad> inspirational</s>', '<pad> love</s><pad><pad><pad>', '<pad> humor</s><pad><pad><pad>', '<pad> inspirational</s><pad><pad><pad>', '<pad> inspirational philosophy life love</s>', '<pad> inspirational philosophy</s><pad><pad>', '<pad> inspirational</s><pad><pad><pad>', '<pad> philosophy</s><pad><pad><pad>', '<pad> inspirational</s><pad><pad><pad>', '<pad> love</s><pad><pad><pad>', '<pad> inspirational philosophy life love</s>', '<pad> life</s><pad><pad><pad>', '<pad> love</s><pad><pad><pad>', '<pad> inspirational</s><pad><pad><pad>', '<pad> love</s><pad><pad><pad>', '<pad> love</s><pad><pad><pad>', '<pad> life</s><pad><pad><pad>', '<pad> life</s>', '<pad> love</s>', '<pad>

In [71]:
def get_output_tags(out):
  tags = []
  for i in labels:
    if i in out:
      tags.append(i)
  return tags


test_df['pred_tags'] = pred
test_df['pred_tags'] = test_df['pred_tags'].apply(get_output_tags)
pred_ohe = generate_one_hot_encoding(labels,pred)
pred_ohe

array([[0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 0]])

In [None]:
test_df.to_csv('submission.csv', index=False)

In [72]:
test_df

Unnamed: 0,quote,tags,pred_tags
0,we are shocked by thieves taking pride in thei...,philosophy_life,[philosophy]
1,discovering life in oneself is to become what ...,life,[life]
2,this is what the good guys do they keep trying...,inspirational,[inspirational]
3,fourth cosmic sealterrestrial seal 1330 seal ...,humor_inspirational_philosophy_life,[inspirational]
4,we are figments of the same imagination we are...,inspirational,[love]
...,...,...,...
2216,when you change your thinking you change the w...,inspirational_philosophy_life_love,"[life, philosophy]"
2217,love is a combination of virtues the amount yo...,love,[love]
2218,with my talent i can make people laugh and giv...,life,[life]
2219,construct your life plans before satan attempt...,life,"[love, life, inspirational, philosophy, humor]"


In [73]:
def jaccard_similarity(one_hot_true, one_hot_predicted):
    # Compute intersection and union
    intersection = np.sum(np.logical_and(one_hot_true, one_hot_predicted), axis=1)
    union = np.sum(np.logical_or(one_hot_true, one_hot_predicted), axis=1)

    # Compute Jaccard Similarity for each instance
    jaccard_similarities = intersection / union

    return jaccard_similarities

true_ohe = generate_one_hot_encoding(labels, test_df['tags'].tolist())
pred_ohe = generate_one_hot_encoding(labels, pred)

jaccard_similarities = jaccard_similarity(true_ohe, pred_ohe)

In [74]:
jaccard_similarities.mean()

0.5924133273300315

In [None]:
jaccard_score(true_ohe, pred_ohe, average="samples")