In [1]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import json
import nltk
import string
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_units, num_layers, output_size):
        super(RNN, self).__init__()
        self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_units, num_layers=num_layers)
    def forward(self, x):
        print(x.shape)
        return self.encoder(x)

In [3]:
model = RNN(768, 512, 2, 768)
model

RNN(
  (encoder): LSTM(768, 512, num_layers=2)
)

In [4]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [5]:
def train(model: nn.Module,
        loss_fn: nn.Module,
        optimizer: torch.optim.Optimizer,
        dataloader: torch.utils.data.DataLoader,
        device="cpu"
    ):
    model.train()
    train_loss, accuracy = 0, 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        accuracy += (y_pred_class == y).sum().item()/len(y_pred)
    train_loss = train_loss/len(dataloader)
    accuracy = accuracy/len(dataloader)
    return train_loss, accuracy

In [6]:
def test(model: nn.Module,
        loss_fn: nn.Module,
        dataloader: torch.utils.data.DataLoader,
        device="cpu"
    ):
    model.eval()
    test_loss, accuracy = 0, 0
    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            test_loss += loss.item()
            
            y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            accuracy += (y_pred_class == y).sum().item()/len(y_pred)
    test_loss = test_loss/len(dataloader)
    accuracy = accuracy/len(dataloader)
    return test_loss, accuracy

In [7]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    return " ".join(y)

In [8]:
def load_data(path):
    file = open(path, encoding='utf8')
    data = json.load(file)
    X = []
    y = []
    for d in data:
        X.append(d['dialogue'])
        y.append(d['summary'])
    file.close()
    return X, y

X_train, y_train = load_data('./data/train.json')
X_val, y_val = load_data('./data/val.json')
X_test, y_test = load_data('./data/test.json')

In [9]:
# X_train = [transform_text(x) for x in X_train]
# X_val = [transform_text(x) for x in X_val]
# X_test = [transform_text(x) for x in X_test]
# y_train = [transform_text(y) for y in y_train]
# y_val = [transform_text(y) for y in y_val]
# y_test = [transform_text(y) for y in y_test]

In [10]:
import pickle

# with open('data.pkl', 'wb') as file:
#     pickle.dump([X_train, X_val, X_test, y_train, y_val, y_test], file)

with open('data.pkl', 'rb') as file:
    data = pickle.load(file)

[X_train, X_val, X_test, y_train, y_val, y_test] = data

In [None]:
def bert_text_preparation(self, text, tokenizer):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segment_ids = [1]*len(indexed_tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segment_ids])
        return tokenized_text, tokens_tensor, segments_tensor

In [None]:
def convert_to_embedding(x: list) -> list:
    output = []
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return output

In [11]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.length = len(X)
        self.model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    def bert_text_preparation(self, text, tokenizer):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segment_ids = [1]*len(indexed_tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segment_ids])
        return tokenized_text, tokens_tensor, segments_tensor

    def get_bert_embeddings(self, tokens_tensor, segments_tensor, model):
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensor)
            hidden_states = outputs[2][1:]
        token_embeddings = hidden_states[-1]
        token_embeddings = torch.squeeze(token_embeddings, dim=0)
        list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]
        return list_token_embeddings
    def __len__(self):
        return self.length
    def __getitem__(self, idx):
        xtokenized_text, xtokens_tensor, xsegments_tensors = self.bert_text_preparation(self.X[idx], self.tokenizer)
        xlist_token_embeddings = self.get_bert_embeddings(xtokens_tensor, xsegments_tensors, self.model)
        X_tensor = []
        for word in self.X[idx]:
            try:
                xword_index = xtokenized_text.index(word)
                X_tensor.append(xlist_token_embeddings[xword_index])
            except:
                pass
        ytokenized_text, ytokens_tensor, ysegments_tensors = self.bert_text_preparation(self.y[idx], self.tokenizer)
        ylist_token_embeddings = self.get_bert_embeddings(ytokens_tensor, ysegments_tensors, self.model)
        y_tensor = []
        for word in self.y[idx]:
            try:
                yword_index = ytokenized_text.index(word)
                y_tensor.append(ylist_token_embeddings[yword_index])
            except:
                pass
        print(word)
        return torch.tensor(X_tensor).to(torch.float32), torch.tensor(y_tensor).to(torch.float32)

In [12]:
train_dataloader = DataLoader(MyDataset(X_train, y_train), batch_size=32, shuffle=True)
val_dataloader = DataLoader(MyDataset(X_val, y_val), batch_size=32, shuffle=True)
test_dataloader = DataLoader(MyDataset(X_test, y_test), batch_size=32, shuffle=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transfo

In [13]:
# next(iter(test_dataloader))

In [14]:
epochs = 5
for epoch in range(epochs):
    train_loss, train_accuracy = train(model, loss_fn, optimizer, train_dataloader)
    val_loss, val_accuracy = test(model, loss_fn, val_dataloader)
    print(f"Epoch: {epoch} | Train Loss: {train_loss} | Train Accuracy: {train_accuracy} | Val Loss: {val_loss} | Val Accuracy: {val_accuracy}")

eva hey goin painting daniel zoe awesome thanks like machine haha went norway guess eva asking something else zoe date super cool nice eva zoe got class eva sex though haha zoe knows eva hahaa zoe hjmusps wont fight eva zoe haha eva zoe humps eva haha ok zoe date daniel went well sex
x
jessica someone take care baby two hours amanda today jessica yes afternoon pamela yes pleasure pamela take care jessica baby two hours afternoon
n
alex could text pic took phone yesterday alex one center group alex want make new facebook profile pic kim get angry kim deleted mistake alex lol worry alex sure someone else alex wants set picture took profile picture facebook kim already deleted mistake
e
kenna hey saw jacob today kenna like jacob paulson ronny kenna campus kenna bowl kristie really kristie thought hes ottawa kenna probably visiting kenna saw jacob paulson campus bowl probably came ottawa visit
t
jackson like duo sixtynine nicki kaleigh great together eh kala yeah love fefe kala matching ja

RuntimeError: stack expects each tensor to be equal size, but got [19, 768] at entry 0 and [0] at entry 1