In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [2]:
## PyTorch Transformer
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig

In [3]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [4]:
df = pd.read_csv('../data/myPersonality/mypersonality_final.csv', encoding='latin1')
print("The size of data is {0}".format(df.shape[0]))
labels = ['cEXT','cNEU','cAGR','cCON','cOPN']

The size of data is 9917


In [5]:
data = df[['STATUS',labels[0]]]
data = data.rename(columns={'STATUS': "text", labels[0]: "label"})

In [6]:
label_to_ix = {}
for label in data.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'n': 0, 'y': 1}

In [7]:
config = RobertaConfig.from_pretrained('roberta-base')

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

In [9]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [10]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  2387,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [11]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.text[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [12]:
train_size = 0.8
train_dataset=data.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=data.drop(train_dataset.index).reset_index(drop=True)

In [13]:
print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (9917, 2)
TRAIN Dataset: (7934, 2)
TEST Dataset: (1983, 2)


In [14]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [15]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 9])

In [16]:
model(training_set.__getitem__(0)[0])

(tensor([[ 0.4809, -0.0881]], grad_fn=<AddmmBackward>),)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [18]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [19]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [20]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [22]:
from torch_lr_finder import LRFinder

In [23]:
lr_finder = LRFinder(model, optimizer, loss_function, device="cuda")
lr_finder.range_test(training_loader, end_lr=100, num_iter=100)
lr_finder.plot() # to inspect the loss-learning rate graph
lr_finder.reset() # to reset the model and optimizer to their initial state

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

RuntimeError: number of dims don't match in permute

In [24]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 2])


In [25]:
torch.__version__

'1.4.0'

In [26]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

EPOCH -- 0


KeyboardInterrupt: 