In [1]:
import pandas as pd
import numpy as np
import json, re
import tqdm #for visualisation of loops
from uuid import uuid4 #to generate unique ids

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
# from torch.autograd import Variable (deprecated)
from torch.utils.data import Dataset, DataLoader


In [2]:
!pip install pytorch-transformers



In [3]:
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [4]:
dataset_path = '/content/clean_datav4.csv'

In [5]:
dataset = pd.read_csv(dataset_path)
dataset

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,tweet_id,text,missing_text,Text_Only_Informative,Image_Only_Informative,Directed_Hate,Generalized_Hate,Sarcasm,Allegation,Justification,Refutation,Support,Oppose
0,0,0,0,1.052240e+18,new post domestic violence awareness caught me...,0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,1,1,1.052210e+18,domestic violence awareness caught metoo,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,2,2,1.052180e+18,mother nature metoo,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,3,1.052160e+18,ption no2,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,4,4,1.052100e+18,high time metoo named shamed men medium advert...,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7973,7973,7973,7973,1.052100e+18,one priyaramani make billion people metooindia...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7974,7974,7974,7974,1.052100e+18,thought metoo limited woman condeming wake rea...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7975,7975,7975,7975,1.052100e+18,wake metoo movement hairstylist sapna bhavani ...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7976,7976,7976,7976,1.052100e+18,metoo icc step sexual harassment,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
dataset[(dataset["Support"]==1) & (dataset["Oppose"]==1)]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,tweet_id,text,missing_text,Text_Only_Informative,Image_Only_Informative,Directed_Hate,Generalized_Hate,Sarcasm,Allegation,Justification,Refutation,Support,Oppose


In [7]:
len(dataset[(dataset["Support"]==0) & (dataset["Oppose"]==0)])

4875

In [8]:
len(dataset[(dataset["Support"]==0) & (dataset["Oppose"]==1)])


602

In [9]:
len(dataset[(dataset["Support"]==1) & (dataset["Oppose"]==0)])


2501

In [10]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = 3
config

{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

**BERT Tokenizer**

Here, the BERT tokenizer splits the string into multiple substrings. If the substrings are in its vocabulary, they will stay as is: this is the case for `array`,  `are` and  `cool`. However, if a resulting string is not in its vocabulary, it will be split again until every string is represented by its vocabulary. For example,  `Systolic` is split multiple times until every token is represented in the BERT vocabulary: it is split into four tokens.
The BERT tokenizer is lacking when it comes to complex characters spread over multiple bytes, as can be seen with emojis. In the sequence used, an emoji of a whale was added. As the BERT tokenizer cannot interpret this emoji on a byte-level, it replaces it by the unknown token [UNK].

**RoBERTa Tokenizer**

On the other hand, the RoBERTa tokenizer has a slightly different approach. Here too, the string is split into multiple substrings, which are themselves split into multiple substrings until every substring can be represented by the vocabulary. However, the RoBERTa tokenizer has a **byte-level approach**. This tokenizer can represent every sequence as a combination of bytes, which makes it shine in the case of complex characters spread over multiple bytes, as with the whale emoji. Instead of using the unknown token, this tokenizer can correctly encode the whale emoji as the combination of multiple bytes. This tokenizer therefore does not require an unknown token, as it can handle every byte separately.

In [11]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

In [12]:
def prepare_sentence_features(sentence, max_sent_length=300, zero_pad=True,
                              include_CLS_token=True, include_SEP_token=True):
    tokens = tokenizer.tokenize(sentence)
    # print(tokens)

    if(len(tokens)>max_sent_length-2):
        tokens = tokens[:(max_sent_length-2)]
    
    result = []
    if include_CLS_token:
        result.append(tokenizer.cls_token)
    
    result += tokens

    if include_SEP_token:
        result.append(tokenizer.sep_token)

    # print(result)
    token_indices = tokenizer.convert_tokens_to_ids(result)
    input_mask = [1] * len(token_indices)

    if zero_pad:
        while(len(token_indices)<max_sent_length):
            token_indices+=[0]
            input_mask+=[0]
        
        # note that torch.unsqueeze(input, dim) Returns a new tensor with a 
        # dimension of size one inserted at the specified position.
    return torch.tensor(token_indices).unsqueeze(0), input_mask

In [13]:
print(prepare_sentence_features(dataset["text"][0]))

(tensor([[   0,   92,  618, 1897, 1476, 4199, 2037, 1145, 3036,   92, 1423, 9657,
           86, 1027,  429,  120, 2564,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, 

In [14]:
# will now have to create a custom Dataset Loader Class

In [15]:
# now need to make dataframe consisting of text and labels

In [16]:
df = dataset

In [17]:
text = df['text'].values.reshape(-1, 1)
no_rows = text.shape[0]
support = df['Support'].values.reshape(-1, 1)
oppose = df['Oppose'].values.reshape(-1, 1)
labels = np.zeros(support.shape)
labels = support * 1 + oppose * 2
twos = labels[labels==2]
ones = labels[labels==1]

In [18]:
labels = labels.astype(np.int32)
labels

array([[1],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [19]:
print(ones.shape, twos.shape)
data_arr = np.concatenate((text, labels), axis=1)
data_twos=data_arr[data_arr[:, 1]==2]
data_ones=data_arr[data_arr[:, 1]==1]
data_zeros=data_arr[data_arr[:, 1]==0]

(2501,) (602,)


In [53]:
data_twos[10, 0]

'metoo potent enemy higher award harassment victim workplace '

In [20]:
train_rows = 5983 #0.75
test_rows = 7978 - 5983 #0.25

In [21]:
from sklearn.model_selection import train_test_split
ones_text_train, ones_text_test, ones_label_train, ones_label_test = train_test_split(data_ones[:, 0], data_ones[:, 1], train_size=0.9, shuffle=True)
twos_text_train, twos_text_test, twos_label_train, twos_label_test = train_test_split(data_twos[:, 0], data_twos[:, 1], train_size=0.9, shuffle=True)

I plan to include 90% of ones and 90% of twos in my train set
<br>
Remaining all will be zeros

In [22]:
req_zeros = train_rows - (ones_text_train.shape[0]+twos_text_train.shape[0])
req_zeros

3192

In [23]:
np.random.shuffle(data_zeros)
zeros_text_train = data_zeros[:req_zeros, 0]
zeros_text_test = data_zeros[req_zeros:, 0]

zeros_label_train = data_zeros[:req_zeros, 1]
zeros_label_test = data_zeros[req_zeros:, 1]

In [24]:
text_train = np.concatenate((zeros_text_train, ones_text_train, twos_text_train), axis=0)
label_train = np.concatenate((zeros_label_train, ones_label_train, twos_label_train), axis=0)

text_test = np.concatenate((zeros_text_test, ones_text_test, twos_text_test), axis=0)
label_test = np.concatenate((zeros_label_test, ones_label_test, twos_label_test), axis=0)

text_train=text_train.reshape(-1, 1)
text_test=text_test.reshape(-1, 1)
label_train=label_train.reshape(-1, 1)
label_test=label_test.reshape(-1, 1)

In [25]:
train_df = pd.DataFrame(data=np.concatenate((text_train, label_train), axis=1), columns=['text', 'labels'], index=None).sample(frac=1).reset_index(drop=True)
test_df = pd.DataFrame(data=np.concatenate((text_test, label_test), axis=1), columns=['text', 'labels'], index=None).sample(frac=1).reset_index(drop=True)

In [26]:
train_df=train_df.dropna()
train_df = train_df.reset_index(drop=True)

In [27]:
train_df[train_df.isnull().values.any(axis=1)]

Unnamed: 0,text,labels


In [28]:
test_df[test_df.isnull().values.any(axis=1)]


Unnamed: 0,text,labels


In [29]:
class Stance(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
    
    def __getitem__(self, ind):
        label = self.data['labels'][ind]
        text = self.data['text'][ind]
        X, input_mask = prepare_sentence_features(text)
        y = label
        return X, y

    def __len__(self):
        return self.len

In [30]:
training_set = Stance(train_df)
testing_set = Stance(test_df)

In [31]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 300])

In [32]:
model(training_set.__getitem__(0)[0])

(tensor([[0.1362, 0.3850, 0.1579]], grad_fn=<AddmmBackward>),)

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()
device

device(type='cuda')

In [34]:
params = {
    'batch_size':1,
    'shuffle':True,
    'drop_last':False,#does not drop the last non - full batch of each workers's dataset replica
    'num_workers':1#only create 1 worker process
}

In [35]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [36]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params = model.parameters(), lr = learning_rate)

In [37]:
model.parameters()

<generator object Module.parameters at 0x7f4344314150>

In [38]:
inp = training_set.__getitem__(0)[0].cuda()
op = model(inp)[0]
print(op)

tensor([[0.3108, 0.1391, 0.2475]], device='cuda:0', grad_fn=<AddmmBackward>)


In [39]:
torch.__version__

'1.5.1+cu101'

In [40]:
a = torch.randn(1, 4)
torch.max(a, 1)

torch.return_types.max(values=tensor([1.8308]), indices=tensor([2]))

In [41]:
test_df.shape

(1995, 2)

In [42]:
# i = 0
# total =0
# correct = 0
# for sent, label in training_loader:
#     sent = sent.squeeze(0).cuda()
#     print(sent, label)
#     output = model(sent)[0]
#     _, predicted = torch.max(output.data, 1)
#     total+=label.size(0)
#     correct += (predicted.cpu() == label.cpu()).sum()
#     i+=1
#     if i==1:
#         break
# accuracy = 100.00 * correct.numpy()/total

In [43]:
test_df['text']=test_df['text'].astype('str')
train_df['text']=train_df['text'].astype('str')

In [44]:
types=[]
for s in train_df["text"]:
    types.append(str(type(s)))
np.unique(types)

array(["<class 'str'>"], dtype='<U13')

In [47]:
max_epochs = 3
model = model.train()
for epoch in tqdm.notebook.tqdm(range(max_epochs)):
    print(f"EPOCH -- {epoch}")
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()#resetting gradients
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        output = model.forward(sent)[0]
        _, max_ind = torch.max(output, 1)
            
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()

        if i%1000 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total+=label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy()/total
            print(f"Iteration: {i}, Loss: {loss.item()}, Accuracy: {accuracy}")

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

EPOCH -- 0
Iteration: 0, Loss: 0.4755229949951172, Accuracy: 84.41102756892231
Iteration: 1000, Loss: 0.685578465461731, Accuracy: 71.9298245614035
Iteration: 2000, Loss: 1.0936509370803833, Accuracy: 84.06015037593986
Iteration: 3000, Loss: 0.9138317108154297, Accuracy: 83.85964912280701
Iteration: 4000, Loss: 1.171872854232788, Accuracy: 61.00250626566416
Iteration: 5000, Loss: 0.6362693309783936, Accuracy: 35.6390977443609
EPOCH -- 1
Iteration: 0, Loss: 1.1631195545196533, Accuracy: 83.30827067669173
Iteration: 1000, Loss: 2.5677711963653564, Accuracy: 66.9172932330827
Iteration: 2000, Loss: 0.6895544528961182, Accuracy: 38.446115288220554
Iteration: 3000, Loss: 1.1534366607666016, Accuracy: 84.06015037593986
Iteration: 4000, Loss: 0.9210395812988281, Accuracy: 70.57644110275689
Iteration: 5000, Loss: 1.0356955528259277, Accuracy: 80.45112781954887
EPOCH -- 2
Iteration: 0, Loss: 0.48753929138183594, Accuracy: 84.41102756892231
Iteration: 1000, Loss: 0.789152979850769, Accuracy: 50.0

In [51]:
path = './roberta_state_dict_'+ str(uuid4())+'.pth'
path

'./roberta_state_dict_38daed95-e4e9-44ed-8dc2-8760bbf9bf37.pth'

In [50]:
torch.save(model.state_dict(), path)

In [56]:
model(prepare_sentence_features('metoo potent enemy higher award harassment victim workplace')[0].cuda())

(tensor([[ 0.9388,  0.6549, -0.4261]], device='cuda:0', grad_fn=<AddmmBackward>),)