In [1]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")
import tokenizers
import os
from wordcloud import WordCloud, STOPWORDS
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import StratifiedKFold
from transformers import *
print('imported libs ..')

  import pandas.util.testing as tm


imported libs ..


In [2]:
train = pd.read_csv('tweet-sentiment-extraction-data/train.csv')
test = pd.read_csv('tweet-sentiment-extraction-data/test.csv')

In [3]:
train.dropna(axis=0, how='any', inplace=True)
test.dropna(axis=0, how='any', inplace=True)

In [4]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


**Goal:** 
To predict **selected_text** and **sentiment** from **text**

this task is extremely similar to Question Answering. The only difference is that the question has been replaced by the sentiment, the context/passage by the tweet and the answer by the portion of the tweet signifying the sentiment.

In [6]:
text = train['text'].apply(str).values
selected_text = train['selected_text'].apply(str).values
sentiment = train['sentiment'].apply(str).values

In [7]:
text[0]

' I`d have responded, if I were going'

In [8]:
selected_text[0]

'I`d have responded, if I were going'

In [9]:
sentiment[0]

'neutral'

### Pre-Process & BERT Tokenization

In [10]:
import torch
from transformers import BertConfig, TFBertPreTrainedModel, TFBertMainLayer, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Padding all the sentences to max length

In [11]:
print(tokenizer.tokenize(text[0], add_special_tokens=True))

['i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going']


In [12]:
max_len = 0
for sent in text:
    input_id = tokenizer.encode(sent, add_special_tokens = True)
    max_len = max(max_len, len(input_id))
print(max_len)

110


Input to the BERT Model is the question plus the context, ie; sentiment + text. So we take a max_len of 150. The model should be able to recognize/differentiate the sentiment part, text and the pad tokens separately. 
We need to:
* Concatenate sentiment and text
* Tokenize combined sequence for BERT
* Pad the text to 150
* Differentiate word tokens, pad tokens & sentiment & tweet

In [13]:
input_ids = []
token_type_ids = []
attention_masks = []

for i in range(len(text)):
    encoded = tokenizer.encode_plus(
    sentiment[i],
    text[i],
    add_special_tokens = True,
    max_length = 150,
    pad_to_max_length=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors = 'pt')   #can be changed to a list or pytorch type or tf tensors
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
    token_type_ids.append(encoded['token_type_ids'])


In [14]:
input_ids[0]

tensor([[ 101, 8699,  102, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020,
         2183,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [15]:
token_type_ids[0]

tensor([[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])

In [16]:
attention_masks[0]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])

In [17]:
len(input_ids)

27480

In [18]:
print(type(input_ids))

<class 'list'>


In [19]:
print(len(input_ids[0]))

1




We get 3 tensors above - input_ids, attention_masks and token_type_ids.

1) input_ids contains the sequence of ids of the tokenized form of the input sequence.

2) attention_masks contains a sequence of 0s and 1s. It helps us differentiate between the input sentences and the pad tokens.

3) token_type_ids again contains a sequence of 0s and 1s. It helps us differentiate between the first sequence (sentiment) and the second(tweet). This will be utilized by the model to add [SEP] tokens between the two sequences.

**selected_text** is to be fed as a label to the model.Since it is a part of the text, it can be identified with start and end indices

In [20]:
import numpy as np

input_ids =  torch.cat(input_ids, dim =0)
attention_masks =  torch.cat(attention_masks, dim =0)
token_type_ids =  torch.cat(token_type_ids , dim =0)

input_ids = input_ids.numpy()
attention_masks = attention_masks.numpy()
token_type_ids = token_type_ids.numpy()
print(len(input_ids), len(attention_masks), len(token_type_ids))






27480 27480 27480


In [20]:

# text_ids = input_ids[0]
# print(selected_text[0])
# selected_text_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(selected_text[0], add_special_tokens=True))
# selected_text_ids

In [21]:
start_pos = []
end_pos = []
count = 0
length_text = len(input_ids)
print(length_text, "length_text")

i = 0
while i < length_text:
#     print(i)
    text_ids = input_ids[i]
#     print("text_ids", text_ids)
    selected_text_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(selected_text[i], add_special_tokens=True))
#     print("selected_text_ids", selected_text_ids)
    first = selected_text_ids[0]
    
    if len(selected_text_ids) == 1:
        second = -1
    else:
        second = selected_text_ids[1]
    
    pos = -1
    ctr = -1
    for j in range(len(text_ids) - 1):
        pos += 1
        if second == -1:
            if text_ids[j] == first:
                ctr *= -1
                start_pos.append(pos)
                break
        else:
            if text_ids[j] == first and text_ids[j+1] == second:
                ctr *= -1
                start_pos.append(pos)
                break
    if ctr == -1:
        count += 1
        selected_text = np.delete(selected_text, i)
        input_ids = np.delete(input_ids, i, axis=0)
        attention_masks = np.delete(attention_masks, i, axis=0)
        token_type_ids = np.delete(token_type_ids, i, axis=0)
        
        length_text -= 1
        i -= 1
    else:
        end_pos.append(pos + len(selected_text_ids) - 1)
    i +=1

# print("count", count)

27480 length_text


In [22]:
from torch.utils.data import TensorDataset, random_split
print(len(input_ids), len(token_type_ids),len(start_pos),len( end_pos))
print(type(input_ids), type(start_pos))
start_tensor = torch.IntTensor(start_pos)
end_tensor = torch.IntTensor(end_pos)
dataset = TensorDataset(torch.from_numpy(input_ids), torch.from_numpy(attention_masks),torch.from_numpy(token_type_ids),
                       start_tensor, end_tensor)



26544 26544 26544 26544
<class 'numpy.ndarray'> <class 'list'>


In [23]:
train_split = int(0.8*len(dataset))
val_split = len(dataset) - train_split
print(train_split, val_split)
trainset, valset = random_split(dataset, [train_split, val_split])

21235 5309


In [24]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
bs = 128
trainloader = DataLoader(dataset, batch_size=bs, sampler=RandomSampler(trainset))
valloader = DataLoader(dataset, batch_size=bs, sampler=SequentialSampler(valset))

In [21]:
from transformers import BertForQuestionAnswering, BertConfig, AdamW
device = torch.device("cuda")
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', output_attentions = False, output_hidden_states= False)
model = torch.nn.DataParallel(model) #mutligpu
model.to(device) #for multigpu
# model.cuda()

DataParallel(
  (module): BertForQuestionAnswering(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [26]:
from transformers import get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
epochs = 3
num_training_steps = epochs*len(trainloader)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, num_training_steps=num_training_steps)

In [27]:

def train():
    model.train()
    train_loss = []
    eval_loss = []
    for i in range(epochs):
        batch_loss = 0
        for step, data in enumerate(trainloader):
            if step%50==0:
                print("batch {} of {}".format(step,len(trainloader)))
            input_id = data[0].to(device)
            att_mask = data[1].to(device)
            token_type= data[2].to(device)
            start_pos = data[3].to(device)
            end_pos = data[4].to(device)
#             print(len(input_id),len(att_mask),len(token_type),len(start_pos),len(end_pos) )
            model.zero_grad()
#             print("after  zero grad!")
            loss, start_scores, end_scores = model(input_id.long(), att_mask.long(), token_type.long(), 
                                                   start_positions = start_pos.long(),
                                                  end_positions = end_pos.long())
#             peint("loss", loss.item())
            loss.sum().backward() #loss.backward() for single gpu
#             print("after backward")
            batch_loss +=  loss.sum()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_batch_loss = batch_loss/len(trainloader)
        train_loss.append(batch_loss)
        print("training loss after epoch {} is {}", avg_batch_loss)
        
        
        model.eval()
        eval_loss = []
        
        batch_eval_loss = 0
        
        for step, data in enumerate(valloader):
            if step%50==0:
                print("Validation batch {} of {}".format(step,len(valloader)))
            input_id = data[0].to(device)
            att_mask = data[1].to(device)
            token_type= data[2].to(device)
            start_pos = data[3].to(device)
            end_pos = data[4].to(device)
            
            with torch.no_grad():
                loss, start_scores, end_scores = model(input_id.long(), att_mask.long(), token_type.long(),
                                                       start_positions = start_pos.long(),
                                                  end_positions =  end_pos.long())
                batch_eval_loss += loss.sum()
        avg_batch_eval_loss = batch_eval_loss/len(valloader)
        eval_loss.append(batch_loss)
        print("validation loss after epoch {} is {}", avg_batch_eval_loss)

In [28]:
train()

batch 0 of 166
batch 50 of 166
batch 100 of 166
batch 150 of 166
training loss after epoch {} is {} tensor(2.9075, device='cuda:0', grad_fn=<DivBackward0>)
Validation batch 0 of 42
validation loss after epoch {} is {} tensor(1.7384, device='cuda:0')
batch 0 of 166
batch 50 of 166
batch 100 of 166
batch 150 of 166
training loss after epoch {} is {} tensor(1.6464, device='cuda:0', grad_fn=<DivBackward0>)
Validation batch 0 of 42
validation loss after epoch {} is {} tensor(1.4469, device='cuda:0')
batch 0 of 166
batch 50 of 166
batch 100 of 166
batch 150 of 166
training loss after epoch {} is {} tensor(1.4116, device='cuda:0', grad_fn=<DivBackward0>)
Validation batch 0 of 42
validation loss after epoch {} is {} tensor(1.3411, device='cuda:0')


In [30]:
#Save the model to use it yourself for inference
torch.save(model.state_dict(), "model.pth")


### Inference

In [22]:
#Later to restore:
model.load_state_dict(torch.load("model.pth"))
model.eval()

DataParallel(
  (module): BertForQuestionAnswering(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [23]:
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [24]:
#Preprocess the test set
text_test = test['text'].apply(str).values
sentiment_test = train['sentiment'].apply(str).values

In [25]:
input_ids_test = []
token_type_ids_test = []
attention_masks_test = []

for i in range(len(text_test)):
    encoded = tokenizer.encode_plus(
    sentiment_test[i],
    text_test[i],
    add_special_tokens = True,
    max_length = 150,
    pad_to_max_length=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors = 'pt')   #can be changed to a list or pytorch type or tf tensors
    input_ids_test.append(encoded['input_ids'])
    token_type_ids_test.append(encoded['attention_mask'])
    attention_masks_test.append(encoded['token_type_ids'])

In [26]:
input_ids_test =  torch.cat(input_ids_test, dim =0)
attention_masks_test =  torch.cat(attention_masks_test, dim =0)
token_type_ids_test =  torch.cat(token_type_ids_test , dim =0)

input_ids_test = input_ids_test.numpy()
attention_masks_test = attention_masks_test.numpy()
token_type_ids_test = token_type_ids_test.numpy()
print(len(input_ids_test), len(attention_masks_test), len(token_type_ids_test))

3534 3534 3534


In [29]:
from torch.utils.data import TensorDataset, random_split

eval_dataset = TensorDataset(torch.from_numpy(input_ids_test), torch.from_numpy(attention_masks_test),torch.from_numpy(token_type_ids_test)
                       )

In [32]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=64)

In [35]:
start_results = []
end_results = []
for i, data in enumerate(eval_dataloader):
            
    input_id = data[0].to(device)
    att_mask = data[1].to(device)
    token_type= data[2].to(device)
    with torch.no_grad():
        start_score, end_score = model(input_id.long(), att_mask.long(), token_type.long())
        sel_text_start = torch.argmax(start_score)
        sel_text_end = torch.argmax(end_score)
        
        print(sel_text_start,sel_text_end)
        

tensor(753, device='cuda:0') tensor(322, device='cuda:0')
tensor(7353, device='cuda:0') tensor(9460, device='cuda:0')
tensor(5103, device='cuda:0') tensor(2268, device='cuda:0')
tensor(153, device='cuda:0') tensor(781, device='cuda:0')
tensor(7968, device='cuda:0') tensor(5132, device='cuda:0')
tensor(8253, device='cuda:0') tensor(8557, device='cuda:0')
tensor(903, device='cuda:0') tensor(4955, device='cuda:0')
tensor(2103, device='cuda:0') tensor(8424, device='cuda:0')
tensor(2707, device='cuda:0') tensor(4372, device='cuda:0')
tensor(4503, device='cuda:0') tensor(7980, device='cuda:0')
tensor(5553, device='cuda:0') tensor(2271, device='cuda:0')
tensor(7053, device='cuda:0') tensor(6174, device='cuda:0')
tensor(3753, device='cuda:0') tensor(4664, device='cuda:0')
tensor(9153, device='cuda:0') tensor(7359, device='cuda:0')
tensor(2553, device='cuda:0') tensor(4073, device='cuda:0')
tensor(4818, device='cuda:0') tensor(9169, device='cuda:0')
tensor(9303, device='cuda:0') tensor(5426, de

In [None]:
#to resume training
# state = {
#     'epoch': epoch,
#     'state_dict': model.state_dict(),
#     'optimizer': optimizer.state_dict(),
#     ...
# }
# torch.save(state, filepath)
# state = torch.load(filepath)
# model.load_state_dict(state['state_dict'])
# optimizer.load_state_dict(state['optimizer']) 
#no use of model.eval()

In [None]:
#to save and run with out access to code like a .pb file

# torch.save(model, filepath)

# # Then later:
# model = torch.load(filepath)