In [1]:
import torch
import re
from transformers import BertTokenizer, BertModel
import transformers
import spacy
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import tokenizers




In [2]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [3]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 15

In [4]:
print('We will use the GPU:', torch.cuda.get_device_name(0))
device = torch.device('cuda:0')

We will use the GPU: Tesla K80


In [5]:
df = pd.read_csv('../../../datasets/tweet-sentiment-extraction/train.csv')
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

In [6]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [0]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [9]:
TEXT = text[1]
print(TEXT)
# Print the original sentence.
print(' Original: ', TEXT)

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(TEXT))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(TEXT)))

 Sooo SAD I will miss you here in San Diego!!!
 Original:   Sooo SAD I will miss you here in San Diego!!!
Tokenized:  ['soo', '##o', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego', '!', '!', '!']
Token IDs:  [17111, 2080, 6517, 1045, 2097, 3335, 2017, 2182, 1999, 2624, 5277, 999, 999, 999]


In [36]:
text = df.text.values
sentiment = df.sentiment.values
selected_text = df.selected_text.values
text = text[0]
sentiment = sentiment[0]
selected_text = selected_text[0]

tok_text = tokenizer.encode_plus(
                        sentiment,
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_offsets = True
                   )
tok_sel = tokenizer.encode_plus(selected_text, 
                                 add_special_tokens = False,
                                 max_length = len(selected_text),
                                 return_attention_mask = False,   # Construct attn. masks.
                                 return_token_type_ids = False
                                 )

input_ids_sel = tok_sel['input_ids']

input_ids = tok_text['input_ids']
attention_mask = tok_text['attention_mask']
token_type_ids = tok_text['token_type_ids']


print(input_ids)
print(input_ids_sel)
len_st = len(input_ids_sel)

idx0 = -1
idx1 = -1

for i,tok in enumerate(input_ids):
    if tok == input_ids_sel[0]:
        if input_ids[i:i+len_st] == input_ids_sel:
            idx0 = i
            idx1 = i + len_st
            break

# target_start = [0]*len(input_ids)
# target_start[idx0] = 1

# target_end = [0]*len(input_ids)
# target_end[idx1] = 1
target_start = idx0
target_end = idx1-1

og = tokenizer.decode(input_ids).split()

# print(target_start, target_end)

final = ' '.join(og[idx0:idx1-1])

print(final)

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

inter = 'I`d have responded, if I were going'
print(final)
print(selected_text)
jaccard(inter, selected_text)



[101, 8699, 102, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020, 2183, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020, 2183]
i ` d have responded, if i were going
i ` d have responded, if i were going
I`d have responded, if I were going


1.0

In [10]:
MAX_LEN = 128

In [8]:
def process(text, selected_text, sentiment, tokenizer, max_len):
    tok_text = tokenizer.encode_plus(
                        sentiment,
                        text,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_offsets = True
                   )
    tok_sel = tokenizer.encode_plus(selected_text, 
                                     add_special_tokens = False,
                                     max_length = len(selected_text),
                                     return_attention_mask = False,   # Construct attn. masks.
                                     return_token_type_ids = False
                                     )

    input_ids_sel = tok_sel['input_ids']
    
    input_ids = tok_text['input_ids']
    attention_mask = tok_text['attention_mask']
    token_type_ids = tok_text['token_type_ids']


    len_st = len(input_ids_sel)

    idx0 = -1
    idx1 = -1
    
    for i,tok in enumerate(input_ids):
        if tok == input_ids_sel[0]:
            if input_ids[i:i+len_st] == input_ids_sel:
                idx0 = i
                idx1 = i + len_st
                break

    target_start = [0]*len(input_ids)
    target_start[idx0] = 1

    target_end = [0]*len(input_ids)
    target_end[idx1] = 1
    
    return {
        'ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'target_start': target_start,
        'target_end': target_end,
        'text': text,
        'selected_text': selected_text,
        'sentiment': sentiment,
    }
    

In [9]:
class TweetDataset():
    def __init__(self, df, tokenizer, max_len=128):
        self.tweet = df.text.values
        self.selected_text = df.selected_text.values
        self.sentiment = df.sentiment.values
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        data = process(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'attention_mask': torch.tensor(data["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'target_start': torch.tensor(data["target_start"], dtype=torch.long),
            'target_end': torch.tensor(data["target_end"], dtype=torch.long),
            'text': data["text"],
            'selected_text': data["selected_text"],
            'sentiment': data["sentiment"],
        }

In [10]:
data = TweetDataset(df, tokenizer)

In [11]:
data[0]

{'ids': tensor([ 101, 8699,  102, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020,
         2183,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0

In [28]:
# NUM = 0


# tok_text = tokenizer.encode_plus(
#                         sentiment[NUM],
#                         text[NUM],                      # Sentence to encode.
#                         add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                         max_length = 128,           # Pad & truncate all sentences.
#                         pad_to_max_length = True,
#                         return_attention_mask = True,   # Construct attn. masks.
# #                         return_tensors = 'pt',     # Return pytorch tensors.
#                         return_offsets = True
#                    )
# tok_sel = tokenizer.encode_plus(selected_text[NUM], 
#                                  add_special_tokens = False,
#                                  max_length = len(selected_text[NUM]),
# #                                  return_tensors = 'pt',
#                                  return_attention_mask = False,   # Construct attn. masks.
#                                  return_token_type_ids = False
#                                  )

# input_ids_sel = tok_sel['input_ids']
# input_ids = tok_text['input_ids']
# attention_mask = tok_text['attention_mask']
# token_type_ids = tok_text['token_type_ids']


# len_st = len(input_ids_sel)


# for i,tok in enumerate(input_ids):
#     if tok == input_ids_sel[0]:
#         if input_ids[i:i+len_st] == input_ids_sel:
#             idx0 = i
#             idx1 = i + len_st
#             break
            
# target_start = [0]*len(input_ids)
# target_start[idx0] = 1

# target_end = [0]*len(input_ids)
# target_end[idx1] = 1




# print(input_ids)
# print("target_start", target_start)
# print("target_end", target_end)

[101, 8699, 102, 1045, 1036, 1040, 2031, 5838, 1010, 2065, 1045, 2020, 2183, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
target_start [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
target_end [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(df, test_size=0.14, shuffle=True)

In [13]:
train_loader = torch.utils.data.DataLoader(TweetDataset(train, tokenizer), batch_size = TRAIN_BATCH_SIZE, num_workers = 4)
valid_loader = torch.utils.data.DataLoader(TweetDataset(valid, tokenizer), batch_size = VALID_BATCH_SIZE, num_workers = 4)

In [14]:
import torch
import transformers
import torch.nn as nn
from torch.nn import functional as F


class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.bert = transformers.BertModel(conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits




In [19]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss

In [51]:
import utils
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import re



def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    print(start_logits)
    print(start_positions)
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss


def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    model.zero_grad()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["attention_mask"]
        targets_start = d["target_start"]
        targets_end = d["target_end"]
        sentiment = d["sentiment"]
        orig_selected = d["selected_text"]
        orig_tweet = d["text"]


        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = torch.argmax(targets_start).to(device, dtype=torch.long)
        targets_end = torch.argmax(targets_end).to(device, dtype=torch.long)

        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        

        
        
        
        print(targets_start.shape, outputs_start.shape)
        
#         print(outputs_start)
#         print(targets_start)
        outputs_start = torch.softmax(outputs_start, dim=1)
        outputs_end = torch.softmax(outputs_end, dim=1)
        
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        
        
        
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        model.zero_grad()

        
#         jaccard_scores = []
#         for px, tweet in enumerate(orig_tweet):
#             selected_tweet = orig_selected[px]
#             tweet_sentiment = sentiment[px]
#             jaccard_score = calculate_jaccard_score(
#                 original_tweet=tweet,
#                 target_string=selected_tweet,
#                 sentiment_val=tweet_sentiment,
#                 idx_start=np.argmax(outputs_start[px, :]),
#                 idx_end=np.argmax(outputs_end[px, :]),
#                 offsets_start=offsets_start[px, :],
#                 offsets_end=offsets_end[px, :]
#             )
#             jaccard_scores.append(jaccard_score)

#         jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)




In [54]:
device = torch.device("cuda:0")
model_config = transformers.BertConfig.from_pretrained('bert-base-uncased')
model_config.output_hidden_states = True
model = TweetModel(conf=model_config)
model.to(device)


num_train_steps = int(len(train) / TRAIN_BATCH_SIZE * EPOCHS)

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

In [55]:
for epoch in range(15):
        train_fn(train_loader, model, optimizer, device, scheduler=scheduler)

  0%|          | 0/739 [00:00<?, ?it/s]


OSError: [Errno 12] Cannot allocate memory