In [2]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
import re
from sklearn import model_selection
from sklearn import metrics


import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import torch_utils

In [3]:
# Instantiate the tokenizer for the 'bert-base-uncased' checkpoint. This
# notebook-level `tokenizer` is the one passed to processing() in the ad-hoc
# check cell further down.
# NOTE(review): BertModel is imported here, but the model class defined later
# instantiates the encoder through `transformers.BertModel` instead.
from transformers import BertTokenizer
from transformers import BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [4]:
# Read the training CSV and discard every row that contains a missing value.
infp = 'train.csv'
df = pd.read_csv(infp).dropna()
# Display the row at position 10 as a quick look at the available columns.
df.iloc[10]

textID                                                  2339a9b08b
text              as much as i love to be hopeful, i reckon the...
selected_text    as much as i love to be hopeful, i reckon the ...
sentiment                                                  neutral
Name: 10, dtype: object

In [5]:
# Length, in characters, of the longest tweet in `df`.
# processing() reads this notebook-level value to size its padded sequences.
max_length = df['text'].apply(len).max()


In [6]:
# Display the longest tweet length (measured in characters, not tokens).
max_length

141

In [6]:
class config:
    """Hyper-parameters and the shared tokenizer, grouped as class attributes."""
    # NOTE(review): MAX_LEN is not read by any code visible in this notebook;
    # processing() sizes its sequences from the global `max_length` instead.
    MAX_LEN = 128
    # NOTE(review): run() does not use the values below for its DataLoaders or
    # for its epoch loop — confirm which values are actually intended.
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 16
    EPOCHS = 5
    # Tokenizer picked up by every TweetDataset instance in its constructor.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [112]:
# Turn one (tweet, selected_text, sentiment) example into fixed-length BERT inputs.
def processing(tweet_content, selected_text, tokenizer, sentiment, max_len=None):
    """Encode one example as fixed-length BERT inputs plus span targets.

    The encoded sequence is laid out as
    ``[CLS] <sentiment token> [SEP] <tweet tokens> [SEP] [PAD] ...`` and is
    padded (or truncated) to ``max_len + 3`` entries.

    Args:
        tweet_content: Raw tweet text; URLs are removed before encoding.
        selected_text: Substring of the tweet that supports the sentiment.
        tokenizer: Object exposing ``encode_plus(text, add_special_tokens=...)``
            whose result has an ``input_ids`` attribute.
        sentiment: ``'positive'``, ``'negative'`` or ``'neutral'``.
        max_len: Size budget for the tweet part of the sequence. When omitted,
            the notebook-level ``max_length`` is used.

    Returns:
        dict with keys ``ids``, ``masks``, ``typeids`` (lists of equal length),
        ``begining`` / ``end`` (inclusive character offsets of ``selected_text``
        inside the URL-stripped tweet; both 0 when it is empty or not found),
        ``origin_tweet`` (the URL-stripped tweet), ``selected_text`` (stripped)
        and ``sentiment``.

    Raises:
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    if max_len is None:
        max_len = max_length  # notebook-level global: longest tweet in characters

    # Special-token ids of the bert-base-uncased vocabulary.
    CLS_ID, SEP_ID, PAD_ID = 101, 102, 0
    # presumably the vocabulary ids of the three label words — verify against the tokenizer
    sentiment_id = {'positive': 3893, 'negative': 4997, 'neutral': 8699}

    selected_text = selected_text.strip()
    # Remove URLs.
    tweet_content = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', tweet_content)

    total_len = max_len + 3

    # Ask for raw word-piece ids only: the special tokens are added by hand
    # below, so letting the tokenizer add them too would insert a second [CLS].
    encoded = tokenizer.encode_plus(tweet_content, add_special_tokens=False)
    # Leave room for the 3-token sentiment prefix and the closing [SEP].
    tweet_ids = list(encoded.input_ids)[:max(total_len - 4, 0)]

    text_ids = [CLS_ID, sentiment_id[sentiment], SEP_ID] + tweet_ids + [SEP_ID]
    tokenizer_masks = [1] * len(text_ids)
    # Segment 0 = sentiment prefix, segment 1 = tweet and its closing [SEP].
    tokenizer_typeids = [0] * 3 + [1] * (len(tweet_ids) + 1)

    # Pad with [PAD] and mask the padding out so attention ignores it.
    padding_len = total_len - len(text_ids)
    text_ids = text_ids + [PAD_ID] * padding_len
    tokenizer_masks = tokenizer_masks + [0] * padding_len
    tokenizer_typeids = tokenizer_typeids + [0] * padding_len

    # Locate the first occurrence of the selection inside the cleaned tweet.
    # NOTE(review): these are character offsets, while the model in this
    # notebook emits one logit per token position and train_fn uses the offsets
    # as class indices — confirm that this pairing is intended.
    begining = 0
    ending = 0
    if selected_text:
        found_at = tweet_content.find(selected_text)
        if found_at != -1:
            begining = found_at
            ending = found_at + len(selected_text) - 1

    return {
        'ids': text_ids,
        'masks': tokenizer_masks,
        'typeids': tokenizer_typeids,
        'begining': begining,
        'end': ending,
        'origin_tweet': tweet_content,
        'selected_text': selected_text,
        'sentiment': sentiment,
    }


In [9]:
# Ad-hoc check of processing(): take the 'begining' value for a toy example and
# inspect the shape of the tensor built from it.
# NOTE(review): processing() as defined in this notebook returns 'begining' as
# a plain int, which yields a 0-dim tensor; the recorded torch.Size([144])
# output looks stale (execution counts are out of order) — re-run to confirm.
a=processing('a b c','a b',tokenizer,'negative')['begining']
torch.tensor(a, dtype=torch.long).size()

torch.Size([144])

In [99]:
class TweetDataset:
    """Index-addressable dataset that encodes one tweet per lookup via ``processing``."""

    def __init__(self, tweet, sentiment, selected_text):
        # Three parallel, position-indexable sequences (one entry per example).
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        # Every instance uses the tokenizer stored on ``config``.
        self.tokenizer = config.tokenizer

    def __len__(self):
        """Number of examples, taken from the tweet sequence."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode example ``item`` and return tensors plus the raw strings."""
        encoded = processing(
            self.tweet[item],
            self.selected_text[item],
            self.tokenizer,
            self.sentiment[item],
        )

        def as_long(key):
            # Numeric fields are all converted to int64 tensors.
            return torch.tensor(encoded[key], dtype=torch.long)

        sample = {
            'ids': as_long("ids"),
            'mask': as_long("masks"),
            'token_type_ids': as_long("typeids"),
            'begining': as_long("begining"),
            'end': as_long("end"),
            'orig_tweet': encoded["origin_tweet"],
            'selected_text': encoded["selected_text"],
            'sentiment': encoded["sentiment"],
        }
        return sample

In [81]:
class BertUncaseModel(nn.Module):
    """BERT encoder followed by a linear head giving per-position start/end logits."""

    def __init__(self):
        super(BertUncaseModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop_out = nn.Dropout(0.2)
        # Two outputs per position: column 0 = start logit, column 1 = end logit.
        # The input width is read from the encoder config (768 for bert-base).
        self.l0 = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each with one value per input position.

        Args:
            ids: Token-id tensor passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.
        """
        outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)

        # Index rather than tuple-unpack: element 0 is the per-token hidden
        # states both for plain tuple returns and for dict-like ModelOutput
        # objects (unpacking the latter would yield its keys, not tensors).
        sequence_output = outputs[0]

        # Regularise the encoder output before the head (identity in eval mode).
        sequence_output = self.drop_out(sequence_output)

        logits = self.l0(sequence_output)
        start_logits = logits[..., 0]
        end_logits = logits[..., 1]

        return start_logits, end_logits
    

In [79]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Add the cross-entropy of the start prediction to that of the end prediction.

    Each logits tensor is scored against its matching target-index tensor with
    ``nn.CrossEntropyLoss`` (default settings); the two results are summed.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

In [68]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, the weighted sum, the count and the average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard the division so a zero total count does not raise.
        self.avg = self.sum / self.count if self.count else 0

    @staticmethod
    def accuracy(y_pred, y_actual, topk=(1, )):
        """Computes the precision@k for the specified values of k

        Declared as a static method because it takes no instance: it can be
        called both as ``AverageMeter.accuracy(...)`` and on an instance.

        Args:
            y_pred: 2-D score tensor, one row per sample.
            y_actual: 1-D tensor of true class indices.
            topk: Iterable of k values to evaluate.

        Returns:
            list with one scalar tensor (a percentage) per entry of ``topk``.
        """
        maxk = max(topk)
        batch_size = y_actual.size(0)

        _, pred = y_pred.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(y_actual.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape(), not view(): the transposed comparison result may be
            # non-contiguous, which view(-1) rejects.
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))

        return res

In [131]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training pass over ``data_loader``.

    For every batch: forward pass, summed start/end loss via ``loss_fn``,
    backward pass, optimizer step and (when given) scheduler step. Running
    averages of the loss and of the per-tweet Jaccard score are shown on the
    progress bar.

    Args:
        data_loader: Iterable of batches with the keys ``ids``,
            ``token_type_ids``, ``mask``, ``sentiment``, ``selected_text``,
            ``orig_tweet``, ``begining`` and ``end``.
        model: Callable as ``model(ids, mask, token_type_ids)`` returning
            ``(start_logits, end_logits)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch.
    """
    model.train()
    losses = AverageMeter()
    jaccards = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    for d in tk0:
        sentiment = d["sentiment"]
        orig_selected = d["selected_text"]
        orig_tweet = d["orig_tweet"]

        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["begining"].to(device, dtype=torch.long)
        targets_end = d["end"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs_start, outputs_end = model(
            ids,
            mask,
            token_type_ids
        )

        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)

        loss.backward()
        optimizer.step()
        # The scheduler is optional (default None), so only step it when present.
        if scheduler is not None:
            scheduler.step()

        # Detach before leaving the graph; the arrays are only used for scoring.
        outputs_start = torch.softmax(outputs_start, dim=1).detach().cpu().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).detach().cpu().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=orig_selected[px],
                sentiment_val=sentiment[px],
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
            )
            jaccard_scores.append(jaccard_score)

        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

In [158]:
def calculate_jaccard_score(
    original_tweet,
    target_string,
    sentiment_val,
    idx_start,
    idx_end,
    verbose=False):
    """Cut the predicted span out of a tweet and score it against the target.

    Args:
        original_tweet: Tweet text the indices refer to.
        target_string: Reference selection to compare with.
        sentiment_val: Sentiment label; for ``"neutral"`` the whole tweet is
            used as the prediction.
        idx_start: Inclusive start offset (expected to be non-negative).
        idx_end: Inclusive end offset; raised to ``idx_start`` when smaller and
            clamped to the last character of the tweet.
        verbose: When true, print the extracted text.

    Returns:
        ``(jaccard, filtered_output)``: the word-level Jaccard similarity
        (case-insensitive, whitespace-split) and the extracted text.
    """
    if idx_end < idx_start:
        idx_end = idx_start
    idx_end = min(idx_end, len(original_tweet) - 1)

    # Inclusive character span; empty when the start lies past the clamped end.
    filtered_output = original_tweet[idx_start:idx_end + 1]

    # Neutral tweets and tweets with fewer than two words are returned whole.
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    if verbose:
        print(filtered_output)

    def jaccard(str1, str2):
        a = set(str1.lower().split())
        b = set(str2.lower().split())
        c = a.intersection(b)
        union_size = len(a) + len(b) - len(c)
        if union_size == 0:
            # Neither string contains a word: the two word sets are identical.
            return 1.0
        return float(len(c)) / union_size

    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output


def eval_fn(data_loader, model, device):
    """Score ``model`` on ``data_loader`` with gradient tracking disabled.

    Keeps running averages of the loss and of the per-tweet Jaccard score,
    shows them on the progress bar, prints the final Jaccard average and
    returns it.
    """
    model.eval()
    loss_meter = AverageMeter()
    jaccard_meter = AverageMeter()

    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader))
        for batch in progress:
            # Raw strings stay on the host; they are only needed for scoring.
            sentiments = batch["sentiment"]
            selections = batch["selected_text"]
            tweets = batch["orig_tweet"]

            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets_start = batch["begining"].to(device, dtype=torch.long)
            targets_end = batch["end"].to(device, dtype=torch.long)

            start_logits, end_logits = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(start_logits, end_logits, targets_start, targets_end)

            start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

            # One Jaccard score per tweet, using the most probable start/end positions.
            batch_scores = [
                calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selections[row],
                    sentiment_val=sentiments[row],
                    idx_start=np.argmax(start_probs[row, :]),
                    idx_end=np.argmax(end_probs[row, :]),
                )[0]
                for row, tweet in enumerate(tweets)
            ]

            batch_size = ids.size(0)
            jaccard_meter.update(np.mean(batch_scores), batch_size)
            loss_meter.update(loss.item(), batch_size)
            progress.set_postfix(loss=loss_meter.avg, jaccard=jaccard_meter.avg)

    print(f"Jaccard = {jaccard_meter.avg}")
    return jaccard_meter.avg

In [159]:
def run(epochs=3, train_batch_size=32, valid_batch_size=16):
    """Fine-tune ``BertUncaseModel`` on a slice of ``train.csv`` and validate each epoch.

    Args:
        epochs: Number of passes over the training split.
        train_batch_size: Batch size of the training DataLoader.
        valid_batch_size: Batch size of the validation DataLoader.

    Returns:
        The trained model, so it can be reused after training (assign the
        result to a variable; previously the model was discarded on return).
    """
    # Only the first 100 cleaned rows are used — presumably to keep runs short.
    dfx = pd.read_csv('train.csv').dropna().reset_index(drop=True).iloc[:100]

    # 90/10 split that preserves the sentiment distribution.
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True,  # present the training examples in a new order every epoch
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2
    )

    device = torch.device('cpu')
    model = BertUncaseModel()
    model.to(device)  # keep the parameters on the same device as the batches

    # train_fn steps the scheduler once per batch, so the schedule length must
    # be the real number of optimizer steps; otherwise the learning rate
    # reaches zero before (or after) training actually ends.
    num_train_steps = len(train_data_loader) * epochs

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    return model
        

In [None]:

 test_dataset=TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=32
    num_workers=1
)

with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["selected_text"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["begining"]
        targets_end = d["end"]
           

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
      
       
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                   
                )
            final_output.append(output_sentence)