In [None]:
import numpy as np 
import pandas as pd
from pprint import pprint
import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from transformers import BertTokenizer,AdamW,BertPreTrainedModel,BertModel,get_linear_schedule_with_warmup,BertConfig
device = torch.device("cuda")

In [2]:
#device = torch.device("cpu")

In [4]:
df = pd.read_excel('input/Entity_sentiment_trainV2.xlsx')
df_test = pd.read_excel('input/Entity_sentiment_testV2.xlsx')

In [5]:
df.head()

Unnamed: 0,Sentence,Entity,Sentiment
0,The website was very easy to use and my insura...,website,positive
1,The web sight was easy to understand and I got...,web sight,positive
2,Having filled in the application on-line I cou...,point,negative
3,After finding AXA was cheaper than my renewal ...,prices,positive
4,The quote was a reasonable price compared with...,insurances,positive


In [7]:
idx = random.choice(range(len(df)))
pprint(df.Sentence.iloc[idx])
pprint(df.Entity.iloc[idx])
pprint(df.Sentiment.iloc[idx])

"ps bruce if you've won the euromillions don't forget to pay my years worth off"
'years'
'negative'


In [8]:
df.isna().sum()

Sentence     0
Entity       0
Sentiment    0
dtype: int64

In [9]:
df.Sentiment.value_counts()

positive    4100
negative    1899
Name: Sentiment, dtype: int64

In [10]:
#Checking the length of Entity Tokens
#[(i,t)  for i,t in enumerate(df.Entity) if len(t.split()) !=1]

# Main Function 1- config

In [13]:

class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 2
    BERT_PATH = "../input/bert-base-uncased/"
    MODEL_PATH = "pytorch_model.bin"
    print('Loading BERT tokenizer...')
    TOKENIZER = BertTokenizer.from_pretrained(f"{BERT_PATH}/vocab.txt", do_lower_case=True)
    
    

Loading BERT tokenizer...


In [14]:
#Get maximu length
"""max_len = 0
sentences = df.Sentence.tolist() + df_test.Sentence.tolist()

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = config.TOKENIZER.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)"""

"max_len = 0\nsentences = df.Sentence.tolist() + df_test.Sentence.tolist()\n\n# For every sentence...\nfor sent in sentences:\n\n    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n    input_ids = config.TOKENIZER.encode(sent, add_special_tokens=True)\n\n    # Update the maximum sentence length.\n    max_len = max(max_len, len(input_ids))\n\nprint('Max sentence length: ', max_len)"

In [15]:
config.MAX_LEN = 155

In [16]:
def format_col(x):
    #print(x)
    sentence = config.TOKENIZER.tokenize(x['Sentence'])
    entity = config.TOKENIZER.tokenize(x['Entity'])
    ent_index = []
    for ent in entity:
        try:
            ent_index.append(str(sentence.index(ent) + 1))
        except:
            pass
    if len(ent_index) > 0:
        return ' '.join(ent_index)

    return '-1'

In [17]:
def process_data(sentence, entity, sentiment,entity_indexes, tokenizer, max_len):
    
    
    encoded_dict = tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True   # Construct attn. masks.
                                       )
    
    input_ids = encoded_dict['input_ids']
    
    mask = encoded_dict['attention_mask']
    
    label = 1 if sentiment == 'positive' else 0
    

    return {
        'ids': input_ids,
        'mask': mask,
        'entity_indexes':entity_indexes,
        'target': label,
        'sentence': sentence,
        'entity': entity
       
    }


In [19]:
#X = process_data(df.Sentence.iloc[7], df.Entity.iloc[7], df.Sentiment.iloc[7],df.entity_indexes.iloc[7], config.TOKENIZER, config.MAX_LEN)

In [20]:
class SentimentDataset:
    def __init__(self, sentence, entity, sentiment,entity_indexes):
        self.sentence = sentence
        self.entity = entity
        self.sentiment = sentiment
        self.entity_indexes = entity_indexes
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, item):
        data = process_data(
            self.sentence[item], 
            self.entity[item], 
            self.sentiment[item],
            self.entity_indexes[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'target': torch.tensor(data["target"]),
            'entity_indexes':data['entity_indexes'],
            'sentence':data['sentence'],
            'entity':data['entity']
        }
            

In [21]:
#X = SentimentDataset(df.Sentence.tolist(), df.Entity.tolist(), df.Sentiment.tolist())

In [23]:
class EntitySentimentModel(BertPreTrainedModel):
    def __init__(self, conf):
        super(EntitySentimentModel, self).__init__(conf)
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.bert = BertModel.from_pretrained(config.BERT_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 1)
        torch.nn.init.normal_(self.classifier.weight, std=0.02)
    
    def forward(self, ids, mask,ent_indexes,labels = None):
        
        _, _, out = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=None
        )
        
        last_hidden_state = out[-1]
        #print('Hidden State Shape-----',last_hidden_state.shape)
        #print('Ent Inx shape-----------',len(ent_indexes))
        
        ent_word_embeds = []
        #for i,ent_index in enumerate()
        for i,idx in enumerate(ent_indexes):
            ent_embedds = []
            ent_idx = [int(t) for t in idx.split()]
            for j in ent_idx:
                ent_embedds.append(last_hidden_state[i,j,:].squeeze())
            ent_embedds = torch.mean(torch.stack(ent_embedds,1),1)
            #print('last try',ent_embedds.shape)
            ent_word_embeds.append(ent_embedds)
            #ent_word_embeds[i,:] = ent_embedds.unsqueeze(0)
        
        ent_word_embeds = torch.stack(ent_word_embeds)
        #print('Concat Shape----------',ent_word_embeds.shape)
        ent_word_embeds = self.drop_out(ent_word_embeds)
        #print('Dropout Shape----------',ent_word_embeds.shape)
        logits = self.classifier(ent_word_embeds).squeeze()
        
        if labels is not None:
            #print('*******',labels)
            #print('---------',logits.shape)
            labels = labels.type_as(logits)
            #print('hiiiiii',labels[:1])
            loss = self.loss_fn(logits, labels)
            return loss
      
        return logits


In [24]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    total_loss = 0
    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        mask = d["mask"]
        target = d["target"]
        entity_indexes = d["entity_indexes"]
        sentence = d["sentence"]
        entity = d["entity"]
        
        #print('Iterating on first data point')
        #print(entity_indexes)
  

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)

        model.zero_grad()
        loss = model(
            ids=ids,
            mask=mask,
            ent_indexes = entity_indexes,
            labels=target
        )
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_loss / len(data_loader) 
    print(f'Average Loss at  is {avg_train_loss}')

     

In [25]:
#Validation Code

def accuracy_params(pred,true):
    pred_classes = [1 if p>0.5 else 0 for p in pred]
    
    accuracy = accuracy_score(true, pred_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(true, pred_classes)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(true, pred_classes)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(true, pred_classes)
    print('F1 score: %f' % f1)
    

def eval_fn(data_loader, model, device,test = False):
    model.eval()
    test_predictions = []
    true = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            target = d["target"]
            entity_indexes = d["entity_indexes"]
            sentence = d["sentence"]
            entity = d["entity"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)

            logits = model(
                ids=ids,
                mask=mask,
                ent_indexes = entity_indexes
            )
            
            target = target.type_as(logits)
            
            test_predictions.append(logits.cpu().detach().numpy())
            true.append(target.cpu().detach().numpy())
            
            loss = nn.BCEWithLogitsLoss()(logits, target)
            print('Loss during Validation Data',loss)
            
        test_predictions = np.concatenate( test_predictions, axis=0 ) 
        true = np.concatenate(true, axis=0 )
        
        test_predictions = torch.sigmoid(torch.tensor(test_predictions)).numpy()
        
        accuracy_params(test_predictions,true)
           

In [26]:
def run(df_train,df_valid = None,fold = None):
    #dfx = pd.read_csv(config.TRAINING_FILE)

    #df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    #df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = SentimentDataset(
        sentence=df_train.Sentence.values,
        entity=df_train.Entity.values,
        sentiment=df_train.Sentiment.values,
        entity_indexes = df_train.entity_indexes.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    valid_dataset = SentimentDataset(
        sentence=df_valid.Sentence.values,
        entity=df_valid.Entity.values,
        sentiment=df_valid.Sentiment.values,
        entity_indexes = df_valid.entity_indexes.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle = False,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = EntitySentimentModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )


    
    # I'm training only for 2 epochs and saving the model on last iteration
    for epoch in range(3):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        eval_fn(valid_data_loader, model, device)
        
        if epoch == 2:
            torch.save(model.state_dict(), f'fold_{fold}')
    

# Main Function 2 - train_model

In [27]:
def train_model(df):
    df['entity_indexes'] = df.apply(format_col,axis = 1)
    from sklearn.model_selection import StratifiedKFold
    kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    i=0
    for train_index,valid_index in kf.split(df.Sentence,df.Sentiment):
        df_train,df_valid = df.loc[train_index],df.loc[valid_index]
        run(df_train,df_valid,i)
        i +=1
        
train_model(df)

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


Average Loss at  is 0.2859835806985696


HBox(children=(FloatProgress(value=0.0, max=75.0), HTML(value='')))

Loss during Validation Data tensor(0.2677, device='cuda:0')
Loss during Validation Data tensor(0.0215, device='cuda:0')
Loss during Validation Data tensor(0.3378, device='cuda:0')
Loss during Validation Data tensor(0.2631, device='cuda:0')
Loss during Validation Data tensor(0.1872, device='cuda:0')
Loss during Validation Data tensor(0.1317, device='cuda:0')
Loss during Validation Data tensor(0.1360, device='cuda:0')
Loss during Validation Data tensor(0.2116, device='cuda:0')
Loss during Validation Data tensor(0.4272, device='cuda:0')
Loss during Validation Data tensor(0.1225, device='cuda:0')
Loss during Validation Data tensor(0.1414, device='cuda:0')
Loss during Validation Data tensor(0.0847, device='cuda:0')
Loss during Validation Data tensor(0.0541, device='cuda:0')
Loss during Validation Data tensor(0.0783, device='cuda:0')
Loss during Validation Data tensor(0.0376, device='cuda:0')
Loss during Validation Data tensor(0.0474, device='cuda:0')
Loss during Validation Data tensor(0.213

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


Average Loss at  is 0.1224664031714201


HBox(children=(FloatProgress(value=0.0, max=75.0), HTML(value='')))

Loss during Validation Data tensor(0.3098, device='cuda:0')
Loss during Validation Data tensor(0.0078, device='cuda:0')
Loss during Validation Data tensor(0.2456, device='cuda:0')
Loss during Validation Data tensor(0.2537, device='cuda:0')
Loss during Validation Data tensor(0.2418, device='cuda:0')
Loss during Validation Data tensor(0.0672, device='cuda:0')
Loss during Validation Data tensor(0.1119, device='cuda:0')
Loss during Validation Data tensor(0.1697, device='cuda:0')
Loss during Validation Data tensor(0.4367, device='cuda:0')
Loss during Validation Data tensor(0.1407, device='cuda:0')
Loss during Validation Data tensor(0.1225, device='cuda:0')
Loss during Validation Data tensor(0.0210, device='cuda:0')
Loss during Validation Data tensor(0.0248, device='cuda:0')
Loss during Validation Data tensor(0.0293, device='cuda:0')
Loss during Validation Data tensor(0.0171, device='cuda:0')
Loss during Validation Data tensor(0.0264, device='cuda:0')
Loss during Validation Data tensor(0.257

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


Average Loss at  is 0.08771276794994871


HBox(children=(FloatProgress(value=0.0, max=75.0), HTML(value='')))

Loss during Validation Data tensor(0.3098, device='cuda:0')
Loss during Validation Data tensor(0.0078, device='cuda:0')
Loss during Validation Data tensor(0.2456, device='cuda:0')
Loss during Validation Data tensor(0.2537, device='cuda:0')
Loss during Validation Data tensor(0.2418, device='cuda:0')
Loss during Validation Data tensor(0.0672, device='cuda:0')
Loss during Validation Data tensor(0.1119, device='cuda:0')
Loss during Validation Data tensor(0.1697, device='cuda:0')
Loss during Validation Data tensor(0.4367, device='cuda:0')
Loss during Validation Data tensor(0.1407, device='cuda:0')
Loss during Validation Data tensor(0.1225, device='cuda:0')
Loss during Validation Data tensor(0.0210, device='cuda:0')
Loss during Validation Data tensor(0.0248, device='cuda:0')
Loss during Validation Data tensor(0.0293, device='cuda:0')
Loss during Validation Data tensor(0.0171, device='cuda:0')
Loss during Validation Data tensor(0.0264, device='cuda:0')
Loss during Validation Data tensor(0.257

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


Average Loss at  is 0.2659868985414505


HBox(children=(FloatProgress(value=0.0, max=75.0), HTML(value='')))

Loss during Validation Data tensor(0.2777, device='cuda:0')
Loss during Validation Data tensor(0.1645, device='cuda:0')
Loss during Validation Data tensor(0.0912, device='cuda:0')
Loss during Validation Data tensor(0.0411, device='cuda:0')
Loss during Validation Data tensor(0.2470, device='cuda:0')
Loss during Validation Data tensor(0.1489, device='cuda:0')
Loss during Validation Data tensor(0.2336, device='cuda:0')
Loss during Validation Data tensor(0.2246, device='cuda:0')
Loss during Validation Data tensor(0.1003, device='cuda:0')
Loss during Validation Data tensor(0.0788, device='cuda:0')
Loss during Validation Data tensor(0.3668, device='cuda:0')
Loss during Validation Data tensor(0.0352, device='cuda:0')
Loss during Validation Data tensor(0.0322, device='cuda:0')
Loss during Validation Data tensor(0.0670, device='cuda:0')
Loss during Validation Data tensor(0.0337, device='cuda:0')
Loss during Validation Data tensor(0.1434, device='cuda:0')
Loss during Validation Data tensor(0.086

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

KeyboardInterrupt: 

# Main Function 3- test_model()

In [28]:
def test_model(df_test)
    device = torch.device("cuda")
    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True

    model1 = EntitySentimentModel(conf=model_config)
    model1.to(device)
    model1.load_state_dict(torch.load("fold_0"))
    model1.eval()
    

    df_test.loc[:, "Sentiment"] = 0
    df_test['entity_indexes'] = df_test.apply(format_col,axis = 1)


    test_dataset = SentimentDataset(
        sentence=df_test.Sentence.values,
        entity=df_test.Entity.values,
        sentiment=df_test.Sentiment.values,
        entity_indexes = df_test.entity_indexes.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle = False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    test_predictions = []
    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            target = d["target"]
            entity_indexes = d["entity_indexes"]
            sentence = d["sentence"]
            entity = d["entity"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)

            logits = model1(
                ids=ids,
                mask=mask,
                ent_indexes = entity_indexes
            )

            target = target.type_as(logits)

            test_predictions.append(logits.cpu().detach().numpy())

    test_predictions = np.concatenate( test_predictions, axis=0 ) 
        
    test_predictions = torch.sigmoid(torch.tensor(test_predictions)).numpy()
    pred = [1 if p>0.5 else 0 for p in test_predictions]
    pred = ['positive' if p == 1 else 'negative' for p in pred]
    df_test.loc[:,'Sentiment'] = pred
    df_test = df_test[['Sentence','Entity','Sentiment']]
    
    return df_test
 
df_test = test_model(df_test)  
    

EntitySentimentModel(
  (loss_fn): BCEWithLogitsLoss()
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [47]:
idx = random.choice(range(len(df_test)))
pprint(df_test.Sentence.iloc[idx])
pprint(df_test.Entity.iloc[idx])
pprint(df_test.Sentiment.iloc[idx])

('Taken out 2 car insurance policies today went through policies with customer '
 'advisor Paul who was very helpful and explained details to me so i could '
 'easily understand.')
'customer advisor'
'positive'


In [None]:
df_test.to_csv('test_pred.csv',index=None)