In [2]:
#Import required Libraries 

import torch 
import numpy as np 
import os 
import random
import pandas as pd 
from tqdm.notebook import tqdm 
from sklearn.model_selection import train_test_split 
from transformers import BertTokenizer 
from torch.utils.data import TensorDataset 
from transformers import BertForSequenceClassification 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 
from sklearn.metrics import f1_score 

from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

#Loading data from Google drive 
#from google.colab import drive 
#drive.mount('/content/drive') 
#os.chdir("ENTER LOCATION WHERE DATASET IS.") # EXAMPLE: /content/drive/My Drive/Sentiment_analysis_using_BERT 

device = torch.device('cpu')
#
if torch.cuda.is_available() and device == 'cuda':
    device = torch.device('cuda')
elif torch.has_mps:
    device = torch.device('mps')
else:
    device = torch.device('cpu')

#device = torch.device('cpu')

print(f'Device is: {device}')
df = pd.read_csv('../data/clean/NPS_NATL_subset.csv')

df.set_index('respid2', inplace = True) 
df.head()


Device is: cpu


Unnamed: 0_level_0,Location,Workforce,NPS® Breakdown,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
respid2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7053157,1763,Precinct,Promoter,2,staff in store in person close by when need them,"Staff in store, in person, close by when I nee...",0.0,0.0,was told the issue is more related to gmail th...,Was told the issue is more related to gmail th...,0.25,0.45
6984308,832,Precinct,Promoter,2,adieb anbari was beyond helpful he answered al...,Adieb Anbari was beyond helpful . He answered ...,0.1,0.6,adieb was great would recommend him to help an...,Adieb was great I would recommend him to help ...,0.8,0.75
6980586,288,Precinct,Promoter,2,quick and knowledgeable,Quick and knowledgeable,0.333333,0.5,xyxyxz,xyxyxz,0.0,0.0
6789897,168,Precinct,Promoter,2,he called back quickly within minutes and was ...,He called back quickly (within 5 minutes) and ...,0.414444,0.426667,appreciate his quick and knowledgeable response,Appreciate his quick and knowledgeable response.,0.333333,0.5
6896172,836,Precinct,Promoter,2,had really good experience thanks to your tech...,I had a really good experience thanks to your ...,0.45,0.4,ricky finished with his prior appt so he took ...,Ricky finished with his prior appt so he took ...,0.05,0.15


In [3]:
df.NPS_Code.value_counts()


0    10017
2     8901
1     7785
Name: NPS_Code, dtype: int64

In [4]:
df.head()

Unnamed: 0_level_0,Location,Workforce,NPS® Breakdown,NPS_Code,NPSCommentCleaned,NPSCommentLemmatised,NPSCommentPolarity,NPSCommentSubjectivity,OverallCommentCleaned,OverallCommentLemmatised,OverallCommentPolarity,OverallCommentSubjectivity
respid2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7053157,1763,Precinct,Promoter,2,staff in store in person close by when need them,"Staff in store, in person, close by when I nee...",0.0,0.0,was told the issue is more related to gmail th...,Was told the issue is more related to gmail th...,0.25,0.45
6984308,832,Precinct,Promoter,2,adieb anbari was beyond helpful he answered al...,Adieb Anbari was beyond helpful . He answered ...,0.1,0.6,adieb was great would recommend him to help an...,Adieb was great I would recommend him to help ...,0.8,0.75
6980586,288,Precinct,Promoter,2,quick and knowledgeable,Quick and knowledgeable,0.333333,0.5,xyxyxz,xyxyxz,0.0,0.0
6789897,168,Precinct,Promoter,2,he called back quickly within minutes and was ...,He called back quickly (within 5 minutes) and ...,0.414444,0.426667,appreciate his quick and knowledgeable response,Appreciate his quick and knowledgeable response.,0.333333,0.5
6896172,836,Precinct,Promoter,2,had really good experience thanks to your tech...,I had a really good experience thanks to your ...,0.45,0.4,ricky finished with his prior appt so he took ...,Ricky finished with his prior appt so he took ...,0.05,0.15


In [5]:
possible_labels = df.NPS_Code.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df['label'] = df.NPS_Code.copy()
df['text'] = df['NPSCommentCleaned'].astype(str)
df = df.filter(['respid', 'label', 'text'], axis=1)
df.head()

Unnamed: 0_level_0,label,text
respid2,Unnamed: 1_level_1,Unnamed: 2_level_1
7053157,2,staff in store in person close by when need them
6984308,2,adieb anbari was beyond helpful he answered al...
6980586,2,quick and knowledgeable
6789897,2,he called back quickly within minutes and was ...
6896172,2,had really good experience thanks to your tech...


In [6]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=17, stratify=df.label.values) 

df['data_type'] = ['not_set']*df.shape[0]   #CREATING A NEW COLUMN IN DATASET AND SETTING ALL VALUES TO 'not_set' 

df.loc[X_train, 'data_type'] ='train' #CHECKING AND SETTING data_type TO TRAIN 
df.loc[X_val, 'data_type'] = 'val' #CHECKING AND SETTING data_type TO VAL


df.groupby(['text', 'label', 'data_type']).count() #TO CHECK WHICH CATEGORY DATA IS IN WHICH data_type

text,label,data_type
aaron did great job taking me ahead of my appointment and solved my problem,2,train
aaron was helpful at checkin and barry was pleasant and knowledgeable at checkout,1,train
abby was very helpful and informative,2,train
abdullah bhatti who helped me was very helpful and accommodating in fixing my membership which was not in the system although had receipt showing my payment he also resolved the issue with my laptop and explained clearly what caused the problem,2,train
ability to help with serious scam that that affected my bank account,2,train
...,...,...
zero help with problem purchase,0,train
zero transparency was promised days took almost days to get my phone back,0,train
zoe is always pleasure to deal with always knowledgeable and always right,2,train
zoey was pleasure to work with she was extremely knowledgeable and showed great customer service skills,2,train


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#ENCODING DATA
encoded_data_train = tokenizer.batch_encode_plus(df[df.data_type=='train'].text.values,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                truncation=True,
                                                padding='max_length',
                                                max_length=256,
                                                 return_tensors='pt'
                                                )
encoded_data_val = tokenizer.batch_encode_plus(df[df.data_type=='val'].text.values,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                truncation=True,
                                                padding='max_length',
                                                max_length=256,
                                                 return_tensors='pt'
                                                )

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [8]:
#SETTING MODEL
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, output_attentions=False, output_hidden_states=False )
model.to(device)
#CREATING DATA LOADERS
dataloader_train = DataLoader(dataset_train,sampler = RandomSampler(dataset_train), batch_size= 32)
dataloader_val = DataLoader(dataset_val, sampler = RandomSampler(dataset_val), batch_size= 32)
      

#SETTING OPTIMIZERS

op = AdamW(model.parameters(),lr=1e-5,eps=1e-8)

epochs = 4

scheduler = get_linear_schedule_with_warmup(op, num_warmup_steps=10, num_training_steps=len(dataloader_train)*epochs)
#model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
#FUNCTION TO CALCULATE F1 SCORE
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

#FUNCTION FOR CALCULATING ACCURACY PER CLASS
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v:k for k,v in label_dict.items()}
    
    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(label_dict_inverse[label])
        print("accuracy ", len(y_preds[y_preds==label])/len(y_true))

#FUNCTION FOR MODEL EVALUATION
def evaluate(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [10]:
def predict(model, sentence):
    model.eval()
    
    result_np = []
    """
    inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
    with torch.no_grad():        
        outputs = model(**inputs)        
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        logits = logits.detach().cpu().numpy()
    """

    encoded_data = tokenizer.encode_plus(text = sentence,
                                            add_special_tokens=True,
                                            return_attention_mask=True,
                                            truncation=True,
                                            padding='max_length',
                                            max_length=256,
                                            return_tensors='pt'
                                            )

    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    outputs = model(encoded_data)
    logits = outputs[1]
    pred = np.argmax(logits, axis = 1)
    # print(classification_report(test_y, pred))
    return(pred)

    
    #labels_train = torch.tensor(df[df.data_type=='train'].label.values)    
    """
    img = test_images[idx, :, :, :]
    img = np.expand_dims(img, axis=0)
    img = torch.Tensor(img).permute(0, 3, 1, 2).to(device)
    # print(img.shape)
    pred = self(img)
    pred_np = pred.cpu().detach().numpy()
    for elem in pred_np:
        result_np.append(elem)
    """
    return result_np


In [11]:

for epoch in tqdm(range(1, 2)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc ='Epoch {:1d}'.format(epoch),
                        leave=False,
                       disable=False
                       )
    for batch in progress_bar:
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = { 'input_ids' : batch[0],
                 'attention_mask' : batch[1],
                 'labels' : batch[2]
                 }
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        op.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss':'{:.3f}'.format(loss.item()/len(batch))})
    
    # THIS SECTION OF CODE IS JUST FOR PRINTING VALUES AFTER EACH EPOCH.
    torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 score (weighted): {val_f1}')  

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/710 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
_, predictions, true_val = evaluate(dataloader_val)  #why _ ? reason behind this is evaluate function return 3 values and i don't require the 1st value i.e., loss_val_avg

accuracy_per_class(predictions, true_val)

2
accuracy  0.825016633399867
1
accuracy  0.2996575342465753
0
accuracy  0.9131086142322098


In [19]:
predict(model, "This is a terrible experience")

AttributeError: 

In [14]:
PATH = './BertModels/BERT_FT_epoch8.model'
#model = TheModelClass(*args, **kwargs)
#model.load_state_dict(torch.load(PATH))
#model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
from transformers import BertTokenizer

# Load the tokenizer of the "bert-base-cased" pretrained model
# See https://huggingface.co/transformers/pretrained_models.html for other models
tz = BertTokenizer.from_pretrained("bert-base-uncased")

# The senetence to be encoded
sent = "Let's learn deep learning!"

# Encode the sentence
encoded = tz.encode_plus(
    text=sent,  # the sentence to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length = 64,  # maximum length of a sentence
    pad_to_max_length=True,  # Add [PAD]s
    return_attention_mask = True,  # Generate the attention mask
    return_tensors = 'pt',  # ask the function to return PyTorch tensors
)

# Get the input IDs and attention mask in tensor format
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']