# Sentiment Analysis with Deep Learning using BERT

In [None]:
import torch
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
from tqdm.notebook import tqdm

In [None]:
import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the tweet/emotion CSV into a DataFrame.
df = pd.read_csv('/kaggle/input/emotion-detection-from-text/tweet_emotions.csv')

# Let's have a look at it
df.head()

In [None]:
df.content.iloc[-10:]

In [None]:
df.sentiment.value_counts()
#nocode is simply no clear emotions in this tweet

In [None]:
# Drop the sentiment classes we do not want to keep (the small classes, plus a
# stray 'sentiment' value that shows up inside the sentiment column itself).
excluded_sentiments = ['anger', 'boredom', 'enthusiasm', 'empty', 'sentiment']
df = df[~df.sentiment.isin(excluded_sentiments)]

In [None]:
df.sentiment.value_counts()
#class imbalance

In [None]:
#build dictionary, key: emotion, value: 
possible_labels = df.sentiment.unique()

In [None]:
# Map each sentiment name to an integer id, numbered in the order the names
# appear in possible_labels.
label_dict = {name: idx for idx, name in enumerate(possible_labels)}
print(label_dict)

In [None]:
# Build a numeric `label` column by looking each sentiment up in label_dict.
# `map` is used instead of `replace`: replacing strings with ints on an object
# column depends on pandas silently downcasting the result, which newer pandas
# versions deprecate. `map` produces the integer labels directly.
df['label'] = df.sentiment.map(label_dict)
df.head()

## Task 3: Training/Validation Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified split: 15% of the rows go to validation, and every class keeps
# the same proportion in both parts. The row index is what gets split, so the
# result can be written back onto df by label.
# random_state is fixed so that the same rows land in train/val on every run.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values,
)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]

In [None]:
df.head()

In [None]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
df.groupby(['sentiment', 'label', 'data_type']).count()
#group by using count

# **PREPROCESSING**

Import the libraries

In [None]:
# Install spaCy (run in terminal/prompt)
# import sys
# !{sys.executable} -m pip install spacy

# Download spaCy's  'en' Model
# !{sys.executable} -m spacy download en

# !pip install -U symspellpy

#for spell and slang correction
# !pip install gingerit
# from gingerit.gingerit import GingerIt

#for emoticons
!pip install emot --upgrade
import emot 
emot_obj = emot.core.emot() 

# from symspellpy.symspellpy import SymSpell, Verbosity
# import pkg_resources
import re, string, json
# import spacy

In [None]:

# Load the contraction/abbreviation lookup table. The file is opened in a
# `with` block so the handle is closed deterministically, and decoded as UTF-8
# (the encoding JSON is specified in) instead of the platform default.
with open('/kaggle/input/english-contractions/english_contractions.json.txt',
          'r', encoding='utf-8') as contractions_file:
    contraction_list = json.load(contractions_file)

# Character entities found in the tweets, with the text that replaces them.
character_entity = {'&lt;3': 'heart', '&amp;': 'and', '&quot;': ' quote '}

# A single lookup table; entity entries win if a key exists in both.
contraction_list = {**contraction_list, **character_entity}


def normalization_pipeline(sentences):
    """
    Run the whole text-normalization pipeline over a collection of sentences.

    Stage 1 simplifies punctuation, repetitions and whitespace; stage 2 expands
    contractions, translates emoticons and lowercases the text. A banner is
    printed before and after the run.

    Returns the list produced by stage 2.
    """
    banner = "##############################"
    print(banner)
    print("Starting Normalization Process")
    simplified = _simplify_punctuation_and_whitespace(sentences)
    normalized = _normalize_contractions(simplified)
    print("Normalization Process Finished")
    print(banner)
    return normalized

    
def _simplify_punctuation_and_whitespace(sentence_list):
    """
    Apply the per-sentence clean-up steps to every sentence.

    Steps, in order: URL replacement, @mention / #hashtag tagging, punctuation
    simplification, repeated-character reduction (which adds the emphasis
    markers) and whitespace normalization.

    Returns a new list with one cleaned string per input sentence.
    """
    print("Replacing -URL- , Replacing @MENTION and #HASHTAG, Reducing character repetitions, ")
    print("Simplifying punctuation, Removing whitespaces")

    # Order matters: each step works on the output of the previous one.
    steps = (
        _replace_urls,
        _mention_hash,
        _simplify_punctuation,
        _reduce_repetitions,
        _normalize_whitespace,
    )

    norm_sents = []
    for sentence in tqdm(sentence_list):
        for step in steps:
            sentence = step(sentence)
        norm_sents.append(sentence)
    return norm_sents


def _replace_urls(text):
    """
    Replace every URL in `text` with the placeholder "-URL-".

    Both "http(s)://..." and bare "www...." forms are matched. The input is
    coerced with str() first, like the other pipeline steps do, so a
    non-string cell (for example a NaN read back from a CSV) does not raise.

    Returns the text with URLs replaced.
    """
    url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    return re.sub(url_regex, "-URL-", str(text))


def _mention_hash(in_str):
    """
    Tag mentions and hashtags.

    Every "@word" becomes "@MEN" (the user name is dropped) and every "#"
    becomes "#HAS " (the hashtag text is kept as a separate word).
    Note: a lone "@" followed by a space is left untouched, while any "#" is
    tagged regardless of what follows it.

    Returns the tagged string with leading/trailing whitespace stripped.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    tagged = re.sub(r'@\w+', '@MEN', str(in_str))
    tagged = tagged.replace('#', '#HAS ')
    return tagged.strip()


def _simplify_punctuation(text):
    """
    Collapse repeated punctuation.

    A run of two or more identical '!', '?', ',' or ';' is cut down to exactly
    two and followed by the marker ' <-EMPP' (e.g. '!!!!!' -> '!! <-EMPP').
    Dots are the exception: any run of two or more becomes '...' with no marker.

    Returns the simplified string.
    """
    with_markers = re.sub(r'([!?,;])\1+', r'\1\1 <-EMPP', str(text))
    return re.sub(r'\.{2,}', r'...', with_markers)

def _reduce_repetitions(text):
    """
    Mark emphasised words and shorten exaggerated character repetitions.

    Each whitespace-separated token is inspected exactly once, in place:
      * a token longer than 4 characters that is all upper-case and does not
        start with punctuation gets ' <-EMPU' appended;
      * a token containing a character repeated 3+ times in a row (see
        _is_EMP_word) gets ' <-EMPW' appended.
    A token that qualifies for both receives ' <-EMPU <-EMPW'.
    Finally every run of a repeated word character in the whole string is
    cut down to two characters.

    Examples:
        woooooords -> woords <-EMPW
        dooorwaaay -> doorwaay <-EMPW
        HELLO      -> HELLO <-EMPU

    Note: digits count as word characters, so '1000' is treated like any other
    token with a repeated character.

    Returns the processed string; whitespace between tokens is preserved.
    """
    def _tag(match):
        # Called once per token, so a word that occurs several times, or that
        # is a substring of a longer word, is never tagged more than once.
        token = match.group(0)
        tagged = token
        if len(token) > 4 and token.isupper() and token[0] not in string.punctuation:
            tagged += ' <-EMPU'
        if _is_EMP_word(token):
            tagged += ' <-EMPW'
        return tagged

    marked = re.sub(r'\S+', _tag, str(text))
    # \1\1 keeps only two consecutive copies of a repeated character.
    return re.sub(r'(\w)\1+', r'\1\1', marked)


def _is_EMP_word(word):
    """
    Return True if `word` contains the same character 3 times in a row.

    The scan stops with False as soon as a punctuation character is met at
    any position after the first, because this check is only meant for words
    (markers such as '<-EMPP' must not be tagged). Words shorter than two
    characters are never emphasised.
    """
    run_length = 1
    for previous, current in zip(word, word[1:]):
        if current in string.punctuation:
            return False
        if current == previous:
            run_length += 1
            if run_length >= 3:
                return True
        else:
            run_length = 1
    return False


def _normalize_whitespace(text):
    """
    Collapse duplicated whitespace characters.

    The literal sequence '//t' is first turned into a tab character
    (NOTE(review): possibly meant to catch an escaped tab; confirm). Then any
    run of the same whitespace character (space, newline, carriage return or
    tab) is reduced to a single one. Finally leading and trailing spaces
    (spaces only) are stripped.

    Returns the normalized string.
    """
    with_tabs = re.sub(r"//t", r"\t", str(text))
    # The back-reference makes only runs of the *same* character collapse.
    collapsed = re.sub(r"([ \n\r\t])\1+", r"\1", with_tabs)
    return collapsed.strip(" ")
    
    
#Substitution of contractions:  -----------------------------------------------------------------------------------------------      
def _normalize_contractions(sentence_list):
    """
    Normalize every sentence in `sentence_list` and return the results as a list.

    Each sentence goes through _normalize_contractions_slang_emoji_entity,
    which expands entries of the module-level `contraction_list` (it must be
    loaded before this is called), translates emoticons to their meaning and
    lowercases the text, markers such as EMPW / EMPU / EMPP / URL included.
    """
    print("Normalizing contractions, abbreviations, slangs, emojis, character entities")
    return [
        _normalize_contractions_slang_emoji_entity(sentence)
        for sentence in tqdm(sentence_list)
    ]

def _normalize_contractions_slang_emoji_entity(text):
    """
    Normalize a single sentence.

    1. Every whitespace-separated token whose lower-cased form is a key of the
       module-level `contraction_list` is replaced by the mapped value.
    2. (disabled) SMS-slang correction with gingerit; it was too slow to run
       over the whole dataset.
    3. Every emoticon reported by `emot_obj.emoticons` is replaced by its
       meaning.
    4. The result is lower-cased.

    Returns the normalized sentence.
    """
    # Part 1: contractions, abbreviations and character entities.
    # NOTE(review): str.replace substitutes every occurrence of the token in the
    # sentence, including occurrences inside longer words. Confirm this is
    # intended before relying on it for short keys.
    lookup = contraction_list
    for token in text.split():
        key = token.lower()
        if key in lookup:
            text = text.replace(token, lookup[key])

    # Part 2 is intentionally skipped (see docstring).
    sentence = text

    # Part 3: emoticons -> meaning. 'value' and 'mean' are index-aligned lists.
    found = emot_obj.emoticons(sentence)
    for position, emoticon in enumerate(found['value']):
        sentence = sentence.replace(emoticon, found['mean'][position])

    # Part 4: lowercase everything.
    return sentence.lower()



#uncomment for data normalization (else just load from the data)

In [None]:
#assessment and examples:
# # original_examples = ['hi @someone WATCH me #proud :) ;) ...... i h8 it bt w8 !!!!!  <3  wanna go &amp; &lt;3 tHeRe  &quot; bcs my finls clooooose &quot;bananas&quot; &amp; ']
# original_examples=df.content[0:10]
# preprocessed_examples = normalization_pipeline(original_examples)
# for example_index,example in enumerate(preprocessed_examples):
# #     print(original_examples[example_index])
#     print(original_examples.values[example_index])
#     print(example)


    
#run preprocessing
# df_original=df
# df.content=normalization_pipeline(df.content.values ) #about 10 minutes to run

#save
# df.to_csv('df_processed.csv',index=False)

#load
df = pd.read_csv('/kaggle/input/english-contractions/df_processed.csv')



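Note: the cells below may raise errors. In particular, the next cell calls `spacy_process`, which is not defined in any of the cells above (the spaCy import is commented out).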

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# NOTE(review): `spacy_process` is not defined in any cell above; it has to be
# provided before this cell can run.
# Tokenize every tweet. The series is iterated directly, so the first and last
# rows are included and the result does not depend on the index labels.
tokenizee = [spacy_process(text) for text in tqdm(df.content)]

# Overview of the preprocessed data: how often each token occurs.
words = Counter()
for tokens in tokenizee:
    words.update(tokens)

# Tokens ordered from most to least frequent.
sorted_words = [token for token, _ in words.most_common()]
print(f"Number of different Tokens in our Dataset: {len(sorted_words)}")
print(sorted_words[:100])


# How many of the most common tokens are needed to cover 80% of all occurrences?
count_occurences = sum(words.values())
accumulated = 0
counter = 0
while counter < len(sorted_words) and accumulated < count_occurences * 0.8:
    accumulated += words[sorted_words[counter]]
    counter += 1

# Skip the report on an empty corpus, where the percentages would divide by zero.
if words:
    print(f"The {counter * 100 / len(words)}% most common words "
          f"account for the {accumulated * 100 / count_occurences}% of the occurrences")

# Frequencies of the (up to) 100 most common tokens; sizing the x-axis from the
# data keeps the plot working when there are fewer than 100 distinct tokens.
top_words = sorted_words[:100]
plt.bar(range(len(top_words)), [words[w] for w in top_words])
plt.show()

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                         #all lower case
                                         do_lower_case = True,
                                         )

Try different `max_length` values.
Tweets are short, so a small value is enough; the cell below uses `max_length = 40`.

In [None]:
#batch using multiple strings and convert them into tokens
def _encode_split(split_name):
    """
    Tokenize the rows of df whose data_type equals `split_name`.

    All texts of the split are encoded in one batch: special tokens are added,
    sequences are truncated to at most 40 tokens and padded to the longest
    sequence of the batch, and PyTorch tensors are returned.

    Returns (encoding, labels): the tokenizer output and a tensor holding the
    `label` column of the same rows.
    """
    subset = df[df.data_type == split_name]
    encoding = tokenizer.batch_encode_plus(
        subset.content.values,
        add_special_tokens=True,      # mark where each sentence begins and ends
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=40,                # raise this for longer texts
        return_tensors='pt',          # 'pt' = PyTorch tensors
    )
    return encoding, torch.tensor(subset.label.values)


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

preprocess the data lemmatization

In [None]:
for eachS in range(0,5):
    print(tokenizer.decode(input_ids_train[eachS]))


In [None]:
dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                              attention_masks_val,
                              labels_val)

In [None]:
len(dataset_train)

In [None]:
len(dataset_val)

## Task 5: Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
#each sequence will be dealt separate classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    #the other cased one is larger and takes more computation power
    #we want to fine tune the parts we need
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
                                     )
#about 450 MB needs to be fetched and loaded into memory
#BERT takes in text and encodes it in a meaningful way, based on the huge corpus it was initially trained on
#we are just adding a classification layer on top of it, with one output class per entry in label_dict

## Task 6: Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler     #will use for training
 
                    #will use for our validation dataset, gradients are fixed

BATCH SIZE CAN BE INCREASED:

In [None]:
batch_size = 8   # kept very small for low-spec machines; can be raised to 32

# Training batches are drawn in random order so the model does not learn
# anything from the order of the samples.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# Validation only measures the model, so the samples are read in their stored
# order: shuffling adds nothing here, and a fixed order makes the returned
# predictions comparable between runs. The batch can be larger because no
# backpropagation happens during evaluation.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32,
)

## Task 7: Setting Up Optimizer and Scheduler

In [None]:
#Optimizer defines our learning rate and how it changed throught each epoch
from transformers import AdamW, get_linear_schedule_with_warmup
#Adam with weight decay, stochastic optimizer

LEARNING RATE:

In [None]:
# Optimizer over all model parameters.
# NOTE(review): `AdamW` is the class imported from `transformers` above; newer
# transformers releases deprecate it in favour of `torch.optim.AdamW`. Confirm
# against the installed version.
optimizer = AdamW(
                model.parameters(),
                lr = 1e-5,         # learning rate; author's note: recommended 2e-5 > 5e-5
                eps = 1e-8,
                )

NUMBER OF EPOCHS

In [None]:
epochs = 2

schedular = get_linear_schedule_with_warmup(
        optimizer,     #Adam
        num_warmup_steps = 0,
        num_training_steps = len(dataloader_train)*epochs
        )

## Task 8: Defining our Performance Metrics

Accuracy metric approach originally used in accuracy function in [this tutorial](https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification).

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import f1_score

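Example: the model outputs one score per class, e.g. `preds = [0.9, 0.05, 0.05, 0, 0, 0]`.

We want to convert it to the single most likely class, i.e. `[1, 0, 0, 0, 0, 0]` (class index 0, obtained with `argmax`).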

In [None]:
# The F1 score is used because of the class imbalance:
# accuracy alone would give skewed results.

def f1_score_func(preds, labels):
    """
    Return the weighted F1 score of the predicted classes against `labels`.

    preds  -- per-class scores, one row per sample; the predicted class of a
              row is the index of its largest score.
    labels -- array of true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')
# `average` can be changed from 'weighted' to 'macro'

In [None]:
def accuracy_per_class(preds, labels):
    """
    Print, for every class present in `labels`, how many of its samples were
    predicted correctly, as "correct/total".

    preds  -- per-class scores, one row per sample (argmax gives the class).
    labels -- array of true class ids; names are looked up in label_dict.
    """
    id_to_name = {idx: name for name, idx in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Boolean mask selecting the samples that truly belong to this class.
        in_class = actual == class_id
        n_correct = np.count_nonzero(predicted[in_class] == class_id)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy:  {n_correct}/{n_total}\n')

## Task 9: Creating our Training Loop

Approach adapted from an older version of HuggingFace's `run_glue.py` script. Accessible [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128).

In [None]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#send model to device we are using
model.to(device)
print(device)

In [None]:
torch.cuda.is_available()

In [None]:
def evaluate(dataloader_val):
    """
    Run the global `model` over `dataloader_val` without computing gradients.

    Each batch is expected to hold (input_ids, attention_mask, labels), in
    that order. The model is switched to eval mode and left in it.

    Returns (average loss per batch, logits of all samples, true labels of all
    samples); the last two are NumPy arrays concatenated over the batches.
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in batch)
        model_inputs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        with torch.no_grad():
            outputs = model(**model_inputs)

        # With labels supplied, the first two outputs are the loss and the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logits_per_batch.append(batch_logits.detach().cpu().numpy())
        labels_per_batch.append(model_inputs['labels'].cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals


# BEHOLD THE TRAINING!!!

In [None]:
# Fine-tuning loop: one pass over dataloader_train per epoch, followed by an
# evaluation on dataloader_val.
print(f'total epochs:{epochs}')
for epoch in tqdm(range(1, epochs+1)):
    print(f'epoch # {epoch}')
    model.train()

    # Sum of the batch losses of this epoch (averaged after the inner loop).
    loss_train_total = 0

    # Inner progress bar, so it is visible how far the epoch is (and whether
    # the run is still alive).
    progress_bar = tqdm(dataloader_train,
                        desc = f'Epoch {epoch}',
                        leave = False,   # remove the bar once the epoch is done
                        disable = False
                       )

    for batch in progress_bar:
        # Reset the gradients accumulated by the previous batch.
        model.zero_grad()

        # Move the batch to the same device as the model (needed for GPU use).
        batch = tuple(b.to(device) for b in batch)

        # Each batch holds (input_ids, attention_mask, labels), in that order.
        inputs = {
            'input_ids':         batch[0],
            'attention_mask':    batch[1],
            'labels' :           batch[2]
        }

        # Unpack the dictionary into keyword arguments of the model call.
        outputs = model(**inputs)

        # The loss is the first element of the outputs.
        loss = outputs[0]
        loss_train_total += loss.item()
        # Backpropagate: compute the gradients of the loss.
        loss.backward()

        # Clip the gradients to a maximum total norm of 1.0 so that a single
        # batch cannot produce an exceptionally large update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule (the
        # scheduler is stepped once per batch).
        optimizer.step()
        schedular.step()

    # Saving a checkpoint per epoch is currently disabled:
#     torch.save(model.state_dict(), f'/kaggle/working/Bert_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    # Average training loss per batch for this epoch.
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluate on the validation set after every epoch; this is what reveals
    # over-training (a model that no longer generalizes).
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

        
#Cpu takes 40 minutes
#gpu takes 30 seconds

## Task 10: Loading and Evaluating our Model

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [None]:
model.to(device)
pass   #to not get alot of text output

In [None]:
model.load_state_dict(
    torch.load('Models/finetuned_bert_epoch_1_gpu_trained.model',
              map_location = torch.device('cpu')))

In [None]:
_, prediction, true_vals = evaluate(dataloader_val)
#7 batches
#will take almost 2 minutes

In [None]:
accuracy_per_class(prediction, true_vals)

In [None]:
#To summarize:
# model was trained on Google colab --GPU Instance(k80)
# batch size  = 32
# epoch = 10