The approach is adapted from the sources https://nlpiation.medium.com/is-it-possible-to-do-sentiment-analysis-on-unlabeled-data-using-bert-feat-vader-experiment-357bba53768c and https://huggingface.co/monsoon-nlp/hindi-tpu-electra

# Installing and importing packages

In [1]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from simpletransformers.classification import ClassificationModel







[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sanleypeter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Init Plugin
Init Graph Optimizer
Init Kernel


# Reading the dataframe

In [2]:
df = pd.read_csv("../../Thesis-Code/lyrics_1051_cleaned.csv", index_col = [0])
temp2 = df

## Convert the sentiment label in the dataframe from {1, -1} to {1, 0}

In [3]:
def convert_label(inp):
    return 0 if inp == -1.0 else 1
  
temp2["sentiment"] = temp2["polarity"].apply(lambda x: convert_label(x))

In [4]:
temp2['sentiment'].value_counts()

1    587
0    463
Name: sentiment, dtype: int64

In [5]:
test_df = temp2[['eng_cleaned', 'sentiment']]

## Split the dataset into training, validation and test set

In [6]:
train, test = train_test_split(test_df, test_size=0.2, shuffle = True, random_state = 8)
train, val = train_test_split(train, test_size=0.25, random_state= 8)

In [7]:
train.shape

(630, 2)

In [8]:
val.shape

(210, 2)

In [9]:
test.shape

(210, 2)

# VADER analysis

In [10]:
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment_result(sent):
    scores = analyzer.polarity_scores(sent)
    
    if scores["neg"] > scores["pos"]:
        return 0

    return 1

train["vader_result"] = train["eng_cleaned"].apply(lambda x: vader_sentiment_result(x))
val["vader_result"] = val["eng_cleaned"].apply(lambda x: vader_sentiment_result(x))

# BERT

## Training

Please note that training the BERT model will take a long time. Training of even 5 epochs took around 2-3 hours.

In [11]:
# Load the BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# The dataset class
class TheDataset(torch.utils.data.Dataset):

    def __init__(self, reviews, sentiments, tokenizer):
        self.reviews    = reviews
        self.sentiments = sentiments
        self.tokenizer  = tokenizer
        self.max_len    = tokenizer.model_max_length
  
    def __len__(self):
        return len(self.reviews)
  
    def __getitem__(self, index):
        review = str(self.reviews[index])
        sentiments = self.sentiments[index]

        encoded_review = self.tokenizer.encode_plus(
            review,
            add_special_tokens    = True,
            max_length            = self.max_len,
            return_token_type_ids = False,
            return_attention_mask = True,
            return_tensors        = 'pt',
            padding               = "max_length",
            truncation            = True
        )

        return {
            'input_ids': encoded_review['input_ids'][0],
            'attention_mask': encoded_review['attention_mask'][0],
            'labels': torch.tensor(sentiments, dtype=torch.long)
        }

# Prepare the Train/Validation sets
train_dataset = TheDataset(
    reviews    = train.eng_cleaned.tolist(),
    sentiments = train.vader_result.tolist(),
    tokenizer  = tokenizer,
)

val_dataset = TheDataset(
    reviews    = val.eng_cleaned.tolist(),
    sentiments = val.vader_result.tolist(),
    tokenizer  = tokenizer,
)

# Load the BERT model
model = BertForSequenceClassification.from_pretrained("bert-large-uncased")

# Freeze BERT except (the 24th layer + the last pooler layer)
for name, param in model.bert.named_parameters():
    if ( not name.startswith('pooler') ) and "layer.23" not in name :
        param.requires_grad = False

# The function to get the accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define the training parameters
training_args = TrainingArguments(
    output_dir                  = "../../Models/sa-bert",
    num_train_epochs            = 5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 64,
    logging_strategy            = "steps",
    logging_steps               = 50,
    warmup_steps                = 500,
    weight_decay                = 0.01,
    save_strategy               = "steps",
    evaluation_strategy         = "steps",
    save_total_limit            = 2,
    load_best_model_at_end      = True,
    metric_for_best_model       = "accuracy",
    report_to                   = "tensorboard"
)

# Define the Huggingface Trainer object
trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_dataset,
    eval_dataset    = val_dataset,
    compute_metrics = compute_metrics
)

# Start pre-training!
trainer.train()
trainer.save_model("../../Models/sa-bert")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6588,0.590519,0.72381,0.839779,0.72381,1.0
100,0.6416,0.586808,0.72381,0.839779,0.72381,1.0
150,0.6563,0.588454,0.72381,0.839779,0.72381,1.0
200,0.6398,0.585371,0.72381,0.839779,0.72381,1.0


***** Running Evaluation *****
  Num examples = 210
  Batch size = 64
***** Running Evaluation *****
  Num examples = 210
  Batch size = 64
***** Running Evaluation *****
  Num examples = 210
  Batch size = 64
***** Running Evaluation *****
  Num examples = 210
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../../Models/sa-bert
Configuration saved in ../../Models/sa-bert/config.json
Model weights saved in ../../Models/sa-bert/pytorch_model.bin


## Prediction using model loaded from checkpoint

In [12]:
# Load the checkpoint
model = BertForSequenceClassification.from_pretrained("../../Models/sa-bert/checkpoint-500")

# Make the test set ready
test_set_dataset = TheDataset(
    reviews    = test.eng_cleaned.tolist(),
    sentiments = test.sentiment.tolist(),
    tokenizer  = tokenizer,
)

training_args = TrainingArguments(
    output_dir = "../../Models/sa-bert",
    do_predict = True
)

trainer = Trainer(
    model           = model,
    args            = training_args,
    compute_metrics =compute_metrics,
)

trainer.predict(test_set_dataset)

loading configuration file ../../Models/sa-bert/checkpoint-500/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ../../Models/sa-bert/checkpoint-500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

Al

PredictionOutput(predictions=array([[ 1.2506185 , -0.925302  ],
       [ 0.87508434,  0.0354313 ],
       [-1.3523662 ,  1.8445715 ],
       [-0.04889242,  0.6208542 ],
       [ 0.08563431, -0.01345127],
       [ 0.31334403,  0.12422984],
       [-1.0688744 ,  1.4775962 ],
       [-1.1636331 ,  1.576093  ],
       [ 0.19966732,  0.02564472],
       [-1.3416245 ,  1.8548858 ],
       [ 1.6542152 , -0.72417396],
       [ 1.3242314 , -0.9888401 ],
       [-1.2153888 ,  2.1986277 ],
       [-0.3131896 ,  0.8989139 ],
       [-1.0129565 ,  1.4591366 ],
       [-0.11946747,  0.6226268 ],
       [ 1.537325  , -0.9386224 ],
       [-0.6210219 ,  0.7767081 ],
       [-1.1675221 ,  1.6404217 ],
       [-0.5500312 ,  1.1580622 ],
       [-1.4410373 ,  2.0214405 ],
       [-1.9771341 ,  2.6809623 ],
       [ 0.6561237 , -0.6479999 ],
       [ 0.5837761 , -0.08491136],
       [ 0.83939075, -0.38641876],
       [-0.33812335,  0.43493193],
       [-0.46921942,  1.0689955 ],
       [-0.22221366,  1.18

# MBERT

## Splitting the dataset to training and test set for mBERT

In [13]:
test_df2 = temp2[['hin_cleaned','sentiment']]
train2, test2 = train_test_split(test_df, test_size=0.2, shuffle = True, random_state = 8)

## Training mBERT

In [14]:
bert = ClassificationModel('bert', 'bert-base-multilingual-uncased', num_labels=2, use_cuda=False, args={
    'reprocess_input_data': True,
    'use_cached_eval_features': False,
    'overwrite_output_dir': True,
    'num_train_epochs': 8,
    'silent': False
})
bert.train_model(train2)

loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /Users/sanleypeter/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/840 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Init Plugin
Init Plugin
Init Graph Optimizer
Init Graph Optimizer
Init Kernel
Init Kernel




Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-105-epoch-1/config.json
Model weights saved in outputs/checkpoint-105-epoch-1/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-105-epoch-1/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-105-epoch-1/special_tokens_map.json


Running Epoch 1 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-210-epoch-2/config.json
Model weights saved in outputs/checkpoint-210-epoch-2/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-210-epoch-2/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-210-epoch-2/special_tokens_map.json


Running Epoch 2 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-315-epoch-3/config.json
Model weights saved in outputs/checkpoint-315-epoch-3/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-315-epoch-3/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-315-epoch-3/special_tokens_map.json


Running Epoch 3 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-420-epoch-4/config.json
Model weights saved in outputs/checkpoint-420-epoch-4/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-420-epoch-4/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-420-epoch-4/special_tokens_map.json


Running Epoch 4 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-525-epoch-5/config.json
Model weights saved in outputs/checkpoint-525-epoch-5/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-525-epoch-5/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-525-epoch-5/special_tokens_map.json


Running Epoch 5 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-630-epoch-6/config.json
Model weights saved in outputs/checkpoint-630-epoch-6/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-630-epoch-6/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-630-epoch-6/special_tokens_map.json


Running Epoch 6 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-735-epoch-7/config.json
Model weights saved in outputs/checkpoint-735-epoch-7/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-735-epoch-7/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-735-epoch-7/special_tokens_map.json


Running Epoch 7 of 8:   0%|          | 0/105 [00:00<?, ?it/s]

Configuration saved in outputs/checkpoint-840-epoch-8/config.json
Model weights saved in outputs/checkpoint-840-epoch-8/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-840-epoch-8/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-840-epoch-8/special_tokens_map.json
Configuration saved in outputs/config.json
Model weights saved in outputs/pytorch_model.bin
tokenizer config file saved in outputs/tokenizer_config.json
Special tokens file saved in outputs/special_tokens_map.json


(840, 0.6729542425345807)

## Prediction and calculating the accuracy

In [15]:
result, model_outputs, wrong_predictions = bert.eval_model(test2)
bads = {}
for pred in wrong_predictions:
    if pred.label in bads:
        bads[pred.label] += 1
    else:
        bads[pred.label] = 1
print("wrong predictions")
print(str(len(wrong_predictions)) + ' wrong out of ' + str(len(test2)))
bads



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/210 [00:00<?, ?it/s]

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Init Plugin
Init Graph Optimizer
Init Kernel


Running Evaluation:   0%|          | 0/27 [00:00<?, ?it/s]

wrong predictions
82 wrong out of 210


{0: 67, 1: 15}

In [17]:
(1 - (82/210))*100       #accuracy of the model (1 - (wrong_pred / total_pred)) * 100

60.952380952380956