# Install and import neccessary Libraries

In [30]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install tqdm
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Import Libraries**

In [31]:
import pandas as pd
import numpy as np
import random
import re
import torch
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments
from transformers import pipeline
from transformers import Trainer
from transformers import AutoTokenizer
from datasets import DatasetDict, Dataset
from tqdm.auto import tqdm
import evaluate

# Load and prepare data for model input

In [32]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
#load the train data
train = pd.read_csv("/content/drive/MyDrive/submission/datasets/train.csv")

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3123 entries, 0 to 3122
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   train_id   3123 non-null   object
 1   comment    3123 non-null   object
 2   sentiment  3123 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 73.3+ KB


In [35]:
# Replace the values in the 'sentiment' column
mapping = {1: "positive", -1: "negative", 0: "neutral"}
train['sentiment'] = train['sentiment'].replace(mapping)

In [36]:
# Drop the 'id' column
train = train.drop('train_id', axis=1)

In [37]:
def clean_text(df, column_name):
    # Remove special characters and convert to lowercase
    df[column_name] = df[column_name].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+|[^a-zA-Z0-9\s]', '', x.lower()))
    return df

In [38]:
# Clean the 'comment' column
train = clean_text(train, 'comment')

In [39]:
#check clean trainset
train.tail()

Unnamed: 0,comment,sentiment
3118,mwananchinews serikali haina ela ya bundle ya ...,neutral
3119,tunatangaza leo telecom3 money bila tozo hatua...,positive
3120,cloudsmedialive mbona maelezo mengi sana mnges...,negative
3121,hello telecom3tanzania kwanini nikiulizia sali...,negative
3122,sina utashi juu ya upangaji wa vifurushi husus...,negative


In [40]:
#creating dataset object
train_dataset = Dataset.from_dict({
    "text": train['comment'],
    "label": train['sentiment']

})

In [41]:
#Creating DatasetDict object
raw_data = DatasetDict({
    "train": train_dataset
})

In [42]:
# let's set a seed for reproducibility
SEED = 42
# let's create a validation set (20%) for finding the model with best generalizability
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
train_indices, val_indices = next(skf.split(raw_data['train']['text'], raw_data['train']['label']))

train_data = Dataset.from_dict(raw_data['train'][train_indices])
val_data = Dataset.from_dict(raw_data['train'][val_indices])

In [43]:
# Adding a validation set
Train_datasets = DatasetDict({
    'train': train_data,
    'validation': val_data,
})

In [44]:
# get class label information from dataset
num_labels = len(Train_datasets['train'].unique('label'))
id2label = {}
label2id = {}
for label_id,label in enumerate(Train_datasets['train'].unique('label')):
    id2label[label_id] = label
    label2id[label] = label_id

print(f"NUM_LABELS: {num_labels}")
print(f"ID2LABEL: {id2label}")
print(f"LABEL2ID: {label2id}")

NUM_LABELS: 3
ID2LABEL: {0: 'negative', 1: 'neutral', 2: 'positive'}
LABEL2ID: {'negative': 0, 'neutral': 1, 'positive': 2}


In [45]:
def preprocess(batch):

    batch['text'] = batch['text']
    # get one-hot encoded labels for each example in batch
    batch['labels'] = [[1.0 if label2id[label] == j else 0.0 for j in range(len(label2id))] for label in batch['label']]

    return batch

preprocessed_datasets = Train_datasets.map(preprocess, batched=True)
preprocessed_datasets = preprocessed_datasets.remove_columns(['label'])
preprocessed_datasets

Map:   0%|          | 0/2498 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2498
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 625
    })
})

In [46]:
#Initializing a huggingface model for tokenization and finetuning
CHECKPOINT = "mbeukman/xlm-roberta-base-finetuned-swahili-finetuned-ner-swahili"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenizer

XLMRobertaTokenizerFast(name_or_path='mbeukman/xlm-roberta-base-finetuned-swahili-finetuned-ner-swahili', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [47]:
# tokenize out datasets with truncation
tokenized_datasets = preprocessed_datasets.map(lambda batch: tokenizer(batch['text'], truncation=True), batched=True, remove_columns=['text'])
tokenized_datasets

Map:   0%|          | 0/2498 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2498
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 625
    })
})

# Model training

In [48]:
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f08c1de1fd0>

In [49]:
#using huggingface open source model
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, problem_type='multi_label_classification',
                                                           num_labels=len(label2id), id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at mbeukman/xlm-roberta-base-finetuned-swahili-finetuned-ner-swahili were not used when initializing XLMRobertaForSequenceClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at mbeukman/xlm-roberta-base-finetuned-swahili-finetuned-ner-swahili and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', '

In [50]:
# this function calculates accuracy per label in a prediction instead of per prediction
def samples_accuracy_score(y_true, y_pred):
    return np.sum(y_true==y_pred) / y_true.size

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    # we sigmoid all logits for multilabel metrics
    predictions = torch.nn.functional.sigmoid(torch.Tensor(logits))
    # we set threshold to 0.50 to classify positive >= 0.50 and negative < 0.50
    predictions = (predictions >= 0.50).int().numpy()



    # overall accuracy measures accuracy of each true label list and prediction list
    overall_accuracy = accuracy_score(labels, predictions)
    # sample accuracy measures accuracy of each true label in a true label list and prediction in prediction list
    samples_accuracy = samples_accuracy_score(labels, predictions)
     # overall f1 measures macro f1 of each true label list and prediction list, ignoring zero division warnings
    overall_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    # samples f1 measures f1 of each true label in a true label list and prediction in prediction list, ignoring zero division warnings
    samples_f1 = f1_score(labels, predictions, average='samples', zero_division=0)
    return {
        'overall_accuracy': overall_accuracy,
        'samples_accuracy': samples_accuracy,
        'overall_f1': overall_f1,
        'samples_f1': samples_f1,
    }

In [55]:
training_args = TrainingArguments(
    seed=SEED,                          # seed for reproducibility
    output_dir='results',               # output directory to store epoch checkpoints
    num_train_epochs=6,                 # number of training epochs
    optim='adamw_torch',                # default optimizer as AdamW
    per_device_train_batch_size=4,     # 4 train batch size to speed up training
    per_device_eval_batch_size=4,      # 4 eval batch size to speed up evaluation
    evaluation_strategy='epoch',        # set evaluation strategy to each epoch instead of default 500 steps
    save_strategy='epoch',              # set saving of model strategy to each epoch instead of default 500 steps
    load_best_model_at_end=True,        # load the best model with lowest validation loss
    report_to='none',                   # suppress third-party logging
    learning_rate = 4e-5,               # learning rate
    weight_decay=1e-6,
)

In [56]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [57]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [58]:
#Train Model
trainer.train()

Epoch,Training Loss,Validation Loss,Overall Accuracy,Samples Accuracy,Overall F1,Samples F1
1,0.4422,0.678397,0.568,0.739733,0.476346,0.573333
2,0.4324,0.678732,0.5408,0.7216,0.513365,0.5408
3,0.3989,0.940006,0.5984,0.741333,0.528519,0.599467
4,0.2514,1.309781,0.5648,0.714133,0.491898,0.570133
5,0.1702,1.468725,0.5536,0.7072,0.49802,0.557867
6,0.0979,1.520784,0.5648,0.714133,0.506281,0.568


TrainOutput(global_step=3750, training_loss=0.2938818069458008, metrics={'train_runtime': 803.8419, 'train_samples_per_second': 18.645, 'train_steps_per_second': 4.665, 'total_flos': 476881763977332.0, 'train_loss': 0.2938818069458008, 'epoch': 6.0})

# Prepare a submission file

In [59]:
#create a classifier object
telecom_sentiment_classifier = pipeline(task='text-classification', model=model, tokenizer=tokenizer,
                                                device=torch.cuda.current_device(), top_k=None)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [60]:
#load test data
test = pd.read_csv("/content/drive/MyDrive/submission/datasets/test.csv")

In [61]:
test_df = clean_text(test, 'comment')

In [62]:
predictions = telecom_sentiment_classifier(test_df['comment'].tolist())

In [63]:
#create new columns to test dataset
predicted_labels = [prediction[0]['label'] for prediction in predictions]
confidence_scores = [prediction[0]['score'] for prediction in predictions]
test_df['predicted_label'] = predicted_labels
test_df['confidence_score'] = confidence_scores

In [64]:
#dropping unneccesary columns and rename the sentiment column
test_df.drop(columns=['comment', 'confidence_score'], inplace=True)
test_df.rename(columns={'predicted_label': 'sentiment'}, inplace=True)

In [65]:
#Return the values in 'sentiment' column to numerals
mapping = {"positive" : 1, "negative": -1, "neutral": 0}

test_df['sentiment'] = test_df['sentiment'].replace(mapping)

In [66]:
#check the new dataset
test_df.head()

Unnamed: 0,test_id,sentiment
0,53bb8dff-4a07-4aac-a6f2-ca552389652c,-1
1,a01e1c68-2f77-4073-87c5-d7e5d75c8a79,0
2,9a5e4b98-ac29-4138-bc28-fc878d5d8269,-1
3,9ef55f06-7748-4e02-85c7-fa3e46bb02dd,0
4,d93b4084-3000-4026-bae0-ecb65564dbbd,-1


In [None]:
#save the file for submission
test_df.to_csv("/content/drive/MyDrive/submission/datasets/submission.csv", index=False)