# **Loading Libraries**

In [27]:
# Package Installation
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install emoji -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/421.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/421.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
# Standard Library Imports
import re
import string

# Third-party Library Imports
import numpy as np
import torch
import pandas as pd
import emoji

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# **Data Preparation**

In [39]:
# Dataset is loaded into a dataframe
df = pd.read_csv("/content/train_data.csv")
test_df = pd.read_csv("/content/test_data.csv")

#Information about the structure of both train and test dataset is printed
df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   campaign_id          5500 non-null   object
 1   comment_id           5500 non-null   int64 
 2   comment_description  5497 non-null   object
 3   sentiment            5500 non-null   object
dtypes: int64(1), object(3)
memory usage: 172.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187 entries, 0 to 1186
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   campaign_id          1187 non-null   int64 
 1   comment_id           1187 non-null   int64 
 2   comment_description  1187 non-null   object
dtypes: int64(2), object(1)
memory usage: 27.9+ KB


# **Preprocessing**

### **Remove Empty Tweets**

In [40]:
# blank tweets (tweets with no comment_description) are identified
blank_tweets = df[df['comment_description'].isnull() | (df['comment_description'] == '')]

# The rows and indices of the blank tweets are printed
# This provides insight into the location and content of blank tweets within the dataset.
for index, row in blank_tweets.iterrows():
    print("Index:", index)
    print("Row:", row)
    print()

Index: 1090
Row: campaign_id                       2600_2
comment_id             17979891167587327
comment_description                  NaN
sentiment                       Positive
Name: 1090, dtype: object

Index: 5095
Row: campaign_id                         2333
comment_id             18001116857040003
comment_description                  NaN
sentiment                       Positive
Name: 5095, dtype: object

Index: 5284
Row: campaign_id                         2333
comment_id             17963438963655825
comment_description                  NaN
sentiment                       Positive
Name: 5284, dtype: object



In [41]:
"""
These blank tweets are removed here. We remove rows with empty values
and empty string in the 'comment_description' column. Afterwards the
index is reset.
"""
# Rows with empty or missing values in the 'comment_description' column are removed
# This step ensures that only tweets with valid descriptions are retained in the dataset.
df = df.dropna(subset=['comment_description'])
df = df[df['comment_description'] != '']

# The index is reset to maintain continuity after removing rows
df = df.reset_index(drop=True)
print(df.count())

campaign_id            5497
comment_id             5497
comment_description    5497
sentiment              5497
dtype: int64


### **Remove Duplicate Tweets**

In [42]:
"""
This function helps us in finding duplicate tweets in the training set.
It shortens each tweets to its first 100 words. If tweet is in string format,
then it returns first 100 chracters otherwise returns NaN.
"""

def duplicate_tweets(tweet):
    if isinstance(tweet, str):
        shortened_tweet = tweet[:100]
    else:
        shortened_tweet = np.nan
    return shortened_tweet

# First 100 characters of each row of comment_description are saved in a new column
df['first100charactersoftweets'] = df['comment_description'].apply(duplicate_tweets)

# Duplicates are dropped based on the first 100 characters and newly created column is deleted
df = df.drop_duplicates(subset='first100charactersoftweets', keep="first")
df = df.drop(columns=['first100charactersoftweets'])

print(df.count())

campaign_id            4265
comment_id             4265
comment_description    4265
sentiment              4265
dtype: int64


# **Removing URL, Handles, Punctuation**

In [43]:
"""
In this section, we do the basic preprocessing of the individual tweets.
URLs, mentions, handles, punctuation are removed from the tweet text.
This step is essential for cleaning the text data before further analysis or modeling.
"""

def preprocess_tweet(text):
    text = emoji.demojize(text)
    text = re.sub(r'https?://\S+', '', text) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove mentions/handles
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = text.lower() #Lowercase the text
    return text

In [44]:
#Visualization of dataset before preprocessing
df.head(10)

Unnamed: 0,campaign_id,comment_id,comment_description,sentiment
0,2212,17908351952371091,لخسارة الوزن الزائد والكرش بمدة قياسية مع عدم ...,Negative
1,2217,17935944230085744,🔥🔥🔥,Positive
2,2215S,17899518356507020,This is so good😍 would be great it If you add ...,Negative
3,2214,18014766136389857,😍,Positive
4,2203,17924318627206870,طبق رائع ومميز تبارك الرحمن تسلم ايدك يارب 😍,Positive
5,2217,18032092822349781,@zainab.aleqabi مفعل هذا التطبيق بالعراق ؟,Positive
6,2215S,18122714995274518,@muhamyat_alfreej,Positive
7,2215S,17947316074891383,ابدعتي ام شريف,Positive
8,2215C,18201796876154595,شكلو و قوامو روعة سلم ايديكي❤️❤️,Positive
9,2211,17955037840709480,❤️❤️❤️❤️❤️❤️❤️,Positive


In [37]:
# preprocess_tweet function is applied to each row of the dataframe
# This ensures that each tweet's text is cleaned and ready for further processing.
df['comment_description'] = df['comment_description'].apply(preprocess_tweet)

In [36]:
df.head(10)

Unnamed: 0,campaign_id,comment_id,comment_description,sentiment
0,2212,17908351952371091,لخسارة الوزن الزائد والكرش بمدة قياسية مع عدم ...,Negative
1,2217,17935944230085744,firefirefire,Positive
2,2215S,17899518356507020,this is so goodsmilingfacewithhearteyes would ...,Negative
3,2214,18014766136389857,smilingfacewithhearteyes,Positive
4,2203,17924318627206870,طبق رائع ومميز تبارك الرحمن تسلم ايدك يارب smi...,Positive
5,2217,18032092822349781,aleqabi مفعل هذا التطبيق بالعراق ؟,Positive
6,2215S,18122714995274518,,Positive
7,2215S,17947316074891383,ابدعتي ام شريف,Positive
8,2215C,18201796876154595,شكلو و قوامو روعة سلم ايديكيredheartredheart,Positive
9,2211,17955037840709480,redheartredheartredheartredheartredheartredhea...,Positive


# **Language Detection and Translation**

In this cell, I have provided some different efficient approaches to tackle this task which can help improve the model in future.
1. We can detect the language of input text using **langdetect** library. If the detected language is Arabic, we translate it to english using **Google Translate API** or **googletrans** library. This way, all the elements of dataset will be in one language and we can train BERT model instead of mBERT.
2. We can make customized preprocessing based on the detected langauge. AraBERT tokenizer can be used for Arabic text instead of mBERT tokenizer.

# **Model Fine-tuning**

In [9]:
# Dataset is split into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2)

# The count of samples in the training set is printed.
print("Training Set Count:")
print(train_df.count())

# The count of samples in the validation set is printed.
print("\n\nValidation Set Count:")
print(val_df.count())

Training Set Count:
campaign_id            3412
comment_id             3412
comment_description    3412
sentiment              3412
dtype: int64


Validation Set Count:
campaign_id            853
comment_id             853
comment_description    853
sentiment              853
dtype: int64


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Custom dataset class for sentiment analysis using BERT tokenizer.
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        text = str(self.data.comment_description.iloc[index])
        # The text is tokenized using BERT tokenizer
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True
        )

        label = self.data.sentiment.iloc[index]

        if label == 'Positive':
          sentiment_id = 1
        else:
          sentiment_id = 0

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(sentiment_id, dtype=torch.long)
            #'targets': torch.tensor(self.data.sentiment.iloc[index])
        }

    def __len__(self):
        return self.len



In [15]:
# DataFrame is converted to Dataset for training and validation
train_dataset = SentimentDataset(train_df, tokenizer)
val_dataset = SentimentDataset(val_df, tokenizer)

# Lengths of the training and validation datasets are printed
print(len(train_dataset))
print(len(val_dataset))

3412
853


In [26]:
# Model is defined here
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir = '/results',          # output directory
    num_train_epochs = 10,              # total number of training epochs
    per_device_train_batch_size = 32,  # batch size per device during training
    per_device_eval_batch_size = 32,   # batch size for evaluation
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = '/logs',            # directory for storing logs
    logging_steps = 10,
    #evaluation_strategy = 'epoch'
)

# Trainer object for training the model
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

# Model is trained
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.5458
20,0.5365
30,0.5751
40,0.4909
50,0.5183
60,0.511
70,0.498
80,0.5092
90,0.4811
100,0.4204


Checkpoint destination directory /results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1070, training_loss=0.2153351321398655, metrics={'train_runtime': 1420.6057, 'train_samples_per_second': 24.018, 'train_steps_per_second': 0.753, 'total_flos': 4488674604441600.0, 'train_loss': 0.2153351321398655, 'epoch': 10.0})

# **Model Saving**

In [49]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Fine-tuned model and tokenizer are saved to the specified directory
model_path = "/content/drive/MyDrive/Rewaa"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/Rewaa/tokenizer_config.json',
 '/content/drive/MyDrive/Rewaa/special_tokens_map.json',
 '/content/drive/MyDrive/Rewaa/vocab.txt',
 '/content/drive/MyDrive/Rewaa/added_tokens.json')

# **Inference**

In [51]:
# The saved fine-tuned model is loaded
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

In [52]:
# A mapping is defined from sentiment labels to human-readable names
SENTIMENT_LABELS = {0: 'Negative', 1: 'Positive'}

def predict_sentiment(text):
    # The input text is tokenized
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True
    )

    # Input is converted to PyTorch tensors
    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)

    # Forward pass through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_label_id = torch.argmax(outputs.logits).item()

    # Predicted label ID is mapped to sentiment class label
    predicted_sentiment = SENTIMENT_LABELS[predicted_label_id]
    return predicted_sentiment

In [53]:
# Sentiment of a new text is predicted here
input_text = "I like this food very much."
preprocessed_text = preprocess_tweet(input_text)
predicted_sentiment = predict_sentiment(preprocessed_text)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: Positive
