# **Loading Libraries**

In [6]:
# Package Installation
!pip install transformers[torch] -q
!pip install accelerate -U -q
!pip install emoji -q

In [7]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl (10.9 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.13.1-cp312-cp312-win_amd64.whl (45.9 MB)
   ---------------------------------------- 0.0/45.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/45.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/45.9 MB 435.7 kB/s eta 0:01:46
   ---------------------------------------- 0.1/45.9 MB 1.1 MB/s eta 0:00:41
   ---------------------------------------- 0.2/45.9 MB 1.3 MB/s eta 0:00:35
 

In [8]:
# Standard Library Imports
import os
import re
import string
from pathlib import Path

# Third-party Library Imports
import emoji
import numpy as np
import pandas as pd
import sklearn
import torch

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load tokenizer and model directly from the Hugging Face Hub
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name is kept in one place so tokenizer and model cannot drift apart
MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# This notebook fine-tunes a binary classifier (class id 0 = Negative, 1 = Positive).
# NOTE(review): according to its model card the checkpoint ships a 3-class head
# (negative / neutral / positive) - confirm. Requesting num_labels=2 replaces that
# head with a freshly initialised 2-class one (ignore_mismatched_sizes permits the
# shape mismatch), so the model can never predict a class id that has no name.
# If the checkpoint head already has 2 classes, these arguments change nothing.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: 'Negative', 1: 'Positive'},
    label2id={'Negative': 0, 'Positive': 1},
    ignore_mismatched_sizes=True,
)

tokenizer_config.json: 100%|██████████| 443/443 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:03<00:00, 1.56MB/s]
tokenizer.json: 100%|██████████| 17.1M/17.1M [00:05<00:00, 3.08MB/s]
special_tokens_map.json: 100%|██████████| 280/280 [00:00<?, ?B/s] 
config.json: 100%|██████████| 982/982 [00:00<?, ?B/s] 
pytorch_model.bin: 100%|██████████| 1.11G/1.11G [09:34<00:00, 1.94MB/s]


# **Data Preparation**

In [10]:
# Folder that holds train_data.csv and test_data.csv. It defaults to the
# original download folder and can be redirected without editing the notebook
# by setting the DATA_DIR environment variable before the kernel is started.
DATA_DIR = Path(os.environ.get("DATA_DIR", r"C:\Users\Shelender Kumar\Downloads"))

# Training and test datasets are loaded into dataframes
df = pd.read_csv(DATA_DIR / "train_data.csv")
test_df = pd.read_csv(DATA_DIR / "test_data.csv")

# Information about the structure of both train and test dataset is printed
df.info()
test_df.info()

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1217397119.py, line 2)

# **Preprocessing**

### **Remove Empty Tweets**

In [None]:
# Blank tweets are located: comment_description is either missing or an empty string
description_is_missing = df['comment_description'].isnull()
description_is_empty = df['comment_description'] == ''
blank_tweets = df[description_is_missing | description_is_empty]

# Each blank tweet is printed with its index, which shows where the blank
# tweets sit in the dataset and what the rest of the row contains.
for index, row in blank_tweets.iterrows():
    print("Index:", index)
    print("Row:", row)
    print()

In [None]:
"""
These blank tweets are removed here. We remove rows with empty values
and empty string in the 'comment_description' column. Afterwards the
index is reset.
"""
# Only tweets with a usable description are kept. In order: rows with a missing
# 'comment_description' are dropped, rows holding an empty string are filtered
# out, and the index is renumbered so it stays continuous after the removal.
df = (
    df.dropna(subset=['comment_description'])
      .loc[lambda frame: frame['comment_description'] != '']
      .reset_index(drop=True)
)
print(df.count())

### **Remove Duplicate Tweets**

In [None]:
"""
This function helps us find duplicate tweets in the training set.
It shortens each tweet to its first 100 characters. If the tweet is a string,
it returns the first 100 characters; otherwise it returns NaN.
"""

def duplicate_tweets(tweet):
    """Return the key used to detect duplicate tweets.

    Args:
        tweet: A single value of the tweet column.

    Returns:
        The first 100 characters of ``tweet`` when it is a string (shorter
        strings are returned unchanged), otherwise NaN.
    """
    return tweet[:100] if isinstance(tweet, str) else np.nan

# Each tweet is keyed by its first 100 characters in a temporary column, only the
# first row per key is kept, and the temporary column is removed again.
df = (
    df.assign(first100charactersoftweets=df['comment_description'].apply(duplicate_tweets))
      .drop_duplicates(subset='first100charactersoftweets', keep="first")
      .drop(columns=['first100charactersoftweets'])
)

print(df.count())

# **Removing URL, Handles, Punctuation**

In [None]:
"""
In this section, we do the basic preprocessing of the individual tweets.
URLs, mentions, handles, punctuation are removed from the tweet text.
This step is essential for cleaning the text data before further analysis or modeling.
"""

def preprocess_tweet(text):
    """Clean the text of a single tweet.

    The steps run in this order: the text is passed through
    ``emoji.demojize``, ``http``/``https`` URLs are deleted, ``@handle``
    mentions are deleted, every character of ``string.punctuation`` is
    removed, and the result is lower-cased.

    Args:
        text: Tweet text as a string.

    Returns:
        The cleaned, lower-cased text.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = emoji.demojize(text)
    # URLs are stripped before mentions/handles, matching the documented order
    for pattern in (r'https?://\S+', r'@\w+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.translate(punctuation_table).lower()

In [None]:
# First 10 rows of the dataset are displayed before preprocessing,
# as a reference for comparison with the cleaned text further down
df.head(10)

In [None]:
# Every tweet in 'comment_description' is replaced by its cleaned version,
# produced by preprocess_tweet, so the text is ready for tokenization.
df['comment_description'] = df['comment_description'].map(preprocess_tweet)

In [None]:
# First 10 rows of the dataset are displayed again, now after preprocessing
df.head(10)

# **Language Detection and Translation**

This section outlines two alternative approaches to handling the mixed-language data that could improve the model in the future.
1. We can detect the language of the input text using the **langdetect** library. If the detected language is Arabic, we translate it to English using the **Google Translate API** or the **googletrans** library. This way, all elements of the dataset will be in one language, and we can train a monolingual English BERT model instead of a multilingual model such as mBERT or the XLM-RoBERTa checkpoint used in this notebook.
2. We can apply customized preprocessing based on the detected language. For example, the AraBERT tokenizer can be used for Arabic text instead of a multilingual tokenizer.

# **Model Fine-tuning**

In [None]:
# Dataset is split into training (80%) and validation (20%) sets.
# The fixed random_state makes the split identical on every run, so results
# stay comparable after a kernel restart.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# The count of samples in the training set is printed.
print("Training Set Count:")
print(train_df.count())

# The count of samples in the validation set is printed.
print("\n\nValidation Set Count:")
print(val_df.count())

In [None]:
# Custom dataset class for sentiment analysis using the supplied tokenizer.
class SentimentDataset(Dataset):
    """Dataset that tokenizes one tweet per access.

    Args:
        dataframe: Frame providing a ``comment_description`` column (the text)
            and a ``sentiment`` column (the label).
        tokenizer: Object whose ``encode_plus`` method returns a mapping with
            the keys ``input_ids`` and ``attention_mask``.
        max_length: Length every sample is padded or truncated to. Defaults to
            256, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one row."""
        # Positional access (.iloc), so the dataframe index does not have to be 0..n-1
        text = str(self.data.comment_description.iloc[index])

        # The text is tokenized and padded/truncated to exactly max_length tokens
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        label = self.data.sentiment.iloc[index]

        # NOTE: only the exact string 'Positive' maps to class 1; every other
        # value (including missing or differently spelled labels) maps to class 0.
        sentiment_id = 1 if label == 'Positive' else 0

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(sentiment_id, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len



In [None]:
# Each dataframe split is wrapped in a SentimentDataset sharing the same tokenizer
train_dataset, val_dataset = (
    SentimentDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

# Number of samples in the training dataset, then in the validation dataset
for dataset in (train_dataset, val_dataset):
    print(len(dataset))

In [None]:
# Training arguments
# Output and log directories are relative to the working directory; the former
# '/results' and '/logs' pointed at the filesystem root, which is often not
# writable and scatters artefacts outside the project folder.
training_args = TrainingArguments(
    output_dir = './results',          # output directory
    num_train_epochs = 10,             # total number of training epochs
    per_device_train_batch_size = 32,  # batch size per device during training
    per_device_eval_batch_size = 32,   # batch size for evaluation
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = './logs',            # directory for storing logs
    logging_steps = 10,
    #evaluation_strategy = 'epoch'
)

# Trainer object for training the model
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

# Model is trained
trainer.train()

# **Model Saving**

In [None]:
# Google Drive can only be mounted from inside Google Colab. On any other
# kernel the google.colab import fails, so the mount is skipped with a message
# instead of aborting the notebook at this cell.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
# Directory that receives the fine-tuned artefacts
model_path = "/content/drive/MyDrive/Rewaa"

# The model is saved first, then the tokenizer, both into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

# **Inference**

In [None]:
# The saved fine-tuned model and its tokenizer are loaded from model_path.
# Both global names are rebound, so the cells below use the reloaded objects
# rather than the ones that were trained in memory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# A mapping is defined from sentiment class ids to human-readable names
SENTIMENT_LABELS = {0: 'Negative', 1: 'Positive'}

def predict_sentiment(text):
    """Predict the sentiment of a single text.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The string to classify (expected to be preprocessed the same way
            as the training tweets).

    Returns:
        The name from ``SENTIMENT_LABELS`` whose class id has the highest logit.
    """
    # The input text is tokenized with the same settings as during fine-tuning
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_attention_mask=True,
        truncation=True
    )

    # Input is converted to PyTorch tensors with a batch dimension of 1 and
    # placed on the model's device, so a model living on a GPU works as well
    device = model.device
    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long, device=device).unsqueeze(0)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long, device=device).unsqueeze(0)

    # Forward pass in evaluation mode; gradients are not needed for inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Only class ids that have a name compete for the prediction. A model head
    # with more outputs than SENTIMENT_LABELS has entries can therefore never
    # cause a KeyError in the lookup below.
    label_ids = sorted(SENTIMENT_LABELS)
    scores = outputs.logits[0, label_ids]
    predicted_label_id = label_ids[int(torch.argmax(scores))]

    # Predicted label ID is mapped to sentiment class label
    return SENTIMENT_LABELS[predicted_label_id]

In [None]:
# Sentiment of a new text is predicted here.
# The text goes through preprocess_tweet first, so it is cleaned the same way
# as the tweets the model was fine-tuned on, and is then classified.
input_text = "You are very bad"
preprocessed_text = preprocess_tweet(input_text)
predicted_sentiment = predict_sentiment(preprocessed_text)
print("Predicted Sentiment:", predicted_sentiment)