Colab notebook that loads a trained RoBERTa model along with its tokenizer and runs inference on new text data to predict its sentiment.

In [1]:
# Install packages

%pip install torch
%pip install transformers
%pip install datasets
%pip install transformers[torch]

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# mount the colab session to google drive

from google.colab import drive
# Later cells read the model archive and the tweet CSV from paths under
# /content/drive/MyDrive, so this cell has to run before them.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Copy the zip file of models and weights to colab session and extract them.

!cp "/content/drive/MyDrive/University/NLP/ROBERTA/final_weights__ROBERTA-20240328T005456Z-001.zip" .
!unzip final_weights__ROBERTA-20240328T005456Z-001.zip

Archive:  final_weights__ROBERTA-20240328T005456Z-001.zip
  inflating: final_weights/tokenizer/tokenizer_config.json  
  inflating: final_weights/tokenizer/special_tokens_map.json  
  inflating: final_weights/tokenizer/merges.txt  
  inflating: final_weights/model/config.json  
  inflating: final_weights/tokenizer/vocab.json  
  inflating: final_weights/tokenizer/tokenizer.json  
  inflating: final_weights/model/model.safetensors  


In [3]:
# Disable W&B (Weights & Biases) to conserve resources
import os
# NOTE(review): presumably this variable is read by the transformers W&B
# integration; no Trainer run is visible in this notebook — confirm it is still needed.
os.environ["WANDB_DISABLED"] = "true"

In [44]:
# import packages

import os
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from tqdm import tqdm

from datasets import load_dataset
from datasets import load_metric

from wordcloud import WordCloud

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, DataCollatorWithPadding
from transformers import Trainer

In [5]:
# read the unlabelled dataset of tweets for sentiment analysis
# (the file lives under /content/drive, so the Drive mount cell must have run)
csv_path = "/content/drive/MyDrive/University/NLP/ROBERTA/data/Test.csv"

# the text that gets classified later is read from the `safe_text` column
df = pd.read_csv(csv_path)

In [7]:
# (rows, columns) of the loaded dataset
df.shape

(5177, 2)

In [6]:
# preview the first rows of the raw data
df.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [9]:
# initialize and load the tokenizer that was saved alongside the model
# (num_labels is a model-config option, not a tokenizer option, so it is not passed here)
tokenizer = AutoTokenizer.from_pretrained("/content/final_weights/tokenizer")

In [10]:
# initialize and load the trained RoBERTa model from the extracted weights
# (num_labels=3 corresponds to the negative / neutral / positive classes mapped below)
model = AutoModelForSequenceClassification.from_pretrained("/content/final_weights/model", num_labels=3)

In [43]:
# Class indices used by the model:
#   0 -> negative sentiment
#   1 -> neutral sentiment
#   2 -> positive sentiment


# Convert a model class index into a signed sentiment score
def map_sentiment(model_sentiment):
    """Translate a model class index into a signed sentiment value.

    Args:
        model_sentiment: Class index predicted by the model.

    Returns:
        -1 if ``model_sentiment`` equals 0, 0 if it equals 1,
        and 1 for any other value.
    """
    # Guard clause for the negative class; everything else is decided below.
    if model_sentiment == 0:
        return -1
    return 0 if model_sentiment == 1 else 1


In [57]:
# function to predict sentiment for a given text
def predict_sentiment(text):
    """Predict the sentiment class index for a single text.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects loaded in
    earlier cells.

    Args:
        text: The text to classify (a single string).

    Returns:
        int: Index of the highest-scoring class in the model's logits.
    """
    # tokenizer operation (truncation=True asks the tokenizer to cut over-long inputs)
    tokenized_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Keep the inputs on the same device as the model so the call also works
    # when the model has been moved off the CPU.
    tokenized_text = tokenized_text.to(model.device)

    # Perform the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**tokenized_text)

    # Get the predicted class: argmax over the class dimension of the first
    # (and only) sequence, rather than over the flattened logits tensor.
    predicted_class = outputs.logits[0].argmax(dim=-1).item()

    return predicted_class



In [60]:
# make predictions on all the tweets

# drop null rows
df2 = df.copy().dropna()

# Iterate over the text column directly instead of df2.iterrows(): no per-row
# Series has to be built, and tqdm knows the total so it can show real progress.
# Y stays in the same order as df2's rows, which the column assignment relies on.
Y = [
    map_sentiment(predict_sentiment(text))
    for text in tqdm(df2['safe_text'], total=len(df2))
]



5176it [11:35,  7.44it/s]


In [61]:
# attach the predictions as a new column; Y was built by walking df2 in row
# order, so the values line up with the rows positionally
df2['sentiment'] = Y

In [62]:
# preview the first rows together with the new sentiment column
df2.head()

Unnamed: 0,tweet_id,safe_text,sentiment
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,1
1,00UNMD0E,Students starting school without whooping coug...,1
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0
3,01HOEQJW,How many innocent children die for lack of vac...,1
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0


In [63]:
# save the predictions in csv file
# (relative path, so the file lands in the current working directory;
#  index=False keeps the DataFrame index out of the file)

df2.to_csv("Sentiment_Predictions.csv", index=False)