In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense,Embedding
from tensorflow.keras.models import Sequential
from gensim.models import KeyedVectors
import tensorflow as tf
from torch import Tensor
from scipy.special import softmax
import torch
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("/content/IMDB Dataset.csv",engine = "python", on_bad_lines = "skip")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
15803,"Usually, I know after the first minute of a mo...",negative
15804,"A great film this, and a shame that it will re...",positive
15805,*May contain spoilers* *May contain spoilers*<...,positive
15806,I saw this black and white comedy noir yesterd...,positive


# Text Preprocessing

In [7]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading and trailing whitespaces
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df['cleaned_review'] = df['review'].apply(clean_text)



In [6]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Extracting the corpus after text Cleaning

In [8]:
corpus = df["cleaned_review"].to_list()
sentiment_label = df["sentiment"].to_list()

In [9]:
Y=[]
for item in sentiment_label:
    if item == "positive":
        Y.append(2)
    elif item == "negative":
        Y.append(0)
    else:
        Y.append(1)
Y=np.array(Y)

# Tokenization

In [10]:
from nltk.tokenize import word_tokenize


batch_size = 1000
tokens = []

for i in range(0, len(corpus), batch_size):
    batch = corpus[i:i + batch_size]
    tokenized_batch = [word_tokenize(tweet) for tweet in batch]
    tokens.extend(tokenized_batch)
    print(f"Processed {i + batch_size} reviews")



Processed 1000 reviews
Processed 2000 reviews
Processed 3000 reviews
Processed 4000 reviews
Processed 5000 reviews
Processed 6000 reviews
Processed 7000 reviews
Processed 8000 reviews
Processed 9000 reviews
Processed 10000 reviews
Processed 11000 reviews
Processed 12000 reviews
Processed 13000 reviews
Processed 14000 reviews
Processed 15000 reviews
Processed 16000 reviews


# Converting into numbers to feed the model

In [11]:
joined_sentence = [" ".join(sentence) for sentence in tokens]
vocab_size = 10000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
tokenizer.fit_on_texts(joined_sentence)


In [12]:
joined_sentence[0]

'one reviewers mentioned watching oz episode youll hooked right exactly happened mebr br first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordbr br called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awaybr br would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered 

In [13]:
sequences = tokenizer.texts_to_sequences(joined_sentence)

# Padding the sequences

In [14]:

max_length = max(len(sentence) for sentence in tokens)
padded_sentences = pad_sequences(sequences, padding = "pre", maxlen = 512, truncating = "pre")

In [15]:
X=np.array(padded_sentences)

In [16]:
X

array([[   0,    0,    0, ..., 1037, 4120,  379],
       [   0,    0,    0, ..., 1863,   14,  119],
       [   0,    0,    0, ...,   54,   12,  217],
       ...,
       [   0,    0,    0, ...,    4, 1462,    6],
       [   0,    0,    0, ..., 5719,    1,   72],
       [   0,    0,    0, ...,   43, 1996,  518]], dtype=int32)

# Preparing to feed the huggingface model

In [17]:
attention_masks = np.where(X != 0,1,0)
labels = Y

In [18]:
print(X.shape, attention_masks.shape, labels.shape)

(15808, 512) (15808, 512) (15808,)


In [21]:
from datasets import Dataset

dataset = Dataset.from_dict({
    "input_ids": X.tolist(),
    "attention_mask": attention_masks.tolist(),
    "label": labels.tolist()

})

# Splitting the dataset

In [22]:
split = dataset.train_test_split(test_size = 0.3)
validation_data = split["test"].train_test_split(test_size = 0.5)
X_train = split["train"]
X_cv = validation_data["train"]
X_test = validation_data["test"]

In [23]:
print(f"Train size: {len(X_train)}, Validation size: {len(X_cv)}, Test size: {len(X_test)}")

Train size: 11065, Validation size: 2371, Test size: 2372


# Model Training

In [24]:
# Load model directly
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")


for param in model.roberta.embeddings.parameters():
    param.requires_grad = False

# Freeze the first 6 encoder layers
for layer in model.roberta.encoder.layer[:11]:  # Freeze the first 6 layers
    for param in layer.parameters():
        param.requires_grad = False

# Verify which layers are frozen
for name, param in model.named_parameters():
    print(name, param.requires_grad)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


roberta.embeddings.word_embeddings.weight False
roberta.embeddings.position_embeddings.weight False
roberta.embeddings.token_type_embeddings.weight False
roberta.embeddings.LayerNorm.weight False
roberta.embeddings.LayerNorm.bias False
roberta.encoder.layer.0.attention.self.query.weight False
roberta.encoder.layer.0.attention.self.query.bias False
roberta.encoder.layer.0.attention.self.key.weight False
roberta.encoder.layer.0.attention.self.key.bias False
roberta.encoder.layer.0.attention.self.value.weight False
roberta.encoder.layer.0.attention.self.value.bias False
roberta.encoder.layer.0.attention.output.dense.weight False
roberta.encoder.layer.0.attention.output.dense.bias False
roberta.encoder.layer.0.attention.output.LayerNorm.weight False
roberta.encoder.layer.0.attention.output.LayerNorm.bias False
roberta.encoder.layer.0.intermediate.dense.weight False
roberta.encoder.layer.0.intermediate.dense.bias False
roberta.encoder.layer.0.output.dense.weight False
roberta.encoder.layer.

In [25]:
from transformers import Trainer, TrainingArguments

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluates on validation set at each epoch
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs"
)

# Initialize Trainer with train, validation, and test sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train,
    eval_dataset=X_cv  # Used for validation during training
)

# Train the model
trainer.train()

# Evaluate on Test Set
results = trainer.evaluate(X_test)
print(results)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprachit678bhujel[0m ([33mprachit678bhujel-purwanchal-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.6858
2,No log,0.690753
3,No log,0.651082


{'eval_loss': 0.6354750990867615, 'eval_runtime': 65.6377, 'eval_samples_per_second': 36.138, 'eval_steps_per_second': 0.366, 'epoch': 3.0}


In [30]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")


# Load model directly
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [26]:
model.save_pretrained("./fine-tuned-twitter-roberta")


In [32]:
from transformers import pipeline

pipe = pipeline("sentiment-analysis", model = model, model_path = "/content/fine-tuned-twitter-roberta",tokenizer = tokenizer)

Device set to use cuda:0


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")


model_path = "/content/fine-tuned-twitter-roberta"
model = AutoModelForSequenceClassification.from_pretrained(model_path)


# Create a sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [35]:
text = joined_sentence[0]

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = {key: value.to("cuda") for key, value in inputs.items()}


output = model(**inputs)
scores = Tensor.cpu(output[0][0]).detach().numpy()
scores = softmax(scores)

In [36]:
np.argmax(scores)

2

In [39]:
def inference(sentence):
  inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
  inputs = {key: value.to("cuda") for key, value in inputs.items()}
  output = model(**inputs)
  scores = Tensor.cpu(output[0][0]).detach().numpy()
  scores = softmax(scores)

  return np.argmax(scores)


# Training Error

In [40]:
prediction = np.zeros(len(X_train))
for i in range(len(X_train)):
    if i % 1000 == 0:
      print(f"I is {i}")
    prediction[i] = inference(joined_sentence[i])



I is 0
I is 1000
I is 2000
I is 3000
I is 4000
I is 5000
I is 6000
I is 7000
I is 8000
I is 9000
I is 10000
I is 11000


In [41]:
accuracy= accuracy_score(prediction, Y[:len(X_train)])
print("Training accuracy is ", accuracy)

Training accuracy is  0.6922729326705829




# Cross Validation error


In [50]:
prediction = np.zeros(2371)
for i in range(2371):
    if i % 1000 == 0:
      print(f"I is {i}")
    prediction[i] = inference(joined_sentence[len(X_cv)+i])


I is 0
I is 1000
I is 2000


In [51]:
accuracy= accuracy_score(prediction, Y[len(X_train):len(X_train)+len(X_cv)])
print("Validation accuracy is ", accuracy)

Validation accuracy is  0.48671446646984395


# Testing error

In [46]:
prediction = np.zeros(2372)
for i in range(2372):
    if i % 1000 == 0:
      print(f"I is {i}")
    prediction[i] = inference(joined_sentence[len(X_test)])

I is 0
I is 1000
I is 2000


In [49]:
accuracy= accuracy_score(prediction, Y[len(X_train)+len(X_cv):])
print("Test accuracy is ", accuracy)

Test accuracy is  0.48018549747048905
