In [16]:
!pip install transformers datasets torch scikit-learn



In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Read the raw reviews; the CSV path is relative to the notebook's working directory.
data = pd.read_csv(
    'IMDB Dataset.csv',
)


In [18]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
# Encode the text sentiment as the integer class id used for training
# (negative -> 0, positive -> 1).
label_by_sentiment = {'positive': 1, 'negative': 0}
encoded_labels = data['sentiment'].map(label_by_sentiment)

# Series.map produces NaN for any value missing from the mapping, which would
# silently turn the label column into floats with gaps. Fail loudly instead.
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected_values = data.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment values: {unexpected_values}")

data['label'] = encoded_labels.astype('int64')

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Only the review text and its integer label are needed downstream.
model_columns = ['review', 'label']

# After train_test_split the frames carry a shuffled, non-default index, which
# from_pandas would otherwise store as an extra `__index_level_0__` column.
# preserve_index=False keeps the datasets limited to the selected columns.
train_dataset = Dataset.from_pandas(train_df[model_columns], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[model_columns], preserve_index=False)

In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Load the WordPiece tokenizer that matches the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)


# Tokenizing the Data

def tokenize_data(examples):
    """Tokenize a batch of reviews with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'review'`` entry holding the text(s) to
            tokenize (a batch of rows when used with ``Dataset.map(batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    reviews = examples['review']
    return tokenizer(reviews, padding="max_length", truncation=True)


# Tokenize both splits; batched=True hands tokenize_data many rows per call.
train_dataset = train_dataset.map(
    tokenize_data,
    batched=True,
)
test_dataset = test_dataset.map(
    tokenize_data,
    batched=True,
)




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [23]:
# Load pretrained BERT with a sequence-classification head sized for the two
# sentiment classes. The head's weights are newly initialized and get trained
# during fine-tuning.

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine tuning the pretrained BERT model on the IMDB Reviews Dataset

import dataclasses

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name. The install above is unpinned, so use
# whichever name the installed TrainingArguments actually declares.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./finetuned_bert',      # checkpoints are written under this directory
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,                   # learning-rate warmup steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_arg: 'epoch'},    # evaluate on eval_dataset after every epoch
)

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed by ``Trainer``; ``logits``
            is an array of per-class scores and ``labels`` the true class ids.

    Returns:
        ``{'accuracy': float}``. The Trainer reports it as ``eval_accuracy``.
    """
    logits, labels = eval_pred
    predicted_labels = logits.argmax(axis=-1)
    return {'accuracy': float((predicted_labels == labels).mean())}


# Without compute_metrics the Trainer only reports the loss, so
# trainer.evaluate() would contain no 'eval_accuracy' entry.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


train_output = trainer.train()

# Training only writes intermediate `checkpoint-*` subfolders. Save the final
# model and the tokenizer to the output directory itself so both can be
# reloaded later with from_pretrained('./finetuned_bert').
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# Keep the training summary as the cell's displayed result.
train_output



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluating the Fine Tuned Model

evaluation_result = trainer.evaluate()

print("Loss: ", evaluation_result['eval_loss'])

# 'eval_accuracy' is only present when the Trainer has a compute_metrics
# callback that returns an "accuracy" entry. Indexing it directly raises
# KeyError otherwise, so fall back to computing accuracy from raw predictions.
accuracy = evaluation_result.get('eval_accuracy')
if accuracy is None:
    prediction_output = trainer.predict(test_dataset)
    predicted_labels = prediction_output.predictions.argmax(axis=-1)
    accuracy = float((predicted_labels == prediction_output.label_ids).mean())

print("Accuracy: ", accuracy)


In [None]:
# Performing Inference on the Fine-Tuned BERT Model for Sentiment Analysis

from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

# Reload the tokenizer and classifier from the fine-tuning output directory.
tokenizer = BertTokenizer.from_pretrained(
    './finetuned_bert',
)
model = BertForSequenceClassification.from_pretrained(
    './finetuned_bert',
)

input_text = "I absolutely loved the movie! It was fantastic and the acting was superb."

# Encode the single review as PyTorch tensors, truncated to at most 128 tokens.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)

# Inference only: no gradients are needed for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Turn the raw class scores into probabilities and take the most likely class.
logits = outputs.logits
probabilities = softmax(logits, dim=-1)
predicted_class = torch.argmax(probabilities, dim=-1)

# Position i must name class id i. The training labels were encoded as
# negative -> 0 and positive -> 1, so 'Negative' has to come first. The
# previous order reported every prediction with the opposite sentiment.
sentiment_labels = ['Negative', 'Positive']

predicted_sentiment = sentiment_labels[predicted_class.item()]

print("Predicted Sentiment: ", predicted_sentiment)


