In [2]:
import pandas as pd
from datasets import Dataset
from transformers import (AutoTokenizer,AutoModelForSequenceClassification,DataCollatorWithPadding,Trainer,TrainingArguments)
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
import torch
from sklearn.model_selection import train_test_split

In [3]:
df=pd.read_csv(r"C:\Users\sande\OneDrive\Documents\Priya\Data_science_Dataset\IMDB_Movie_Review\IMDB Dataset.csv",encoding='utf-8-sig')

In [3]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [4]:
df['label']=df['sentiment'].map({'positive':1,'negative':0})
df=df[['review','label']]#Keep only relevant columns

In [5]:
df=df.sample(5000,random_state=42)

#Convert pandas dataframe to HUgging Face dataset

#Split dataset
train_df,test_df=train_test_split(df,test_size=0.2,random_state=42,stratify=df['label'])
train_dataset=Dataset.from_pandas(train_df)
test_dataset=Dataset.from_pandas(test_df)


In [6]:
#Tokenizer
tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased")

#Preprocess
def preprocess(batch):
    return tokenizer(batch["review"],truncation=True,padding=False,max_length=256)

tokenized_train=train_dataset.map(preprocess,batched=True)
tokenized_test=test_dataset.map(preprocess,batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
#During training, for each batch only:Finds the longest sequence in the batch,Pads all other sequences to that length,
#Updates input_ids and attention_mask
#This is called dynamic padding → more efficient than padding every sample to a fixed max length.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
model=AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#This sets up Hugging Face’s TrainingArguments, which are configurations for training a transformer model.

training_args=TrainingArguments(output_dir="./results",eval_strategy="epoch",num_train_epochs=1,learning_rate=2e-5,
                               per_device_train_batch_size=8,per_device_eval_batch_size=8)

In [10]:

#creating a Trainer object, which is Hugging Face’s high-level API to train, evaluate, and save transformer models

trainer=Trainer(model=model,args=training_args,train_dataset=tokenized_train,eval_dataset=tokenized_test,
                tokenizer=tokenizer,data_collator=data_collator,)

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3629,0.309551




TrainOutput(global_step=500, training_loss=0.36287255859375, metrics={'train_runtime': 5699.7869, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.088, 'total_flos': 264665722908480.0, 'train_loss': 0.36287255859375, 'epoch': 1.0})

In [12]:
metrics=trainer.evaluate()
print(metrics)

{'eval_loss': 0.3095510005950928, 'eval_runtime': 386.322, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.324, 'epoch': 1.0}


In [13]:
trainer.save_model("./IMDB_sentiment_model")
tokenizer.save_pretrained("./imdb_sentimnt_model")


('./imdb_sentimnt_model\\tokenizer_config.json',
 './imdb_sentimnt_model\\special_tokens_map.json',
 './imdb_sentimnt_model\\vocab.txt',
 './imdb_sentimnt_model\\added_tokens.json',
 './imdb_sentimnt_model\\tokenizer.json')

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

predictions=trainer.predict(tokenized_test) # This outputs scores of all classes

preds=np.argmax(predictions.predictions,axis=-1) # This selects the class with highest score
labels=predictions.label_ids  # Ground-truth (actual) labels from test dataset

acc=accuracy_score(labels,preds)
precision,recall,f1,_=precision_recall_fscore_support(labels,preds,average='weighted')

print("Accuracy=",acc)
print("Precision=",precision)
print("Recall=",recall)
print("f1 score",f1)

Accuracy= 0.883
Precision= 0.883164928435592
Recall= 0.883
f1 score 0.8829755381692792
