# Data Representation


In [8]:
from sympy.core.random import random
from functions_variables import *

In [9]:
# LOAD THE DATASET
path = '../data/preprocessed/'
files = {name: f'{path}{name}.csv' for name in set_names}
dataset = load_dataset('csv', data_files=files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating unsupervised split: 0 examples [00:00, ? examples/s]

In [10]:
# Get sample review data
limit = 1  # 25000 for the full dataset
review = dataset["train"].shuffle(seed=42).select(range(limit))
#Preview the dataset
print('Text:\n\t', review['text'][0])
print('Label:', review['label'][0])
# Check the labels distribution - sentiment polarity
print('Train:', set(dataset['train']['label']))
print('Test:', set(dataset['test']['label']))
print('Unsupervised:', set(dataset['unsupervised']['label']))

Text:
	 there is no relation at all between fortier and profiler but the fact that both are police series about violent crimes  profiler looks crispy  fortier looks classic  profiler plots are quite simple  fortier s plot are far more complicated    fortier looks more like prime suspect  if we have to spot similarities    the main character is weak and weirdo  but have  clairvoyance   people like to compare  to judge  to evaluate  how about just enjoying  funny thing too  people writing fortier looks american but  on the other hand  arguing they prefer american series        maybe it s the language  or the spirit  but i think this series is more english than american  by the way  the actors are really good and funny  the acting is not superficial at all   
Label: 1
Train: {0, 1}
Test: {0, 1}
Unsupervised: {-1}


### Load the Model and Tokenizer

In [11]:
# Load the tokenizer and model
# model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
model_name = 'aychang/roberta-base-imdb'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Use a Prebuilt Sentiment Analysis Pipeline

In [12]:
# Get the dataset
path = '../data/preprocessed/'
files = {name: f'{path}{name}.csv' for name in set_names}
dataset = load_dataset('csv', data_files=files)
limit = 100  # 25000 for the full dataset
# reviews = dataset['train']['text'][:limit]
# labels = dataset['train']['label'][:limit]
train = dataset["train"].shuffle(seed=42).select(range(limit))
reviews = train['text']
labels = train['label']
test = dataset["test"].shuffle(seed=42).select(range(limit))
unsupervised = dataset["unsupervised"].shuffle(seed=42).select(range(limit))
# print(type(reviews[0]), type(labels), len(reviews[0]))
# print(reviews.shape, labels.shape)

In [13]:
# Initialize the sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, truncation=True, device=device)
# Perform sentiment analysis
results = sentiment_pipeline(reviews)

for text, label, result in zip(reviews, labels, results):
    text = text.split(' ', 50)
    text = ' '.join(text[:50])
    print(f"\nText:\n\t {text}...")
    print(f"Label: {label}")
    print(f"Sentiment:\n\t {result['label']}, Confidence: {result['score']:.4f}")


Device set to use mps:0



Text:
	 there is no relation at all between fortier and profiler but the fact that both are police series about violent crimes  profiler looks crispy  fortier looks classic  profiler plots are quite simple  fortier s plot are far more complicated    fortier looks more like...
Label: 1
Sentiment:
	 pos, Confidence: 0.9991

Text:
	 this movie is a great  the plot is very true to the book which is a classic written by mark twain  the movie starts of with a scene where hank sings a song with a bunch of kids called  when you stub your toe on the moon...
Label: 1
Sentiment:
	 pos, Confidence: 0.9991

Text:
	 george p  cosmatos   rambo  first blood part ii  is pure wish fulfillment  the united states clearly didn t win the war in vietnam  they caused damage to this country beyond the imaginable and this movie continues the fairy story of the oh so...
Label: 0
Sentiment:
	 neg, Confidence: 0.9908

Text:
	 in the process of trying to establish the audiences  empathy with jake roedel  tobey mag

### Use the Model Directly (Optional)

In [14]:
# Load the pre-trained DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)
# Ensure the model is moved to the MPS device if available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")  # MPS,CUDA or CPU
model = model.to(device)  # Move model to MPS or CPU
import random

# review = random.choice(reviews)
review = dataset['train'].shuffle(seed=42).select(range(1))
# Tokenize the input
inputs = tokenizer(review['text'][0], return_tensors="pt")
# print(inputs)
# Move inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}
# Perform inference
outputs = model(**inputs)
# Get logits and compute probabilities
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)
# Decode predictions
label_map = {0: "NEGATIVE", 1: "POSITIVE"}  # Label mapping for SST-2
predicted_label = torch.argmax(probabilities).item()
predicted_probability = probabilities[0, predicted_label].item()
print(f"Text: {review['text'][0]}")
print(f"True Sentiment: {review['label'][0]}")
print(f"Predicted Sentiment: {label_map[predicted_label]}")
print(f"Confidence: {predicted_probability:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: there is no relation at all between fortier and profiler but the fact that both are police series about violent crimes  profiler looks crispy  fortier looks classic  profiler plots are quite simple  fortier s plot are far more complicated    fortier looks more like prime suspect  if we have to spot similarities    the main character is weak and weirdo  but have  clairvoyance   people like to compare  to judge  to evaluate  how about just enjoying  funny thing too  people writing fortier looks american but  on the other hand  arguing they prefer american series        maybe it s the language  or the spirit  but i think this series is more english than american  by the way  the actors are really good and funny  the acting is not superficial at all   
True Sentiment: 1
Predicted Sentiment: NEGATIVE
Confidence: 0.5158


### Batch Processing