<a href="https://colab.research.google.com/github/sramv/LLM-Handbook/blob/main/Task_Specific_Model_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### The Notebook demonstrates the different methods to use Language Models for Classification. Dataset Used: Rotten Tomatoes dataset

# Method 1: Classification Task using Task Specific Model

In [1]:
from transformers import pipeline

In [2]:
#path to HuggingFace Model
model_path = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

In [3]:
#load model into pipeline
pipe = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, top_k=None, device=-1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you e

In [4]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

In [5]:
!pip install datasets



In [6]:
from datasets import load_dataset

#load our data

data = load_dataset("rotten_tomatoes")
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [7]:
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

In [8]:
#Run Inference

y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")),
                   total=len(data["test"])):
                    negative_score = output[0]["score"]
                    positive_score = output[2]["score"]
                    assignment = np.argmax([negative_score, positive_score])
                    y_pred.append(assignment)

100%|██████████| 1066/1066 [02:31<00:00,  7.02it/s]


In [9]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    performance = classification_report(y_true, y_pred, target_names = ["Negative Review", "Positive Review"])
    print(performance)

evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.50      1.00      0.67       533
Positive Review       0.00      0.00      0.00       533

       accuracy                           0.50      1066
      macro avg       0.25      0.50      0.33      1066
   weighted avg       0.25      0.50      0.33      1066



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Method 2: Classification Task that Leverage Embedding


In [10]:
from sentence_transformers import SentenceTransformer

# Load Model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# convert text to embeddings
train_embeddings = model.encode(data["train"]["text"], show_progress_bar=True)
test_embeddings = model.encode(data["test"]["text"], show_progress_bar=True)

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [11]:
train_embeddings.shape

(8530, 768)

In [12]:
#Using a Logistic Regression Model

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42)
lr.fit(train_embeddings, data["train"]["label"])

In [13]:
#Lets evaluate our model
y_pred = lr.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



### Method 3 - What if we do not have Labelled Data?

In [14]:
#create embeddings of the labels
label_embeddings = model.encode(["A negative movie review", "A positive movie review"])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

#create a best matching label for each document

sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

evaluate_performance(data["test"]["label"], y_pred)


                 precision    recall  f1-score   support

Negative Review       0.83      0.76      0.79       533
Positive Review       0.78      0.85      0.81       533

       accuracy                           0.80      1066
      macro avg       0.80      0.80      0.80      1066
   weighted avg       0.80      0.80      0.80      1066

