### 1. Text Classification with Representation Models

#### 1.1 Load data

In [4]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the Rotten Tomatoes review dataset (train / validation / test splits).
# The result must be bound to `data`: this cell displays `data`, and every
# later cell reads the splits through that name.
data = load_dataset("rotten_tomatoes")
data

Generating train split: 100%|████| 8530/8530 [00:00<00:00, 482793.51 examples/s]
Generating validation split: 100%|█| 1066/1066 [00:00<00:00, 285786.39 examples/
Generating test split: 100%|█████| 1066/1066 [00:00<00:00, 325480.68 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [14]:
# Inspect the first and last training examples (rows 0 and -1).
data['train'][0,-1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

#### 1.2 Import pre-trained model

In [27]:
from transformers import pipeline

# Hugging Face checkpoint used for both the model weights and the tokenizer.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Build an inference pipeline that runs on CPU. No task is passed explicitly,
# so the pipeline type is presumably inferred from the checkpoint — verify if
# the checkpoint is ever changed.
# NOTE(review): return_all_scores=True asks for a score for every class rather
# than only the top one. Recent transformers releases appear to deprecate this
# flag in favour of top_k=None, which (to be confirmed) orders results by score
# instead of by class id — any downstream code that indexes the returned list
# by position would need updating before swapping the flag.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device="cpu"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


#### 1.3 Run inference


In [32]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Classify every test review as negative (0) or positive (1).
#
# For each review the pipeline yields a list of {"label": ..., "score": ...}
# dicts, one per model class. Scores are looked up by label name instead of by
# list position, so the result does not depend on the order in which the
# pipeline happens to return the classes.
# NOTE(review): assumes the checkpoint names its classes negative / neutral /
# positive (compared case-insensitively) — confirm against the model config if
# the checkpoint changes. The neutral score is ignored because the dataset is
# binary.
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")),
                   total=len(data["test"])):
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # argmax over [negative, positive] gives 0 for negative and 1 for positive,
    # the same encoding the evaluation cell uses for its target names.
    assignment = np.argmax([scores["negative"], scores["positive"]])
    y_pred.append(assignment)

100%|███████████████████████████████████████| 1066/1066 [00:25<00:00, 42.11it/s]


#### 1.4 Evaluation

In [43]:
from sklearn.metrics import classification_report

def evaluate_perfomance(y_true, y_pred):
    """Print a per-class classification report for the review labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. The report text is written to stdout.
    """
    class_names = ["Negative Reviews", "Positive Reviews"]
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

# Compare the current predictions in y_pred with the test-split labels.
evaluate_perfomance(data["test"]["label"], y_pred)

                  precision    recall  f1-score   support

Negative Reviews       0.76      0.88      0.81       533
Positive Reviews       0.86      0.72      0.78       533

        accuracy                           0.80      1066
       macro avg       0.81      0.80      0.80      1066
    weighted avg       0.81      0.80      0.80      1066



### 2. Text Classification with Embeddings

#### 2.1 Load model and create embeddings


In [49]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to turn each review into a fixed-size vector.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode the train and test review texts; the progress bar is shown because
# encoding the full splits takes a noticeable amount of time.
train_embeddings = model.encode(
    data["train"]["text"],
    show_progress_bar=True,
)
test_embeddings = model.encode(
    data["test"]["text"],
    show_progress_bar=True,
)

Batches: 100%|████████████████████████████████| 267/267 [00:33<00:00,  8.06it/s]
Batches: 100%|██████████████████████████████████| 34/34 [00:03<00:00,  8.94it/s]


In [50]:
# Sanity check: display the shapes of the train and test embedding arrays.
train_embeddings.shape, test_embeddings.shape

((8530, 768), (1066, 768))

#### 2.2 Train Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training embeddings, using the
# training labels as targets. random_state is pinned so reruns are repeatable.
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

#### 2.3 Evaluate

In [55]:
# Predict labels for the test embeddings with the fitted classifier, then
# print the per-class report against the test-split labels.
y_pred = clf.predict(test_embeddings)
evaluate_perfomance(data["test"]["label"], y_pred)

                  precision    recall  f1-score   support

Negative Reviews       0.85      0.86      0.85       533
Positive Reviews       0.86      0.85      0.85       533

        accuracy                           0.85      1066
       macro avg       0.85      0.85      0.85      1066
    weighted avg       0.85      0.85      0.85      1066



### 3. Text Classification without Labelled Data

#### 3.1 Create label descriptions based on their meaning

In [58]:
# Embed one natural-language description per class. The order matters:
# index 0 is the negative description and index 1 the positive one, so an
# argmax over these two later yields 0 / 1 class ids directly.
label_embeddings = model.encode(["A negative review", "A positive review"])

#### 3.2 Use cosine similarity to check the similarity between each document and the label descriptions

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare every test embedding with both label-description embeddings, then
# pick, per review, the index of the most similar description.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

#### 3.3 Evaluate

In [60]:
# Compare the current predictions in y_pred with the test-split labels.
evaluate_perfomance(data["test"]["label"], y_pred)

                  precision    recall  f1-score   support

Negative Reviews       0.78      0.77      0.78       533
Positive Reviews       0.77      0.79      0.78       533

        accuracy                           0.78      1066
       macro avg       0.78      0.78      0.78      1066
    weighted avg       0.78      0.78      0.78      1066

