In [2]:
# Dependency installation, left commented out; uncomment on a fresh environment.
# !pip install transformers sentence-transformers openai
# !pip install -U datasets

In [3]:
# Show the GPUs visible to this runtime; later cells place their pipelines on 'cuda:0'.
!nvidia-smi

In [8]:
from datasets import load_dataset

In [9]:
# Load the 'rotten_tomatoes' dataset; later cells read its "train" and "test" splits.
data = load_dataset('rotten_tomatoes')
data

In [6]:
# Peek at a single training example (index 1).
data["train"][1]

In [7]:
from transformers import pipeline

# Hugging Face model id of the pretrained sentiment model; it is used as-is (no training in this notebook).
model_path = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Build the inference pipeline; model and tokenizer come from the same checkpoint.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    # Ask for a score per label rather than only the top one; the prediction
    # loop below picks scores out by position (indices 0 and 2).
    # NOTE(review): recent transformers releases deprecate this flag in favour
    # of `top_k=None` — confirm the output ordering before switching.
    return_all_scores=True,
    device='cuda:0'  # run on the first CUDA device
)

## Model prediction without any model training

In [8]:
import numpy as np

In [9]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Score every test review with the pretrained pipeline and keep whichever of
# the first (negative) and third (positive) label scores is larger.
y_pred = [
    # Position 0 in the argmax input is the negative score and position 1 the
    # positive one, so the argmax is already the 0/1 class id.
    int(np.argmax([scores[0]["score"], scores[2]["score"]]))
    for scores in tqdm(
        pipe(KeyDataset(data["test"], "text")),
        total=len(data['test']),
    )
]

y_pred

In [10]:
# Sanity check: the predicted class id for the first test review.
y_pred[0]

In [15]:
# Ground-truth labels for the test split, read as one column instead of
# indexing the dataset row by row (the same access pattern the training cells
# use for data["train"]["label"]). list() keeps y_true a plain Python list.
y_true = list(data['test']['label'])

# Show only the first few labels rather than dumping the whole list.
y_true[:10]

In [12]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions currently held in y_pred.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

## Model training by creating embeddings from a Language Model and then using those embeddings to train a classification model

In [13]:
from sentence_transformers import SentenceTransformer

In [14]:
# Load the sentence-embedding model used by the embedding-based approaches below.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

model

In [15]:
# Embed every train and test review text; these embeddings are the features
# for the classifiers and the cosine-similarity approach below.
train_embeddings = model.encode(data["train"]["text"], show_progress_bar=True)
test_embeddings = model.encode(data["test"]["text"], show_progress_bar=True)

In [16]:
# Inspect the embedding of the first test review.
test_embeddings[0]

In [17]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training embeddings;
# random_state is fixed so the run is repeatable.
clf = LogisticRegression(random_state=1)
clf.fit(train_embeddings, data["train"]["label"])

In [18]:
# Evaluate the fitted classifier on the held-out test embeddings.
y_pred = clf.predict(test_embeddings)
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

In [19]:
from sklearn.svm import SVC

# Same features, different classifier: a support-vector classifier with
# default settings. Note that this rebinds `clf`, replacing the previous model.
clf = SVC(random_state=1)
clf.fit(train_embeddings, data["train"]["label"])

In [20]:
# Evaluate the classifier currently bound to `clf` on the test embeddings.
y_pred = clf.predict(test_embeddings)
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

## Use cosine similarity between label descriptions and input texts to determine review labels, without model training

In [21]:
# One short description per class, in label-id order (0 = negative,
# 1 = positive), so the argmax over similarity columns maps straight onto the
# dataset labels. The positive description previously read "reviw"; the typo
# changed the text being embedded, so it is corrected here.
label_embeddings = model.encode(["A negative review", "A positive review"])

In [22]:
# Check the shape of the label-description embeddings (two descriptions were encoded).
label_embeddings.shape

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every test embedding against each label description:
# one row per test review, one column per label.
sim_matrix = cosine_similarity(test_embeddings,label_embeddings)
sim_matrix

In [24]:
# Predicted class = index of the label description most similar to each review.
y_pred = sim_matrix.argmax(axis=1)
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

In [25]:
# Repeat the similarity-based classification with more emphatic label
# descriptions to see how sensitive the result is to the wording.
# The positive description previously read "reviw"; the typo changed the text
# being embedded, so it is corrected here.
label_embeddings_modified = model.encode(
    ["An extremely negative review", "An extremely positive review"]
)
sim_matrix = cosine_similarity(test_embeddings, label_embeddings_modified)
# Column 0 is the negative description and column 1 the positive one, so the
# argmax per row is the predicted class id.
y_pred = np.argmax(sim_matrix, axis=1)
print(classification_report(y_true=y_true, y_pred=y_pred, target_names=["Negative Review", "Positive Review"]))

## Classification using Generative Models

### flan-t5-small

In [27]:
# Text-to-text generation pipeline around google/flan-t5-small, placed on the
# first CUDA device.
pipe_falant5 = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device="cuda:0"
)
pipe_falant5

In [28]:
# Instruction prepended to every review. It ends with "? " so the question and
# the review text are separated instead of running together
# (previously: "...positive or negative<review text>").
prompt = "Is the following sentence positive or negative? "
# Add a "t5" column holding the full prompt for each example.
data = data.map(lambda example: {"t5": prompt + example["text"]})

data

In [31]:
# Generate an answer for every prompted test review and map it to a class id.
y_pred = []

for output in tqdm(pipe_falant5(KeyDataset(data['test'], "t5")), total=len(data["test"])):
    # Normalise the generated text so variants such as "Negative" or
    # " negative" are not silently counted as positive by the exact match.
    answer = output[0]['generated_text'].strip().lower()
    # Anything other than "negative" is treated as positive (class 1).
    y_pred.append(0 if answer == "negative" else 1)

y_pred

In [32]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

## ChatGPT

In [None]:
# from getpass import getpass
# openai_api_key = getpass()

In [None]:
import os
from getpass import getpass

import openai

# Never hard-code the key in the notebook: read it from the OPENAI_API_KEY
# environment variable, falling back to an interactive prompt that does not
# echo the input. (The previous literal empty string could not authenticate.)
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

client = openai.OpenAI(api_key=openai_api_key)
client

In [3]:
def chatgpt_generation(prompt, document, model="gpt-3.5-turbo-0125"):
    """Send `prompt`, filled in with `document`, to a chat model and return the reply text.

    Every "[DOCUMENT]" placeholder in `prompt` is replaced by `document`. The
    result is sent as the user message, after a fixed system message, through
    the module-level `client` with temperature 0.

    Args:
        prompt: Prompt template containing the "[DOCUMENT]" placeholder.
        document: Text substituted for the placeholder.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message in the completion.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": prompt.replace("[DOCUMENT]", document)}

    response = client.chat.completions.create(
        messages=[system_message, user_message],
        model=model,
        temperature=0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [4]:
# Prompt template for the chat model; "[DOCUMENT]" is the placeholder that
# chatgpt_generation replaces with the review text. Note that this rebinds the
# `prompt` name used earlier for the T5 instruction.
prompt = """Predict whether the following document is a positive or negative movie review:

[DOCUMENT]

If it is positive return 1 and if it is negative return 0. Do not give any other answers.
"""

# Try the prompt on a single hand-written review before running the full test set.
document = "unpretentious , charming , quirky , original"
chatgpt_generation(prompt, document)

In [10]:
# One chatgpt_generation call per test review, issued one after another; the
# raw string replies are kept so they can be parsed in a separate step.
predictions = [chatgpt_generation(prompt, doc) for doc in tqdm(data["test"]["text"])]

In [16]:
# The prompt asks for a bare "1" or "0", so each reply is converted straight
# to its integer class id; a reply that is not an integer raises ValueError.
y_pred = list(map(int, predictions))
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["Negative Review", "Positive Review"],
    )
)

In [17]:
# Final cell: unassign the Colab runtime once everything above has run.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm
# before running this notebook elsewhere.
from google.colab import runtime
runtime.unassign()