In [None]:
!pip install datasets google-cloud-aiplatform==1.25.0

In [None]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel

In [None]:
import queue
import threading
import time

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

Replace `YOUR_PROJECT_ID` with your project ID in the cell below.

In [None]:
# Google Cloud project and region passed to vertexai.init().
PROJECT_ID = "YOUR_PROJECT_ID"
LOCATION = "us-central1"

In [None]:
# Initialise the Vertex AI SDK with the configured project and region.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
def predict_large_language_model(
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    content: str,
    tuned_model_name: str = "",
):
    """Run a single text-generation request and return the generated text.

    Args:
        model_name: Name handed to ``TextGenerationModel.from_pretrained``.
        temperature: Sampling temperature forwarded to ``predict``.
        max_decode_steps: Forwarded to ``predict`` as ``max_output_tokens``.
        top_p: Nucleus-sampling value forwarded to ``predict``.
        top_k: Top-k sampling value forwarded to ``predict``.
        content: Prompt text sent to the model.
        tuned_model_name: When non-empty, the model returned by
            ``get_tuned_model(tuned_model_name)`` is used instead of the
            base model.

    Returns:
        The ``text`` attribute of the prediction response.
    """
    llm = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
        llm = llm.get_tuned_model(tuned_model_name)

    # Sampling settings, renamed to the keyword names that predict() expects.
    generation_kwargs = {
        "temperature": temperature,
        "max_output_tokens": max_decode_steps,
        "top_k": top_k,
        "top_p": top_p,
    }
    return llm.predict(content, **generation_kwargs).text

In [None]:
# Few-shot prompt template: three labelled example reviews followed by a
# `{review}` placeholder that is filled in with str.format(). The template ends
# with "Classify the sentiment of the message: " so that the model's completion
# is expected to be the label itself.
prompt = '''input: I had to compare two versions of Hamlet for my Shakespeare class and unfortunately I picked this version. Everything from the acting (the actors deliver most of their lines directly to the camera) to the camera shots (all medium or close up shots...no scenery shots and very little back ground in the shots) were absolutely terrible. I watched this over my spring break and it is very safe to say that I feel that I was gypped out of 114 minutes of my vacation. Not recommended by any stretch of the imagination.
Classify the sentiment of the message: negative

input: This Charles outing is decent but this is a pretty low-key performance. Marlon Brando stands out. There\'s a subplot with Mira Sorvino and Donald Sutherland that forgets to develop and it hurts the film a little. I\'m still trying to figure out why Charlie want to change his name.
Classify the sentiment of the message: negative

input: My family has watched Arthur Bach stumble and stammer since the movie first came out. We have most lines memorized. I watched it two weeks ago and still get tickled at the simple humor and view-at-life that Dudley Moore portrays. Liza Minelli did a wonderful job as the side kick - though I\'m not her biggest fan. This movie makes me just enjoy watching movies. My favorite scene is when Arthur is visiting his fiancée\'s house. His conversation with the butler and Susan\'s father is side-spitting. The line from the butler, "Would you care to wait in the Library" followed by Arthur\'s reply, "Yes I would, the bathroom is out of the question", is my NEWMAIL notification on my computer.
Classify the sentiment of the message: positive

input: {review}
Classify the sentiment of the message: 
'''

In [None]:
# Smoke test: run one hand-written review through the few-shot prompt.
review = "Something surprised me about this movie - it was actually original. It was not the same old recycled crap that comes out of Hollywood every month. I saw this movie on video because I did not even know about it before I saw it at my local video store. If you see this movie available - rent it - you will not regret it."
content = prompt.format(review=review)

response_text = predict_large_language_model(
    model_name="text-bison@001",
    content=content,
    temperature=0.2,
    max_decode_steps=5,
    top_k=1,
    top_p=0.8,
)
response_text

'positive'

In [None]:
def classify_review(review):
    """Classify the sentiment of a movie review with the few-shot prompt.

    The review is substituted into the module-level ``prompt`` template and
    sent to ``text-bison@001`` through ``predict_large_language_model``.

    Args:
        review: Text of the review to classify.

    Returns:
        0 if the model answers "negative", 1 if it answers "positive", and
        2 for any other (unusable) response, including an empty one.
    """
    content = prompt.format(review=review)
    response_text = predict_large_language_model(
        "text-bison@001",
        temperature=0.2,
        max_decode_steps=5,
        top_p=0.8,
        top_k=1,
        content=content)

    # Normalise before matching: the raw completion may carry surrounding
    # whitespace, quotes or a trailing full stop (e.g. " Positive\n"), which an
    # exact comparison would wrongly count as a bad response. A missing
    # response is treated as empty text rather than raising.
    answer = (response_text or "").strip().strip(".!?\"'").strip().lower()

    label_ids = {"negative": 0, "positive": 1}
    return label_ids.get(answer, 2)

In [None]:
from datasets import load_dataset
# Load the "imdb" dataset; later cells read its 'test' split.
dataset = load_dataset("imdb")

In [None]:
# Take the text of the first test-split row as a single real-data example.
review = dataset['test'][0]['text']
review

'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they have

In [None]:
# Sanity check on one review: 0 = negative, 1 = positive, 2 = unusable response.
classify_review(review)

0

In [None]:
# Materialise the test split as a DataFrame (columns: text, label) and preview it.
df_test = pd.DataFrame(dataset['test'])
df_test.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [None]:
# Check how many test reviews carry each label (class balance).
df_test.label.value_counts()

0    12500
1    12500
Name: label, dtype: int64

In [None]:
def execute_threaded_task():
    """Classify one queued review and record the outcome.

    Takes a single ``(index, review, label)`` item from the module-level
    ``my_queue``, classifies the review with ``classify_review`` and appends
    ``(index, review, label, result)`` to ``results``. If no result was
    produced (for example because classification raised), the index is
    appended to ``timeout_indexes`` instead.

    The queue item is always marked done, so ``my_queue.join()`` cannot hang
    on a failed task.
    """
    index, review, label = my_queue.get()
    result = None
    try:
        result = classify_review(review) # calls predict_large_language_model
        if result is not None:
            results.append((index, review, label, result))
    except Exception as exc:
        # Report what went wrong instead of silently swallowing it; only
        # ordinary errors are caught so interpreter-level exits still propagate.
        print(f"Error classifying index {index}: {exc!r}")
    finally:
        if result is None:
            timeout_indexes.append(index)
        # Always release the queue slot, whatever happened above.
        my_queue.task_done()

In [None]:
# Set NUM_SAMPLES to an integer (e.g. 100) to evaluate a random subsample of
# the test reviews; leave it as None to evaluate the whole test set.
NUM_SAMPLES = None
SAMPLE_SEED = 42  # fixed seed so that a subsample is reproducible across runs

if NUM_SAMPLES is None:
    df_test_sample = df_test
else:
    # DataFrame.sample draws without replacement, so no review is evaluated
    # twice, and it keeps the original index labels.
    df_test_sample = df_test.sample(n=NUM_SAMPLES, random_state=SAMPLE_SEED)

# use this to throttle request rates
REQUESTS_PER_SECOND = .8
sleep_interval = 1 / REQUESTS_PER_SECOND

In [None]:
# Shared state used by execute_threaded_task: the work queue, the list of
# (index, review, label, result) tuples, and the indexes that produced no result.
my_queue = queue.Queue()
my_queue.queue.clear()
results = []
timeout_indexes = []

start_time = time.time()
for index, row in df_test_sample.iterrows():
    # Enqueue one review, then start a daemon thread that takes one item off
    # the queue and classifies it.
    my_queue.put((index, row.text, row.label)) 
    t = threading.Thread(target=execute_threaded_task, daemon=True)
    t.start()
    # Throttle: wait between thread starts to stay near REQUESTS_PER_SECOND.
    time.sleep(sleep_interval) 

my_queue.join()   # block until every queued item has been marked done
elapsed = time.time() - start_time
print(f"{len(df_test_sample)} samples took {elapsed / 60} minutes")

25000 samples took 521.8054098725319 minutes


In [None]:
# Threads finish in arbitrary order, so sort the result tuples (by original
# index first) before building the DataFrame.
results_df = pd.DataFrame(sorted(results),columns=["original_index","review", "label", "result"])
results_df.head()

Unnamed: 0,original_index,review,label,result
0,0,I love sci-fi and am willing to put up with a ...,0,0
1,1,"Worth the entertainment value of a rental, esp...",0,2
2,2,its a totally average film with a few semi-alr...,0,0
3,3,STAR RATING: ***** Saturday Night **** Friday ...,0,0
4,4,"First off let me say, If you haven't enjoyed a...",0,1


In [None]:
class_names = ["negative","positive","bad_response"]
# Pass the label set explicitly so the matrix is always 3x3. Without it,
# confusion_matrix only includes labels that actually occur, and a run with no
# bad responses (result == 2) would give a 2x2 matrix that cannot be labelled
# with the three class names.
cm = confusion_matrix(
    y_true=results_df.label,
    y_pred=results_df.result,
    labels=[0, 1, 2],
)

In [None]:
# Label the confusion matrix: rows are the true labels, columns the model's
# answers. The bad_response row is all zeros because no true label is 2.
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
df_cm

Unnamed: 0,negative,positive,bad_response
negative,9760,544,2196
positive,284,10960,1256
bad_response,0,0,0


In [None]:
len(timeout_indexes) # reviews for which no result was recorded (errors/timeouts); none occurred in this run

0

In [None]:
# Overall accuracy; bad responses (result == 2) never match a 0/1 label, so
# they count as errors here.
accuracy_score(y_true=results_df.label, y_pred=results_df.result)

0.8288

In [None]:
# Accuracy restricted to reviews for which the model gave a usable
# negative/positive answer.
no_bad_responses_df = results_df[results_df.result != 2]
accuracy_score(y_true=no_bad_responses_df.label, y_pred=no_bad_responses_df.result)

0.9615741600148505

In [None]:
# fraction (between 0 and 1) of reviews that received a bad response
len(results_df[results_df["result"] == 2]) / len(results_df)

0.13808