In [None]:
!pip install datasets

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install protobuf

In [1]:
import os
import torch
import pandas as pd
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
dataset = 'sst2'
model_name = 'meta-llama/Llama-3.1-8B'

# Hugging Face access token for the model checkpoint.
# SECURITY: the token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. Any token that was previously committed
# in this cell must be treated as leaked and revoked.
# When HF_TOKEN is unset this is None; the Hugging Face loaders then presumably
# fall back to the locally cached login, if any -- confirm for your setup.
token = os.environ.get('HF_TOKEN')

# NOTE: despite the name, True makes load_model_tokenizer load the weights as
# torch.float16 (half precision); False selects torch.float32.
single_precision = True
gpu_id = 0

# Class names, and the mapping from integer label to class name.
classes = ['negative', 'positive']
class_labels = {0: "negative", 1: "positive"}

# Prompt templates; `{}` is replaced with the sentence to classify. Every
# template ends with "Sentiment:" so the label is read from the next token.
prompts = [
    "Given the following text, does the sentiment lean more towards being positive or negative? Analyze the text carefully before answering.\nText: {}\nSentiment:",
    "What is the emotional sentiment conveyed by the following text? Indicate if it reflects a positive or negative sentiment.\nText: {}\nSentiment:",
    "Is the sentiment in this text generally favorable or unfavorable? Please provide your answer based on the tone of the text.\nText: {}\nSentiment:",
    "Does the following sentence express positive or negative opinion?\nText: {}\nSentiment:",
    "Classify the sentiment of the following sentence as either positive or negative.\nText: {}\nSentiment:"
]

In [3]:
# Set device and seed
# torch.manual_seed seeds the CPU generator too (the original only seeded
# CUDA), which matters because the model is created on the CPU before being
# moved to `device`. The CUDA seeding call is a no-op when CUDA is unavailable.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# Only touch the CUDA device when one exists; calling torch.cuda.set_device on
# a CPU-only machine raises instead of falling back to the CPU.
if torch.cuda.is_available():
    torch.cuda.set_device(gpu_id)
    device = torch.device(f'cuda:{gpu_id}')
else:
    device = torch.device('cpu')

# NOTE(review): this is assigned after CUDA has already been queried above, so
# it presumably does not change which GPUs this process sees (it would still be
# inherited by child processes). To restrict visible devices it has to be set
# before torch first touches CUDA -- confirm the intent.
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

# Load model and tokenizer
def load_model_tokenizer(model_name, single_precision, token):
    """Load a Llama causal LM and its tokenizer, and register a padding token.

    Args:
        model_name: Hub identifier (or local path) passed to ``from_pretrained``.
        single_precision: When truthy the weights are loaded as
            ``torch.float16``; otherwise as ``torch.float32``. (The name is
            historical: truthy selects *half* precision.)
        token: Hugging Face access token forwarded to both loaders; may be
            ``None``.

    Returns:
        Tuple ``(model, tokenizer)``. The tokenizer pads on the left and has
        ``'<PAD>'`` registered as its pad token; the model's embedding matrix is
        resized to the new vocabulary size and ``model.config.pad_token_id`` is
        set to the tokenizer's pad id.
    """
    weights_dtype = torch.float16 if single_precision else torch.float32

    # `token=` replaces the deprecated `use_auth_token=` keyword.
    model = LlamaForCausalLM.from_pretrained(model_name,
                                             cache_dir="cache/",
                                             torch_dtype=weights_dtype,
                                             token=token)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              cache_dir="cache/",
                                              token=token,
                                              padding_side="left")

    # Adding the pad token grows the vocabulary, so the embeddings must be
    # resized to match before the model is used.
    tokenizer.add_special_tokens({'pad_token': '<PAD>'})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

# Build the model/tokenizer pair, move the weights onto the selected device and
# put the model in evaluation mode.
model, tokenizer = load_model_tokenizer(model_name, single_precision, token)
model.to(device)
model.eval()

# Vocabulary id of the first sub-token of each label word, keyed by class id
# (0 = negative, 1 = positive). These ids select the two logits that are
# compared at the final prompt position.
# NOTE(review): the words are encoded without a leading space; after a prompt
# ending in "Sentiment:" the model may favour the space-prefixed variants
# (" negative" / " positive"), which can be different token ids -- confirm
# which variant is intended.
class_idx = {
    label_id: tokenizer.encode(label_word, add_special_tokens=False)[0]
    for label_id, label_word in ((0, "negative"), (1, "positive"))
}




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
# Location of the tab-separated evaluation file.
test_file_path = "data/test.tsv"

# Parse the file into a DataFrame, splitting columns on tab characters.
test_data = pd.read_csv(test_file_path, delimiter='\t')

# Preview the first rows to confirm the columns were parsed as expected.
print(test_data.head())

   label                                            content
0      0     no movement , no yuks , not much of anything .
1      0  a gob of drivel so sickly sweet , even the eag...
2      0  gangs of new york is an unapologetic mess , wh...
3      0  we never really feel involved with the story ,...
4      1            this is one of polanski 's best films .


In [6]:
def classify_Mexample(sentence,label, prompt_template, maps, curr_prompt, curr_sentence):
    """Score one sentence under one prompt and update its best-prompt record.

    The sentence is inserted into ``prompt_template``, run through the
    notebook-level ``model``, and the logits of the two label tokens
    (``class_idx[0]`` / ``class_idx[1]``) at the last position are turned into a
    two-way softmax. The record ``maps[curr_sentence]`` is then updated in place:

    * Prediction equals ``label``: the prompt is stored when its confidence
      (``|p_predicted - 0.5|``) beats the stored one, or when the stored
      confidence is still 0. A stored confidence of 0 means no confidently
      correct prompt has been recorded yet (entry unset, holding a wrong-prompt
      fallback, or holding an exact tie), so a correct prompt always wins then.
      The original strict ``<`` comparison skipped correct predictions at an
      exact 0.5/0.5 tie, which could leave ``prompt`` at -1 or at a wrong prompt.
    * Prediction differs from ``label``: the prompt is stored, with confidence 0,
      only as a fallback when no prompt has been recorded yet.

    Args:
        sentence: Text to classify.
        label: Ground-truth class id (0 = negative, 1 = positive).
        prompt_template: Format string with one ``{}`` slot for the sentence.
        maps: Dict mapping sentence position to
            ``{"confidence": number, "prompt": int}``; mutated in place.
        curr_prompt: Index of ``prompt_template`` in the prompt list.
        curr_sentence: Key of this sentence in ``maps``.

    Returns:
        None. Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
        ``class_idx``.
    """
    # Format the prompt with the review text
    prompt_text = prompt_template.format(sentence)

    # Encode the prompt and truncate to fit model's max length
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Extract the logits for the last token and apply softmax for binary classification
    last_token_logits = logits[:, -1, [class_idx[0], class_idx[1]]]
    probs = torch.softmax(last_token_logits, dim=-1)

    # Get predicted class (0 = negative, 1 = positive)
    predicted_class = torch.argmax(probs, dim=-1).item()

    # Distance of the winning probability from the 0.5 decision boundary.
    confidence = abs(probs[0][predicted_class].item() - 0.5)

    record = maps[curr_sentence]
    if predicted_class == label:
        if record['confidence'] == 0 or confidence > record['confidence']:
            record['confidence'] = confidence
            record['prompt'] = curr_prompt
    elif record['prompt'] == -1:
        record['prompt'] = curr_prompt
        record['confidence'] = 0

In [7]:
# One record per test sentence, keyed by row position: the index of the prompt
# selected so far (-1 = none yet) and the confidence it achieved.
maps = {key: {"confidence": 0, "prompt": -1} for key in range(len(test_data["content"]))}

# Try every prompt on every sentence; classify_Mexample updates `maps` in place.
for curr_prompt, prompt_template in enumerate(prompts):
    print(f"Evaluating using prompt: {prompt_template}")

    # Pair each sentence with its label positionally, so the lookup does not
    # depend on the DataFrame having the default 0..n-1 index.
    labelled_sentences = zip(test_data["content"], test_data["label"])
    for curr_sentence, (sentence, label) in enumerate(tqdm(labelled_sentences, total=len(test_data))):
        classify_Mexample(sentence, label, prompt_template, maps, curr_prompt, curr_sentence)


Evaluating using prompt: Given the following text, does the sentiment lean more towards being positive or negative? Analyze the text carefully before answering.
Text: {}
Sentiment:


100%|██████████| 1821/1821 [01:13<00:00, 24.63it/s]


Evaluating using prompt: What is the emotional sentiment conveyed by the following text? Indicate if it reflects a positive or negative sentiment.
Text: {}
Sentiment:


100%|██████████| 1821/1821 [01:13<00:00, 24.92it/s]


Evaluating using prompt: Is the sentiment in this text generally favorable or unfavorable? Please provide your answer based on the tone of the text.
Text: {}
Sentiment:


100%|██████████| 1821/1821 [01:13<00:00, 24.89it/s]


Evaluating using prompt: Does the following sentence express positive or negative opinion?
Text: {}
Sentiment:


100%|██████████| 1821/1821 [01:12<00:00, 25.19it/s]


Evaluating using prompt: Classify the sentiment of the following sentence as either positive or negative.
Text: {}
Sentiment:


100%|██████████| 1821/1821 [01:12<00:00, 25.04it/s]


In [8]:
def confidenceMethod(sentence, prompt_template, curr_sentence, maps):
    """Classify ``sentence`` with the prompt recorded for it in ``maps``.

    Args:
        sentence: Text to classify.
        prompt_template: Sequence of prompt format strings (each with one ``{}``
            slot); it is indexed with ``maps[curr_sentence]['prompt']``. A stored
            index of -1 selects the last template through Python's negative
            indexing.
        curr_sentence: Key of this sentence in ``maps``.
        maps: Dict mapping sentence position to a record with a ``'prompt'``
            entry.

    Returns:
        int: 0 (negative) or 1 (positive), whichever label token has the higher
        probability at the last prompt position. Uses the notebook-level
        ``tokenizer``, ``model``, ``device`` and ``class_idx``.
    """
    # Fill the sentence into the template chosen for it.
    chosen_idx = maps[curr_sentence]['prompt']
    filled_prompt = prompt_template[chosen_idx].format(sentence)

    # Tokenise and run a gradient-free forward pass.
    encoded = tokenizer(filled_prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**encoded)

    # Two-way softmax over the label-token logits at the final position.
    label_token_ids = [class_idx[0], class_idx[1]]
    label_probs = torch.softmax(outputs.logits[:, -1, label_token_ids], dim=-1)

    # Index of the more probable label (0 = negative, 1 = positive).
    return torch.argmax(label_probs, dim=-1).item()

In [9]:
# Re-classify every sentence with the prompt recorded for it in `maps` and
# report the resulting metrics.
# NOTE: the prompt for each sentence was selected using the ground-truth labels
# of this same data (see classify_Mexample), so these figures are an oracle
# upper bound rather than a held-out estimate.
all_labels = test_data["label"]  # Ground truth labels
all_preds = []

# Per-sentence log of which prompt index was used.
data = {
    "statement": [],
    "prompt": []
}

for curr_sentence, sentence in enumerate(tqdm(test_data["content"])):
    data["statement"].append(sentence)
    data["prompt"].append(maps[curr_sentence]['prompt'])
    all_preds.append(confidenceMethod(sentence, prompts, curr_sentence, maps))

print("Evaluation Metrics for Oracle method:")
print(classification_report(all_labels, all_preds, target_names=["negative", "positive"], digits=4))
print("\n" + "="*50 + "\n")

100%|██████████| 1821/1821 [01:12<00:00, 24.98it/s]

Evaluation Metrics for Oracle method:
              precision    recall  f1-score   support

    negative     0.9986    0.7851    0.8791       912
    positive     0.8225    0.9989    0.9021       909

    accuracy                         0.8918      1821
   macro avg     0.9105    0.8920    0.8906      1821
weighted avg     0.9107    0.8918    0.8906      1821






