# LLMs for Classification Workshops

### Workshops
- [Amazon Bedrock](https://catalog.us-east-1.prod.workshops.aws/workshops/a4bdb007-5600-4368-81c5-ff5b4154f518/en-US)
- [Structured Prompting](https://github.com/stephenhibbert/instructor/tree/main/docs/tutorials)
- [Prompt Engineering](https://www.promptingguide.ai/)

### Homework
- Implement a predictor using the instructor AnthropicBedrock client

In [None]:
! pip install --upgrade pip

In [None]:
! pip install -qU datasets scikit-learn matplotlib
! pip install -qU transformers torch torchvision torchaudio

In [None]:
from datasets import load_dataset

# Fetch AG News and shuffle it with a fixed seed so reruns see the same order
dataset = load_dataset("fancyzhx/ag_news").shuffle(seed=52)
train_dataset = dataset['train']

# Keep the first 100 shuffled training examples as a small working sample
train_sample = train_dataset.select(range(100))

# Sanity checks: report the sample size, then peek at the first three entries
print(f"Train sample size: {len(train_sample)}")
print(train_sample[:3])
train_sample

In [None]:
import torch
from tqdm import tqdm
from transformers import pipeline
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

def evaluate_predictions(true_labels, predictions):
    """Score predictions against the true labels.

    Args:
        true_labels: Ground-truth labels, one per example.
        predictions: Predicted labels, in the same order as ``true_labels``.

    Returns:
        A dict with a single ``'accuracy'`` key holding the value computed by
        scikit-learn's ``accuracy_score``.
    """
    return {'accuracy': accuracy_score(true_labels, predictions)}

# Class names, ordered so that each name's position is its integer label id
label_names = ["World", "Sports", "Business", "Sci/Tech"]
labels = list(range(len(label_names)))  # World (0), Sports (1), Business (2), Sci/Tech (3)

# Lookup tables in both directions: id -> name and name -> id
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

def id2label_func(id):
    """Translate an integer label id into its class name.

    Returns the string "Unknown" when the id has no entry in ``id2label``.
    """
    try:
        return id2label[id]
    except KeyError:
        return "Unknown"

def label2id_func(label):
    """Translate a class name into its integer label id.

    Returns -1 when the name has no entry in ``label2id``.
    """
    return label2id[label] if label in label2id else -1

def run_experiment(predict_func, evaluate_func, dataset_sample):
    """Run a predictor over a sample, score it, and tabulate the outcome.

    Args:
        predict_func: Callable that takes ``dataset_sample`` and returns one
            prediction per example.
        evaluate_func: Callable that takes ``(true_labels, predictions)`` and
            returns the evaluation results.
        dataset_sample: Object indexable by ``'label'`` to obtain the
            ground-truth labels.

    Returns:
        A ``(results, evaluation_results)`` tuple: a DataFrame with
        ``'True Label'`` and ``'Prediction'`` columns, and whatever
        ``evaluate_func`` returned.
    """
    # Predict first, then read the ground truth from the same sample
    predicted = predict_func(dataset_sample)
    expected = dataset_sample['label']

    metrics = evaluate_func(expected, predicted)

    # Side-by-side table of truth vs. prediction for later plotting
    comparison = pd.DataFrame({'True Label': expected, 'Prediction': predicted})

    print(f"Experiment Results: {metrics}")
    return comparison, metrics

def plot_results(results):
    """Plot a confusion matrix and a per-class bar chart of the predictions.

    Args:
        results: DataFrame with ``'True Label'`` and ``'Prediction'`` columns
            holding integer class ids (0-3).
    """
    class_ids = [0, 1, 2, 3]
    class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

    # Generate and plot the confusion matrix over all four classes
    cm = confusion_matrix(results['True Label'], results['Prediction'], labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

    # Count predictions per class. A bar plot places its bars at positions
    # 0..k-1 in index order, so a class that was never predicted must still
    # get a (zero-height) bar; otherwise the tick labels set below would be
    # attached to the wrong bars. Ids outside class_ids are dropped, matching
    # what the confusion matrix above does.
    prediction_counts = (
        results['Prediction'].value_counts().reindex(class_ids, fill_value=0)
    )

    # Plot the distribution of predictions
    plt.figure(figsize=(10, 6))
    prediction_counts.plot(kind='bar', color='skyblue')
    plt.xlabel('Label')
    plt.ylabel('Frequency')
    # Title is predictor-agnostic: this helper is reused for every predictor
    plt.title('Distribution of Predictions')
    plt.xticks(ticks=list(range(len(class_names))), labels=class_names, rotation=0)
    plt.grid(axis='y')
    plt.show()

In [None]:
def random_predictor(test_data):
    """Baseline predictor: guess a label at random for every example.

    Returns a list with ``len(test_data)`` entries, each drawn independently
    from ``labels`` with ``random.choice``.
    """
    guesses = []
    for _ in range(len(test_data)):
        guesses.append(random.choice(labels))
    return guesses

# Run the experiment with the random predictor on the sample,
# then plot its confusion matrix and prediction distribution
results, evaluation_results = run_experiment(random_predictor, evaluate_predictions, train_sample)
plot_results(results)

### Now we can try a different predictor function... how about always predicting the same thing?

In [None]:
def constant_predictor(test_data):
    """Baseline predictor: return label 0 for every example in ``test_data``."""
    # How about if we just always predict 0?
    return [0] * len(test_data)

# Run the experiment with the constant predictor on the same sample and plot the results
results, evaluation_results = run_experiment(constant_predictor, evaluate_predictions, train_sample)
plot_results(results)
# Pretty bad, but maybe slightly better than random 🙃

Okay, let's try to be a bit smarter. I know you're very comfortable deploying supervised learning algorithms, but I want you to take a purely transfer-learning approach for now.
We will apply some pretrained large models and experiment with prompt engineering before visiting any training from scratch or fine-tuning.

 Let's grab a large(ish) model from Hugging Face and see how it performs on the task of zero-shot text classification:
 https://huggingface.co/tasks/zero-shot-classification

 Have a read about the BART model architecture here: https://arxiv.org/pdf/1910.13461
 
*We present BART, a denoising autoencoder for pretraining sequence-to-sequence models.
BART is trained by (1) corrupting text with an
arbitrary noising function, and (2) learning a
model to reconstruct the original text. It uses
a standard Transformer-based neural machine
translation architecture which, despite its simplicity, can be seen as generalizing BERT (due
to the bidirectional encoder), GPT (with the
left-to-right decoder), and many other more recent pretraining schemes.*

We'll start by copying the example in the link above.

In [None]:
from transformers import pipeline

# Reference for the pipeline class used in this example:
# https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline

# Only the model name is passed here; no task argument is given.
pipe = pipeline(model="facebook/bart-large-mnli")
# Score one example sentence against a handful of free-form candidate labels
output = pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)

# Show the raw pipeline output
print(output)

In [None]:
! pip install -qU tqdm

In [None]:
# Let's speed things up - run the pipeline on the GPU when one is available (examples are still classified one at a time below)

import torch
from tqdm import tqdm
from transformers import pipeline

# Check if GPU is available: use device 0 when CUDA is present,
# otherwise -1 (CPU, per the transformers pipeline docs)
device = 0 if torch.cuda.is_available() else -1

# Rebuild the pipeline with the task named explicitly and placed on the chosen device
pipe = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=device)

def bart_large_mnli_predictor(test_data):
    """Classify each example with the zero-shot ``pipe`` and return label ids.

    Every item of ``test_data`` is expected to expose its text under the
    ``'text'`` key. The text is scored against the four class names, the
    highest-scoring name is selected, and ``label2id_func`` converts it to
    its integer id.

    Returns:
        A list of integer label ids, one per example, in input order.
    """
    predicted_ids = []
    for example in tqdm(test_data, desc="Processing"):
        result = pipe(example['text'], candidate_labels=['World', 'Sports', 'Business', 'Sci/Tech'])
        scores = result['scores']
        # Position of the (first) highest score picks the winning label name
        best_index = scores.index(max(scores))
        winner = result['labels'][best_index]
        predicted_ids.append(label2id_func(winner))
    return predicted_ids

# Run the experiment with the BART zero-shot predictor; like the other predictors,
# it takes the dataset sample and returns one label id per example
results, evaluation_results = run_experiment(bart_large_mnli_predictor, evaluate_predictions, train_sample)
plot_results(results)

### Use Anthropic Claude 3 on Bedrock
Apply the examples from the instructor structured prompting notebooks to this. Implement the `claude_haiku_predictor` function below

In [None]:
def claude_haiku_predictor(test_data):
    """Homework stub: predict labels for ``test_data`` with Claude 3 on Bedrock.

    To be implemented by applying the instructor structured-prompting
    examples. It is meant to be passed to ``run_experiment`` like the other
    predictors, i.e. to take the dataset sample and return one prediction
    per example.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # NotImplementedError is the conventional marker for an unfinished stub;
    # it subclasses Exception, so code catching Exception still works.
    raise NotImplementedError("Not implemented")

### (Optional) What about a capable open source LLM? See if you can adapt this to work with Llama3.1

The following uses a small Llama 3.1 model. Note that you'll need to accept the T&Cs and wait a few minutes before you can use the model, as it is gated on the HF Hub.
https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct