# Chapter 04

In [None]:
from transformers import T5Tokenizer,T5ForConditionalGeneration

# Single checkpoint name shared by the model and the tokenizer so the two
# are always loaded from the same pretrained weights/vocabulary.
model_name = "t5-small"

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
from transformers import pipeline

import torch

# Wrap the already-loaded model and tokenizer in a text2text-generation
# pipeline. Use the first GPU when CUDA is available; otherwise fall back to
# the CPU so the cell does not fail on machines without a GPU.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
from datasets import load_dataset
# Load the "rotten_tomatoes" dataset and display the returned object
# as the cell output.
data = load_dataset(
    "rotten_tomatoes"
)
data

In [None]:
# Instruction prepended to every review. The trailing space separates the
# question from the review text; without it the two are fused together
# ("...negative?the rock is ...").
prompt = "Is the following sentence positive or negative? "

# Add a "t5" column holding the full model input (instruction + review text).
data = data.map(lambda example: {"t5": prompt + example["text"]})
data

In [None]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
    """Print a classification report comparing predictions with true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. The report text is printed, not returned.
    """
    # Display names passed to the report for the two classes.
    class_names = ["Negative Review", "Positive Review"]
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

In [None]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset


# Feed the "t5" column of the test split through the pipeline, with a
# progress bar sized to the number of test examples.
predictions = tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"]))

# Map each generated answer to a label id: 0 when the text is exactly
# "negative", 1 for any other output.
y_pred = [
    0 if output[0]["generated_text"] == "negative" else 1
    for output in predictions
]



In [None]:
# Score the predictions against the labels of the test split.
y_true = data["test"]["label"]
evaluate_performance(y_true, y_pred)

# Chapter 05

In [None]:
from datasets import load_dataset
# Load the "maartengr/arxiv_nlp" dataset and keep only its "train" split.
arxiv_nlp = load_dataset("maartengr/arxiv_nlp")
dataset = arxiv_nlp["train"]

In [None]:
# Display the loaded train split as the cell output.
dataset

In [None]:
# Pull the two columns used below: the abstracts (embedded and clustered)
# and the titles (attached to the plotting dataframe).
abstracts, titles = dataset["Abstracts"], dataset["Titles"]

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model, then encode every abstract while
# showing a progress bar.
embedding_model = SentenceTransformer("thenlper/gte-small")
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)

In [None]:
# Show the shape of the embeddings produced by `encode` above.
embeddings.shape

In [None]:
from umap import UMAP

# Reduce the embeddings to 5 components with UMAP before clustering.
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
from hdbscan import HDBSCAN
# Fit HDBSCAN on the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
).fit(reduced_embeddings)

# One label per document; show how many distinct labels were assigned.
clusters = hdbscan_model.labels_
len(set(clusters))

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

# Project the embeddings down to 2 dimensions purely for visualization.
# NOTE: this rebinds `reduced_embeddings`, replacing the 5-component
# projection that the clustering cell was fitted on.
umap_2d = UMAP(n_components=2, min_dist=0.0, metric='cosine', random_state=42)
reduced_embeddings = umap_2d.fit_transform(embeddings)

# One row per document: 2D coordinates, title, and cluster label as a string.
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"]).assign(
    title=titles,
    cluster=list(map(str, clusters)),
)

# Split documents into outliers (label "-1") and clustered documents.
is_outlier = df["cluster"] == "-1"
clusters_df = df.loc[~is_outlier, :]
outliers_df = df.loc[is_outlier, :]

# Plot outliers and non-outliers separately: outliers as faint grey points,
# clustered documents colored by their integer cluster label.
plt.scatter(outliers_df["x"], outliers_df["y"], alpha=0.05, s=2, c="grey")
plt.scatter(
    clusters_df["x"],
    clusters_df["y"],
    c=clusters_df["cluster"].astype(int),
    alpha=0.6,
    s=2,
    cmap='tab20b',
)
plt.axis('off')

# Chapter 06

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the causal language model and its tokenizer.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# Load the model onto the GPU. The dtype is left to "auto", and
# trust_remote_code=True allows code shipped with the checkpoint to run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text-generation pipeline: return only the newly generated text (not the
# prompt), cap generation at 500 new tokens, and disable sampling.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# Chat-style prompt consisting of a single user turn.
messages = [
    {"role": "user", "content": "Create a funny joke about chickens."},
]

# Run generation and display the generated text of the first result.
output = pipe(messages)
output[0]["generated_text"]

' Why did the chicken join the band? Because it had the drumsticks!'

In [23]:
# Render the chat messages through the tokenizer's chat template as a plain
# string (tokenize=False) so the raw prompt format can be inspected.
prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
)
print(prompt)

<|user|>
Create a funny joke about chickens.<|end|>
<|endoftext|>
