 Notes before running code:


*   Before starting the session, go to Edit -> Notebook settings and select the T4 GPU as the hardware accelerator.
*   When you run the install cells, a warning window will pop up asking you to restart your session. Wait until the cell has finished running before accepting.





In [None]:
# Upgrade the deep-learning stack (Keras, KerasNLP, TensorFlow) and the Hugging Face `datasets` package.
!pip install -U keras keras-nlp tensorflow datasets
# Quietly install the topic-modelling stack; hdbscan is pinned to version 0.8.33 here.
!pip install -q bertopic sentence-transformers umap-learn hdbscan==0.8.33

In [None]:
# Upgrade numpy to the newest available release.
!pip install -U numpy
# NOTE(review): this force-reinstalls hdbscan WITHOUT a version pin, so it may
# replace the hdbscan==0.8.33 pin from the previous install cell -- presumably
# to rebuild hdbscan against the upgraded numpy; confirm this is intended.
!pip install --force-reinstall hdbscan

In [None]:
from datasets import load_dataset
import pandas as pd

# Download the BBC News corpus from the Hugging Face Hub and keep its
# "train" split as a pandas DataFrame for the rest of the notebook.
dataset = load_dataset("SetFit/bbc-news")
train_split = dataset["train"]
df = pd.DataFrame(train_split)

# Preview the first rows
df.head()


Using BERTopic

In [None]:
import hdbscan
from bertopic import BERTopic

# Raw article texts, one string per document, in DataFrame row order
docs = df["text"].tolist()

# Custom clustering step handed to BERTopic in place of its default HDBSCAN.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=70,      # bigger = fewer topics
    min_samples=2,            # bigger = more points classified as noise
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,     # keep the data HDBSCAN needs to assign clusters to new documents later
)

# Topic model: sentence-transformer embeddings + the clustering model above
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit on the corpus; `topics` holds one topic id per entry of `docs`
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Store each document's BERTopic assignment next to its ground-truth label
df["bertopic_topic"] = topics

# Contingency table: true category (rows) vs. discovered topic id (columns)
pd.crosstab(index=df["label_text"], columns=df["bertopic_topic"])

**Fine-tune an LLM for topic classification** (a classification head trained on top of a frozen BERT encoder)

In [None]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')


# Class frequencies, computed once and reused for the bars and their annotations
label_counts = df["label_text"].value_counts()
num_classes = len(label_counts)

# One Dark2 colour per class
colors = plt.cm.Dark2(np.linspace(0, 1, num_classes))

# Horizontal bar chart of documents per category
label_counts.plot.barh(
    title="Topic (n, %)",
    ylabel="Topic Name",
    color=colors,
    figsize=(9, 9),
)

# Write "count, percentage%" at the end of each bar, in the bar's own colour
for i, (v, c) in enumerate(zip(label_counts, colors)):
    plt.text(
        v,
        i,
        " " + str(v) + ", " + str(round(v * 100 / df.shape[0], 2)) + "%",
        color=c,
        va="center",
        fontweight="bold",
    )

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras_nlp
# Keep every row's original position in its own column
df["original_index"] = df.index

# One-hot encode the integer labels (five news categories)
y = tf.keras.utils.to_categorical(df["label"].values, num_classes=5)

# Shuffled 75 % / 25 % split of texts and one-hot labels, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Maximum number of tokens per input sequence
sequence_length = 512

# Text preprocessing pipeline that turns raw strings into model inputs of that length
bert_preprocess = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_base_en_uncased", sequence_length=sequence_length
)

# Pretrained BERT encoder; frozen, so only the classification head below is trained
bert_encoder = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
bert_encoder.trainable = False


def _token_input(name):
    """Create one int32 Keras input of shape (sequence_length,) with the given name."""
    return tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name=name)


# The three model inputs, in the order they are later passed to fit/predict
input_ids = _token_input("input_ids")
padding_mask = _token_input("padding_mask")
segment_ids = _token_input("segment_ids")

# Run the encoder and keep only its pooled output
encoder_outputs = bert_encoder(
    {
        "token_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
    }
)
x = encoder_outputs["pooled_output"]

# Dropout regularisation followed by the 5-way softmax classification head
x = tf.keras.layers.Dropout(0.2)(x)
outputs = tf.keras.layers.Dense(5, activation="softmax")(x)

model = tf.keras.Model(inputs=[input_ids, padding_mask, segment_ids], outputs=outputs)
model.summary()


In [None]:
# Raw texts -> string tensors
x_train, x_test = (
    tf.convert_to_tensor(texts, dtype=tf.string) for texts in (x_train, x_test)
)

# One-hot labels -> float32 tensors
y_train, y_test = (
    tf.convert_to_tensor(labels, dtype=tf.float32) for labels in (y_train, y_test)
)

# Run the preprocessor on the raw strings; each result is indexed by
# "token_ids", "padding_mask" and "segment_ids" further down
x_train_tokens = bert_preprocess(x_train)
x_test_tokens = bert_preprocess(x_test)

# Collapse the one-hot rows back to integer class ids
y_train_int = y_train.numpy().argmax(axis=1)
y_test_int = y_test.numpy().argmax(axis=1)

In [None]:
# Upper bound on the number of passes over the training data
n_epochs = 10

# Halt training once val_loss has not improved for 3 epochs and roll back
# to the weights of the best epoch
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# Integer class ids are used as targets, hence the sparse cross-entropy loss
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


def _as_model_inputs(tokens):
    """Order the preprocessor outputs the way the model's input list expects them."""
    return [tokens["token_ids"], tokens["padding_mask"], tokens["segment_ids"]]


# NOTE(review): the test split doubles as the validation set that drives early
# stopping, so metrics later reported on that split are not from unseen data.
model_fit = model.fit(
    _as_model_inputs(x_train_tokens),
    y_train_int,
    validation_data=(_as_model_inputs(x_test_tokens), y_test_int),
    epochs=n_epochs,
    batch_size=8,
    callbacks=[earlystop_callback],
)


In [None]:
from sklearn.metrics import classification_report

# Class probabilities for every article in the test split
y_pred = model.predict([x_test_tokens["token_ids"], x_test_tokens["padding_mask"], x_test_tokens["segment_ids"]])
# Most probable class id per article
y_pred_int = y_pred.argmax(axis=1)

# Build the id -> name lookup from the data itself so the report rows show
# category names instead of bare integers
label_lookup = df.drop_duplicates("label").sort_values("label")

# Passing `labels` together with `target_names` keeps names and ids aligned
# even if a class happens to be absent from the test split
print(classification_report(
    y_test_int,
    y_pred_int,
    labels=label_lookup["label"].tolist(),
    target_names=label_lookup["label_text"].tolist(),
))

In [None]:
import numpy as np

def _as_text(values):
    """Return `values` as a list of Python strings.

    String tensors convert to NumPy arrays of UTF-8 `bytes`; those are decoded
    so the resulting DataFrame column holds `str` rather than `b'...'` objects.
    Values that are already `str` are passed through unchanged.
    """
    return [v.decode("utf-8") if isinstance(v, bytes) else v for v in np.asarray(values)]


# Predict probabilities on test set
y_test_probs = model.predict([
    x_test_tokens["token_ids"],
    x_test_tokens["padding_mask"],
    x_test_tokens["segment_ids"]
])
# Convert to predicted class integers
y_test_pred = np.argmax(y_test_probs, axis=1)

# Predict on training set
y_train_probs = model.predict([
    x_train_tokens["token_ids"],
    x_train_tokens["padding_mask"],
    x_train_tokens["segment_ids"]
])
y_train_pred = np.argmax(y_train_probs, axis=1)

# One row per article (training rows first, then test rows) with its true and
# predicted class id. x_train / x_test are string tensors at this point, so the
# texts are decoded back to str before they go into the frame.
df_all = pd.DataFrame({
    "text": _as_text(x_train) + _as_text(x_test),
    "actual": np.concatenate([y_train_int, y_test_int]),
    "predicted": np.concatenate([y_train_pred, y_test_pred])
})


In [None]:

# Category names, positioned so that the list index equals the integer class id
class_names = ["tech", "business", "sport", "entertainment", "politics"]

# Translate the true and predicted class ids into readable names
df_all["actual_name"] = [class_names[class_id] for class_id in df_all["actual"]]
df_all["predicted_name"] = [class_names[class_id] for class_id in df_all["predicted"]]

df_all.head()

Using ChatGPT

In [None]:
# Show the full text of the first article in the corpus
first_article = df["text"].iloc[0]
print(first_article)


In [None]:

# Step 2: Import libraries
import openai
import getpass

import os

# Take the API key from the OPENAI_API_KEY environment variable when it is set,
# so the notebook can run top to bottom without stopping for input; otherwise
# fall back to a hidden prompt. The key is never written into the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")


In [None]:
# Instruction and one article, sent as a single user message
single_prompt = "respond with one word. Is the following text about entertainment, tech, politics, sport, or business? text: wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh rugby union  despite a spat in 2001 over the collapse of kieron cunningham s proposed Â£500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through  before the talented stand-off returned to the 13-man code with bradford bulls. kel coslett  who famously moved from wales to league in the 1960s  is currently saints  football manager  while clive griffiths - wales  defensive coach - is a former st helens player and is thought to be the man behind the latest initiative. scott gibbs  the former wales and lions centre  played for st helens from 1994-96 and was in the challenge cup-winning team at wembley in 1996."
messages = [{"role": "user", "content": single_prompt}]

# Ask the chat model for the category; the reply is capped at 50 tokens
response = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    max_tokens=50,
)

# Text of the first (and only) returned choice
first_choice = response.choices[0]
print(first_choice.message.content)


In [None]:
# Instruction prepended to every article before it is sent to the model
fixed_prompt = "respond with one word. Is the following text about entertainment, tech, politics, sport, or business? text: "

# One record per article, so the predictions can be compared with the true labels
gpt_results = []

# Classify the first 10 articles with the chat model
for i, text in enumerate(df['text'][:10]):
    prompt = fixed_prompt + text
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
        temperature=0,  # make the answer as repeatable as possible across runs
    )
    # The reply content can be None, so fall back to an empty string
    predicted_label = (response.choices[0].message.content or "").strip()
    gpt_results.append({
        "row": i,
        "predicted": predicted_label,
        "actual": df["label_text"].iloc[i],
    })
    print(f"Row {i}:")
    print(f"Input: {text}")
    print(f"Output: {predicted_label}")
    print("-----")

# Side-by-side view of the predicted and true category for each article
pd.DataFrame(gpt_results)

In [None]:
# True categories of the first 10 articles, for comparison with the model's answers
first_labels = df["label_text"].head(10)
print(first_labels)
