# Classification using large language models


### Goal: 

Given a bunch of text snippets written by either AI or a human, the goal is to post-train a large language model and make it accurately predict whether a text snippet is AI-generated or not.



******************
$\textit{Author:}$ Simon Guldager \
$\textit{Date:}$ 1-10-2024

In [1]:
import textwrap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# TensorFlow and transformers
import tensorflow as tf
from transformers import BertTokenizer, TFAutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


some useful functions

In [3]:
def print_row(data, idx, target_column, symbols_per_line=100, max_lines = None):
    """
    Print a row of a DataFrame with word wrapping

    Parameters
    ----------
    data : DataFrame
        The DataFrame containing the data to be printed
    idx : int
        The index of the row to be printed
    target_column : str
        The name of the column to be printed
    symbols_per_line : int
        The number of symbols per line at which to wrap the text
    
    Returns
    -------
    None
        
    """
    value = str(data[target_column].iloc[idx])

    # Remove excess spaces
    value = ' '.join(value.split())
    
    # Wrap text while respecting word boundaries
    wrapped_value = textwrap.fill(value, width=symbols_per_line)

    if max_lines is not None:
        wrapped_value = '\n'.join(wrapped_value.split('\n')[:max_lines])
    
    print(wrapped_value)
    return

def print_list(list, elements_per_line=10):
    """
    Print a list with a specified number of elements per line
    """

    list_range = range(0, len(list), elements_per_line)

    for i in list_range:
        print(*list[i:i+elements_per_line])
    return

# Define a function to find unique words in a text
def find_unique_words(text, word_list):
    """
    Find unique words in a text. Append the unique words to a list.
    """
    for word in text.split():
        if word not in word_list:
            word_list.append(word)
    return

# print table summary with total samples in each dataset, positive samples, and negative samples
def pretty_print_info_table(labels_train, labels_test, validation=False):
    """  
    This function was made by Anton Golles.
    """


    val_str = 'Val.' if validation else 'Test'
    data_for_table = {
        'train': {
            'total': len(labels_train),
            'pos': np.sum(labels_train),
            'neg': len(labels_train) - np.sum(labels_train)
        },
        'test': {
            'total': len(labels_test),
            'pos': np.sum(labels_test),
            'neg': len(labels_test) - np.sum(labels_test)
        }
    }
    data_for_table['total'] = {
        'total': data_for_table['train']['total'] + data_for_table['test']['total'],
        'pos': data_for_table['train']['pos'] + data_for_table['test']['pos'],
        'neg': data_for_table['train']['neg'] + data_for_table['test']['neg']
    }

    print("""
    Info table:
    +----------------+---------+---------+-------+
    |                | Training| {}    | Total
    +----------------+---------+---------+-------+
    | Total samples  | {:7d} | {:7d} | {:7d} |
    | Pos. samples   | {:7d} | {:7d} | {:7d} |  (AI generated)
    | Neg. samples   | {:7d} | {:7d} | {:7d} |  (Human written)
    """.format(
        val_str,
        data_for_table['train']['total'],
        data_for_table['test']['total'],
        data_for_table['total']['total'],
        data_for_table['train']['pos'],
        data_for_table['test']['pos'],
        data_for_table['total']['pos'],
        data_for_table['train']['neg'],
        data_for_table['test']['neg'],
        data_for_table['total']['neg'],
        
        ))
    return

def plot_classificiation_results(history):

    # make a list of the train and val metrics
    metrics = list(history.history.keys())
    
    # make lists of train and val metrics
    val_metrics = [entry for entry in metrics if entry.startswith('val_')]
    train_metrics = [entry for entry in metrics if not entry.startswith('val_')]

    # the number of metrics to plot
    Nmetrics = len(val_metrics)
    width = 6 * Nmetrics

    fig, ax = plt.subplots(ncols = Nmetrics, figsize=(width, 6))

    # plot the train and val results for each metric
    for i, axx in enumerate(ax):
        ax[i].plot(history.history[train_metrics[i]], label='train', alpha = 0.7)  
        ax[i].plot(history.history[val_metrics[i]], label='validation', alpha = 0.7)
        ax[i].set_ylabel(f'{train_metrics[i].capitalize()}')
        ax[i].set_xlabel('Epoch')
        ax[i].legend(loc='best')
    fig.tight_layout()
    return fig, ax 

def evaluate_binary_classification_results(model, X_train, y_train, X_val, y_val, X_test = None, y_test = None,\
                                           metrics = [accuracy_score], metric_names = ['Accuracy']):
    
    # make predictions
    y_pred_train = model.predict(X_train, verbose = 0)   
    y_pred_val = model.predict(X_val, verbose = 0)

    # if test data is provided, make predictions
    if X_test is not None and y_test is not None:
        y_pred_test =  model.predict(X_test, verbose = 0)

    # calculate metrics
    for metric, metric_name in zip(metrics, metric_names):
        metric(y_train, y_pred_train.round())
        print(f'{metric_name} on training data: {metric(y_train, y_pred_train.round()):.3f}')
        print(f'{metric_name} on validation data: {metric(y_val, y_pred_val.round()):.3f}')
        if X_test is not None and y_test is not None:
            print(f'{metric_name} on test data: {metric(y_test, y_pred_test.round()):.3f}')
    return

#### Step 1: Read the data and have a look.  

In [4]:
data = pd.read_csv('ai_human_training_data.csv', index_col=0)
data['label'] = data['label'].astype(int)
data['text'] = data['text'].astype(str)

# make a copy of the original data
data_orig = data.copy()


print('Data shape:', data.shape)
# label = 1 for AI generated text, label = 0 for human written text
data.head()

Data shape: (2854, 2)


Unnamed: 0,text,label
0,Retsmedicinere har forgæves forsøgt at finde u...,0
1,Som politiker er jeg meget opmærksom på vigtig...,1
2,Som en enkeltmandsvirksomhed har jeg fleksible...,1
3,- Det er simpelthen for at kunne levere de ord...,0
4,Optræden af høje tindinger med årene er en nat...,0


In [None]:
# print the first 5 rows of the data
for i in range(0, 5):
    print("AI-generated text? ", bool(data['label'].iloc[i]))
    print("Text:")
    print_row(data, i, 'text', symbols_per_line=100)
    print("\n")


#### Step 2: Decide which model to use and download model and tokenizer.

We are going to use the multilingual BERT, which has been trained on >100 languages incl. Danish

For other models, see https://huggingface.co/transformers/v3.0.2/pretrained_models.html

In [5]:
# choose model
model_name = 'bert-base-multilingual-cased'

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name, use_fast=True)
# Load pre-trained BERT model for binary classification
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Step 3: Split data into training, validation and test set. 

In [6]:
# it takes quite a while to post-train the model, so we will use a subset of the data
use_all_samples = False
# number of samples to use
Nsamples = 1200

# only use a subset of the data if use_all_samples is False
if not use_all_samples:
    data = data_orig.copy()
    data = data.sample(Nsamples, random_state=42)
    
# Split data into training/validation and test sets
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
    )

# Split training/validation data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.2, random_state=42
    )

pretty_print_info_table(train_labels, val_labels, validation=True)
pretty_print_info_table(train_labels, test_labels, validation=False)


    Info table:
    +----------------+---------+---------+-------+
    |                | Training| Val.    | Total
    +----------------+---------+---------+-------+
    | Total samples  |     768 |     192 |     960 |
    | Pos. samples   |     364 |      95 |     459 |  (AI generated)
    | Neg. samples   |     404 |      97 |     501 |  (Human written)
    

    Info table:
    +----------------+---------+---------+-------+
    |                | Training| Test    | Total
    +----------------+---------+---------+-------+
    | Total samples  |     768 |     240 |    1008 |
    | Pos. samples   |     364 |     114 |     478 |  (AI generated)
    | Neg. samples   |     404 |     126 |     530 |  (Human written)
    


#### Step 4: Tokenize the text data

In [7]:
# Define the maximum length of the input text (number of tokens). 
# Increasing this value will increase the computational cost, but may improve model performance.
max_length_of_input = 128

# Tokenize the text inputs for BERT
def tokenize_texts(texts, max_len=max_length_of_input):
    return tokenizer(
        texts.tolist(), 
        padding=True, 
        truncation=True, 
        max_length=max_len, 
        return_tensors='tf'
    )

# Tokenize the text data
train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

#### Step 5: Compile and post-train the model

NB: 
* It takes a while, so we keep the number of epochs small for the purpose of this exercise. 
* A large batch size takes a lot of RAM. You might consider looking at your memory usage during training to see how much you can crank it up (thus making training faster).

In [8]:
# set the central parameters for compiling and training
learning_rate = 2e-5
epochs = 2
batch_size = 48

# Convert labels to TensorFlow format
train_labels_tf = tf.convert_to_tensor(train_labels.values)
val_labels_tf = tf.convert_to_tensor(val_labels.values)
test_labels_tf = tf.convert_to_tensor(test_labels.values)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    train_encodings['input_ids'],
    train_labels_tf,
    validation_data=(val_encodings['input_ids'], val_labels_tf),
    epochs=epochs,  
    batch_size=batch_size,
)


Epoch 1/2
Epoch 2/2


#### Step 6: Evaluate your model on the validation data. 

In [9]:
# Predict on the validation set
logits = model.predict(val_encodings['input_ids']).logits
predictions = np.argmax(logits, axis=1)

# Evaluate the model
accuracy = accuracy_score(val_labels, predictions)
report = classification_report(val_labels, predictions, target_names=['AI', 'Human'])

print(f"\nVal. Accuracy: {accuracy:.3f}")
print("\nClassification Report:\n", report)


Val. Accuracy: 0.792

Classification Report:
               precision    recall  f1-score   support

          AI       0.83      0.74      0.78        97
       Human       0.76      0.84      0.80        95

    accuracy                           0.79       192
   macro avg       0.79      0.79      0.79       192
weighted avg       0.80      0.79      0.79       192



#### Final step: Once you have completely finished training and evaluating your model, see how well it generalizes by testing it on the unseen test data

In [10]:
# Predict on the test set
logits = model.predict(test_encodings['input_ids']).logits
predictions = np.argmax(logits, axis=1)

# Evaluate the model
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions, target_names=['AI', 'Human'])

print(f"\nTest accuracy: {accuracy:.3f}")
print("\nClassification Report:\n", report)


Test accuracy: 0.800

Classification Report:
               precision    recall  f1-score   support

          AI       0.88      0.71      0.79       126
       Human       0.74      0.89      0.81       114

    accuracy                           0.80       240
   macro avg       0.81      0.80      0.80       240
weighted avg       0.81      0.80      0.80       240



*************************************

# Exercises

### Exercise 1: 

Go through this notebook step by step and make sure you understand what we do and why

### Exercise 2: 

Can you tell which texts are AI-generated? Try it by running the cell below (and then type in 1 for AI and 0 for human, and enter to see a new text)


In [None]:
Ntexts = 15

# choose Ntexts random indices
random_indices = np.random.choice(data.shape[0], Ntexts, replace=False)

# get the actual labels
true_labels = data['label'].iloc[random_indices].values
# make a list to hold user guesses
your_guesses = []

# print the texts corresponding to the random indices, and make the user guess if the text is AI-generated or human-written
for i, idx in enumerate(random_indices):
    print(f"AI-generated text? Text {i+1}/{Ntexts}")
    print_row(data, idx, 'text', symbols_per_line=100, max_lines=5)
    print("\n")
    
    guess = input("1 if AI-generated, 0 if human-written: ")
    
    # keep asking until the input is valid
    while guess not in ['0', '1']:
        guess = input("Invalid input. Please enter 1 if AI-generated, 0 if human-written: ")

    your_guesses.append(int(guess))

# Evaluate the model
accuracy = accuracy_score(true_labels, np.array(your_guesses))

print("--------------------------------------------------------")
print(f"Congratulations! Your accuracy was: {accuracy * 100:.2f}%")
print("--------------------------------------------------------")



### Exercise 3: How accurate can you make your model?

The parameters used so far have been chosen simply to make training fast, and there is a lot of room for improvement! 

Try getting as high an accurary as possible by making adjustments and tweaks as you see fit.

$\textit{You might consider}$:
* changing the maximum allowed input length (i.e. maximum number of tokens in a text) before tokenization. We used max_length_of_input = 32 in the tokenization step, meaning that the model only trains the first 32 words/tokens of each text.
* increasing the number of texts (we only use 600 now, but there are 2854 in the CSV)
* increasing the number of epochs (we use just 1!)
* increasing the batch size (if your RAM can handle it)


