This notebook was originally written by Dr. Ringel; we have repurposed it as part of our Major Class Project. To run it, select the T4 GPU runtime type (set in the top-right corner of Colab), download the "complete_training_data.csv" and "validation_hand_labeled_cleaned.csv" files, and upload them to the runtime. Note that we do not use synthetic data; instead we use real restaurant reviews labeled by ChatGPT.

Bernie Chen and Nahum Yared

# Creating Synthetic Experts with Generative AI
> ## Train Synthetic Expert on AI labeled texts (Service Quality Dimensions)

(Original)
Version 1.0  
Date: September 2, 2023    
Author: Daniel M. Ringel    
Contact: dmr@unc.edu

(Edited)   
Date: April 27th, 2024   

*Daniel M. Ringel, Creating Synthetic Experts with Generative Artificial Intelligence (July 15, 2023).  
Available at SSRN: https://papers.ssrn.com/abstract_id=4542949*


# 1. Imports

In [None]:
# Install for any modules not currently downloaded
!pip install transformers[torch]
!pip install accelerate -U
!pip install krippendorff
!pip install datasets



In [None]:
import pandas as pd
import numpy as np, warnings
import random
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import krippendorff
import torch
from transformers import TrainingArguments, Trainer, EvalPrediction, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, IntervalStrategy
from datasets import Dataset, DatasetDict
from datetime import datetime
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

# 2. Configure

In [None]:
# --- Paths -------------------------------------------------------------------
# Location of "complete_training_data.csv" in the runtime (edit if you stored it elsewhere)
filepath_to_data = "/content/complete_training_data.csv"

# Directory in which checkpoints, logs and the final model are stored
Training_Path = "/content"

# --- Controls ----------------------------------------------------------------
# P:    percentile of the token-count distribution used as max tokens
# T:    share of the data held out as the test split
# seed: seed used everywhere
P, T, seed = 95, 0.2, 44

# --- Pre-trained LLM to fine-tune ---------------------------------------------
# ---> Select from thousands at: https://huggingface.co/models and "plug-in" alternative model name
pretrained = "DistilRoBERTa-base"

# --- Basic hyperparameters ----------------------------------------------------
# (classifier performance can vary with different parameter settings)
hyperparameters = dict(
    learning_rate=6.7e-06,
    per_device_train_batch_size=16,
    weight_decay=1.1e-05,
    num_train_epochs=3,
    warmup_steps=500,
)

In [None]:
print(f"PyTorch version: {torch.__version__}")

# Pick the accelerator: Apple's mps backend first, then CUDA, otherwise fall back to the CPU.
if (
    "backends" in dir(torch)
    and hasattr(torch.backends, "mps")
    and torch.backends.mps.is_built()
    and torch.backends.mps.is_available()
):
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Report the choice so a slow CPU-only run does not come as a surprise.
if device != "cpu":
    print(f"GPU available! Using >>> {device} <<< for training")
else:
    print("No GPU found, using >>> CPU <<< for training, which will be slow.")

PyTorch version: 2.2.1+cu121
GPU available! Using >>> cuda <<< for training


# 3. Helper Functions

In [None]:
def get_tokens(text):
    """Return how many token ids the notebook-level ``tokenizer`` produces for ``text``.

    Requires ``tokenizer`` to be instantiated before the first call. The tokenizer
    is called with its default arguments, and the length of ``input_ids`` is returned.
    """
    encoded = tokenizer(text)
    return len(encoded["input_ids"])

def compute_percentile(split, P):
    """Compute the Pth percentile of the token counts of all texts in a split.

    :param split: Key of the split inside the notebook-level ``dataset`` (e.g. 'train').
    :param P: Percentile to compute, passed straight to ``np.percentile``.
    :return: The Pth percentile of the per-text token counts.
    """
    # Read the "text" column once instead of indexing the dataset row by row;
    # each dataset[split][i] access materialises the whole row, labels included.
    num_tokens = [get_tokens(text) for text in dataset[split]["text"]]
    return np.percentile(num_tokens, P)

def preprocess(examples, max_tokens):
    """Tokenize a batch of texts and attach a multi-label target matrix.

    :param examples: Batch with a "text" entry plus (optionally) one entry per label name.
    :param max_tokens: Length every text is padded / truncated to.
    :return: The tokenizer encoding with an added "labels" entry (one row per text,
             one column per name in the notebook-level ``labels`` list, in that order).
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=max_tokens)

    # Columns follow the order of the global `labels`; a label that is absent
    # from the batch keeps its all-zero column.
    available = set(examples.keys())
    targets = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        if name not in available:
            continue
        targets[:, column] = examples[name]

    encoded["labels"] = targets.tolist()
    return encoded

def multi_label_metrics(predictions: np.array, labels: np.array, threshold: float = 0.5) -> dict:
    """
    Score multi-label predictions against the ground truth.

    The raw model outputs are passed through a sigmoid and binarised at ``threshold``;
    precision, recall, F1 and ROC AUC are micro-averaged.

    :param predictions: The raw output predictions (logits) from the model.
    :param labels: The ground truth labels.
    :param threshold: Probability at or above which a label counts as predicted.
    :return: A dictionary with the keys 'precision', 'recall', 'f1', 'roc_auc'
             and 'krippendorff_alpha'.
    """
    probabilities = torch.sigmoid(torch.Tensor(predictions))
    binary_predictions = (probabilities >= threshold).numpy().astype(int)
    averaging = "micro"

    scores = {}
    scores['precision'] = precision_score(y_true=labels, y_pred=binary_predictions, average=averaging)
    scores['recall'] = recall_score(y_true=labels, y_pred=binary_predictions, average=averaging)
    scores['f1'] = f1_score(y_true=labels, y_pred=binary_predictions, average=averaging)
    # ROC AUC is computed on the probabilities, not on the thresholded predictions
    scores['roc_auc'] = roc_auc_score(y_true=labels, y_score=probabilities, average=averaging)

    # Krippendorff's alpha treats the flattened truth and the flattened predictions as two raters
    two_raters = np.vstack((labels.ravel(), binary_predictions.ravel()))
    scores['krippendorff_alpha'] = krippendorff.alpha(reliability_data=two_raters)
    return scores

def compute_metrics(eval_prediction: EvalPrediction) -> dict:
    """
    Adapter between the Trainer's EvalPrediction object and ``multi_label_metrics``.

    When the predictions arrive as a tuple, only its first element is scored.
    """
    raw_predictions = eval_prediction.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=eval_prediction.label_ids)

def seed_everything(seed = 42):
    """Seed Python, NumPy and PyTorch for replicability.

    Largely works (especially on cuda, but not so much on Apple silicon (mps)).

    :param seed: Integer seed applied to ``random``, ``PYTHONHASHSEED``, NumPy and torch.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Ask torch directly whether CUDA is present instead of reading the notebook-level
    # `device` variable, so the helper also works when that cell has not been run.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

# 4. Load and Prepare Data

In [None]:
# Read the labeled reviews and keep the text plus the five service-quality label columns.
# The column order chosen here is the order the label columns keep from now on.
TrainSample = pd.read_csv(f"{filepath_to_data}")
TrainSample = TrainSample.loc[
    :, ["text", "Reliability", "Tangibility", "Empathy", "Responsiveness", "Assurance"]
].reset_index(drop=True)
TrainSample.index.name = "ID"
TrainSample

Unnamed: 0_level_0,text,Reliability,Tangibility,Empathy,Responsiveness,Assurance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,I am saddened that I have to give this restaur...,True,False,False,False,False
1,Come and get it!! My first visit. I ordered th...,True,False,True,False,False
2,Best soul food in town. If you tired of eating...,True,False,False,False,False
3,I would leave 0 stars if I could drove 30 mins...,True,False,False,False,False
4,My first time eating here most definitely won’...,False,True,True,False,False
...,...,...,...,...,...,...
6581,Peaceful! Great Service! Always delicious food...,False,True,True,False,False
6582,My sister in law brought me to Landry's Restau...,False,True,False,False,True
6583,We love spending an afternoon on lake Pontchar...,True,False,True,False,False
6584,Great food & atmosphere,False,True,False,False,False


In [None]:
# Find the label with the fewest positive examples and stratify the split on it,
# so that the rarest label is represented proportionally in both train and test.
label_totals = TrainSample.iloc[:, 1:].sum()
minority_label = label_totals.idxmin()
train, test = train_test_split(
    TrainSample,
    test_size=T,
    stratify=TrainSample[minority_label],
    random_state=seed,
)
train

Unnamed: 0_level_0,text,Reliability,Tangibility,Empathy,Responsiveness,Assurance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3764,The Menudo is just delicious love it!!!,False,True,False,False,False
6432,I had hair in my rice,False,False,False,False,True
3822,"If you want to have wings, smoke vapor and lis...",True,False,False,True,False
1361,This place sucks they take there sweet damn ti...,False,False,False,True,False
5072,This place its been closed the last days and i...,True,False,False,False,False
...,...,...,...,...,...,...
5715,Pamela was absolutely fabulous and so kind. Sh...,False,False,True,False,False
699,Great Food and prices.. Tasty Healthy and Vega...,False,True,False,False,False
6534,"Wow, My first and last time I ever dine here. ...",True,True,False,False,False
3242,Really enjoyed the service here. Friendly fast...,False,False,True,True,False


In [None]:
# Create Hugging Face Dataset.
# Dataset.from_pandas is the constructor meant for DataFrames (from_dict expects a plain
# mapping of column name -> values); preserve_index=False keeps the pandas "ID" index
# from being added as an extra column.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train, preserve_index=False),
    "test": Dataset.from_pandas(test, preserve_index=False),
})

# Get Labels and create label dicts.
# The order of `labels` defines the order of the model's output units.
labels = [label for label in dataset['train'].column_names if label not in ['ID', 'text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [None]:
# Prohibit parallel tokenization (can lead to forking in loops and batch processing).
# Set before the tokenizer is created so it is in place for every tokenizer call.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Set Tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained)

# Compute percentile for train and test splits (percentile for max tokens) and cap it at
# the longest sequence the tokenizer reports for this model: some reviews are longer than
# that limit, and padding/truncating beyond it would break the model's position embeddings.
higher_percentile = min(
    max(compute_percentile('train', P), compute_percentile('test', P)),
    tokenizer.model_max_length,
)
max_tokens = int(higher_percentile)

# Create encoded dataset (the raw columns are dropped; only the encoding and "labels" remain)
encoded_dataset = dataset.map(
    lambda examples: preprocess(examples, max_tokens),
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Set encoded dataset to pytorch tensors
encoded_dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (698 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/5268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

# 5. Set-up Fine-Tuning of LLM

In [None]:
# Seed Python, NumPy and Torch (via the seed_everything helper) before the model is instantiated
seed_everything(seed)

# Instantiate Classifier from the checkpoint named in `pretrained`, configured for
# multi-label classification with one output per entry in `labels`.
# ---> Note: You need to set "ignore_mismatched_sizes" to "True" if fine-tuning a pre-trained classification model with different class numbers
# ---> You should get several warnings about weights of checkpoint not being used in initialization.
#      This is expected since you will train the pretrained model on downstream task.
model = AutoModelForSequenceClassification.from_pretrained(pretrained,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           ignore_mismatched_sizes=True)

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DistilRoBERTa-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set Training Arguments
# Checkpoints are written to Training_Path and logs to Training_Path/Logs; evaluation and
# checkpointing both happen once per epoch, and the training loss is logged every 10 steps.
# NOTE(review): `evaluation_strategy`, `use_mps_device` and the Trainer's `tokenizer=` argument
# appear to be deprecated/renamed in newer transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"{Training_Path}",
    evaluation_strategy="epoch",
    logging_dir=f"{Training_Path}/Logs",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=hyperparameters['per_device_train_batch_size'],
    # the evaluation batch size deliberately reuses the training batch size (no separate hyperparameter exists)
    per_device_eval_batch_size= hyperparameters['per_device_train_batch_size'],
    num_train_epochs=hyperparameters['num_train_epochs'],
    learning_rate=hyperparameters['learning_rate'],
    weight_decay=hyperparameters['weight_decay'],
    warmup_steps=hyperparameters['warmup_steps'],
    save_strategy="epoch",
    # no metric_for_best_model is given here, so the "best" model is chosen by the library's default criterion
    load_best_model_at_end=True,
    save_total_limit=2,
    use_mps_device=(device == "mps"),
    optim='adamw_torch',
    seed=seed
    # ---> You can also do a more granular evaluation than epochs at every 100 (or so) steps
    #evaluation_strategy=IntervalStrategy.STEPS,  # Evaluate every 'eval_steps'
    #eval_steps=100,                              # Evaluate every 100 steps
    #do_train=True,
    #do_eval=True,
    #save_strategy=IntervalStrategy.STEPS,        # Save every 'save_steps'
    #save_steps=100,                              # Save every 100 steps
)

# Instantiate Trainer
# The test split serves as the default evaluation set during training, and compute_metrics
# supplies the precision / recall / F1 / ROC AUC / Krippendorff's alpha columns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
print("Ready to Create Synthetic Expert")

Ready to Create Synthetic Expert


# 6. Fine-Tune and Evaluate

In [None]:
# Fine-tune the model with trainer to create Synthetic Expert.
# The start and end timestamps are printed so the wall-clock training time can be read off the output.
print(f"Started training with seed {seed} at {datetime.now()}\nFine-tuning {pretrained}")
trainer.train()
print(f"Completed training at {datetime.now()}")

Started training with seed 44 at 2024-04-29 15:57:25.274059
Fine-tuning DistilRoBERTa-base


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Roc Auc,Krippendorff Alpha
1,0.5079,0.491371,0.72449,0.272031,0.395543,0.785887,0.253198
2,0.4254,0.438574,0.688584,0.52162,0.593585,0.82969,0.46272
3,0.4243,0.416773,0.706259,0.543514,0.61429,0.850507,0.488964


Completed training at 2024-04-29 16:01:46.252058


In [None]:
# Evaluate Synthetic Expert on test data.
# The split is passed explicitly: other cells change trainer.eval_dataset, so relying on
# the trainer's current default would evaluate a different split when this cell is re-run.
print("Model performance on Test")
trainer.evaluate(eval_dataset=encoded_dataset["test"])

Model performance on Test


{'eval_loss': 0.4167732298374176,
 'eval_precision': 0.7062588904694168,
 'eval_recall': 0.5435139573070608,
 'eval_f1': 0.6142901330034024,
 'eval_roc_auc': 0.8505070270619366,
 'eval_krippendorff_alpha': 0.48896447801868304,
 'eval_runtime': 6.2114,
 'eval_samples_per_second': 212.189,
 'eval_steps_per_second': 13.362,
 'epoch': 3.0}

In [None]:
# Evaluate the Synthetic Expert on our hand-validated reviews
# Upload "validation_hand_labeled_cleaned.csv" into runtime, copy down filepath, and read in with pandas
filepath_to_validation = "/content/validation_hand_labeled_cleaned.csv"
validation_data = pd.read_csv(filepath_to_validation)

# Map the lower-case CSV columns onto the label names used for training.
# IMPORTANT: the entries are listed in the same order as the training columns
# (Reliability, Tangibility, Empathy, Responsiveness, Assurance). The label order is later
# derived from the column order of this frame, so a different order here would compare
# each model output against the wrong hand label.
rename_columns = {
    'reliability': 'Reliability',
    'tangibility': 'Tangibility',
    'empathy': 'Empathy',
    'responsiveness': 'Responsiveness',
    'assurance': 'Assurance'
}

# Keep the text plus the label columns (dropping everything else), rename the labels,
# and make sure the label columns are boolean like in the training data.
validation_data_processed = (
    validation_data[["text", *rename_columns]]
    .rename(columns=rename_columns)
    .astype({col: bool for col in rename_columns.values()})
)

# We'll use the index as the ID column
validation_data_processed.index.name = "ID"

In [None]:
# Process the hand-validated data in the same way as the training data.
# Add it as an extra "validation" split instead of replacing `dataset`, so the train and
# test splits stay available to the cells that read them.
dataset["validation"] = Dataset.from_pandas(validation_data_processed, preserve_index=False)

# Use the label order the model was trained with rather than the column order of the
# validation file: preprocess() builds the target matrix in the order of `labels`, and
# that order must match the model's output units for the metrics to be meaningful.
labels = [model.config.id2label[idx] for idx in range(model.config.num_labels)]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# preprocess() silently fills missing label columns with zeros, so fail loudly instead
missing_labels = [label for label in labels if label not in dataset["validation"].column_names]
assert not missing_labels, f"Validation data is missing label columns: {missing_labels}"

# Compute percentile for the validation split (percentile for max tokens),
# capped at the longest sequence the tokenizer reports for this model
validation_max_tokens = int(min(compute_percentile('validation', P), tokenizer.model_max_length))

# Create encoded dataset
encoded_dataset_validation = DatasetDict({
    "validation": dataset["validation"].map(
        lambda examples: preprocess(examples, validation_max_tokens),
        batched=True,
        remove_columns=dataset["validation"].column_names,
    )
})

# Set encoded dataset to pytorch tensors
encoded_dataset_validation.set_format("torch")

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

In [None]:
# Now evaluate on the hand-labeled reviews.
# The split is passed to evaluate() directly rather than assigned to trainer.eval_dataset,
# so the trainer's default evaluation set is left untouched for the other cells.
print("Model performance on Hand-Evaluated Validation Sets")
trainer.evaluate(eval_dataset=encoded_dataset_validation["validation"])

Model performance on Hand-Evaluated Validation Sets


{'eval_loss': 0.6615386605262756,
 'eval_precision': 0.3187855787476281,
 'eval_recall': 0.25112107623318386,
 'eval_f1': 0.28093645484949836,
 'eval_roc_auc': 0.6307369102791769,
 'eval_krippendorff_alpha': 0.057398229965302305,
 'eval_runtime': 2.7833,
 'eval_samples_per_second': 181.083,
 'eval_steps_per_second': 11.497,
 'epoch': 3.0}

In [None]:
# Evaluate Synthetic Expert on train data.
# The split is passed to evaluate() directly rather than assigned to trainer.eval_dataset,
# so the trainer's default evaluation set is left untouched for the other cells.
print("Model performance on Train")
trainer.evaluate(eval_dataset=encoded_dataset["train"])

Model performance on Train


{'eval_loss': 0.38728752732276917,
 'eval_precision': 0.7549599721545422,
 'eval_recall': 0.5882831570382425,
 'eval_f1': 0.6612804878048781,
 'eval_roc_auc': 0.8792202195234897,
 'eval_krippendorff_alpha': 0.5489533573577646,
 'eval_runtime': 24.9548,
 'eval_samples_per_second': 211.101,
 'eval_steps_per_second': 13.224,
 'epoch': 3.0}

# 7. Save Synthetic Expert

In [None]:
# Save fine-tuned model (only run if necessary)
# NOTE: the directory name contains the misspelling "serivce"; the cell that loads the model
# builds the same path, so the spelling must be changed in both places or in neither.
trainer.save_model(f"{Training_Path}/synthetic_expert_serivce_dimensions")
print("Your Synthetic Expert was saved! If you use this notebook's code, please give credit to the author by citing the paper:\n\nDaniel M. Ringel, Creating Synthetic Experts with Generative Artificial Intelligence (July 15, 2023). Available at SSRN: https://papers.ssrn.com/abstract_id=4542949")

Your Synthetic Expert was saved! If you use this notebook's code, please give credit to the author by citing the paper:

Daniel M. Ringel, Creating Synthetic Experts with Generative Artificial Intelligence (July 15, 2023). Available at SSRN: https://papers.ssrn.com/abstract_id=4542949


# 8. Load and Use Synthetic Expert to label a review

In [None]:
# Helper functions
def clean_and_parse_review(review):
  """Clean a raw review string before it is passed to the classifier.

  Parsed similarly to Dr. Ringel's Tweet classifier: URLs are replaced by the token
  " URL ", HTML markup is stripped, literal and escaped newlines become spaces, leading
  '.'/':' characters are removed and runs of spaces are collapsed.

  :param review: Raw review text.
  :return: The cleaned text, or None when the input is not a string, when nothing is
           left after stripping markup, or when the parsed markup contains the word
           "filename".
  """
  if not isinstance(review, str):
    # None / NaN from a DataFrame: treat like any other unusable review
    return None

  review = re.sub(r"https?://\S+|www\.\S+", " URL ", review)

  # Parse once. bs4 warns when plain text merely resembles a locator; the module-level
  # warnings filter does not reliably silence it, so it is suppressed here.
  with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    soup = BeautifulSoup(review, "html.parser")

  # NOTE(review): this drops every review that mentions "filename"; inherited from the
  # original Tweet-cleaning code — confirm it is still wanted for restaurant reviews.
  if "filename" in str(soup):
    return None

  parsed = soup.get_text()
  if not parsed:
    return None

  single_line = re.sub(r"\\n+|\n+", " ", parsed)
  trimmed = re.sub(r'^[.:]+', '', single_line).strip()
  return re.sub(r" +", " ", trimmed)

def predict_review(review, model, tokenizer, device, threshold=0.5, max_length=128):
  """Predict the probability of each service-quality label for one review.

  :param review: Cleaned review text.
  :param model: Fine-tuned sequence-classification model; its ``config.id2label`` supplies the label names.
  :param tokenizer: Tokenizer matching the model.
  :param device: Device the inputs are moved to (should be the device the model lives on).
  :param threshold: Minimum probability for a label to be assigned to the review.
  :param max_length: Number of tokens after which the review is truncated.
  :return: Tuple ``(probs, labels)``: the per-label probabilities as a NumPy array and the
           list of label names whose probability is at least ``threshold``.
  """
  # Take the label names from the model itself rather than from a notebook-level variable,
  # so the names always line up with the model's output units.
  id2label = model.config.id2label
  known_labels = {"Reliability", "Tangibility", "Empathy", "Responsiveness", "Assurance"}

  inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
  with torch.no_grad():  # inference only: no gradients needed
    logits = model(**inputs).logits
  probs = torch.sigmoid(logits).cpu().numpy()[0]

  # Apply the caller's threshold (previously a hard-coded 0.45 was used and `threshold` ignored)
  assigned = [id2label[i] for i, p in enumerate(probs) if id2label[i] in known_labels and p >= threshold]
  return probs, assigned

In [None]:
# Define a review (example text to classify; replace the string to label a different review)
review = "The atmosphere makes up for the pricing if you have plenty of time to spend or go in with friends on a game night/ weekend night."

In [None]:
# Load the fine-tuned model
model_path = f"{Training_Path}/synthetic_expert_serivce_dimensions"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()  # inference mode (disables dropout)

# Load the tokenizer that was saved alongside the model (the Trainer was given the tokenizer,
# so save_model stored it in the same directory); this section then no longer depends on a
# tokenizer left over from the training cells.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Label names in the order of the model's output units
id2label = model.config.id2label

In [None]:
# Clean, predict, and print output
cleaned_review = clean_and_parse_review(review)
if cleaned_review is None:
    # clean_and_parse_review returns None when nothing usable is left of the text
    raise ValueError("The review is empty after cleaning and cannot be classified.")

# Store the result under its own name: `labels` is the list of label names that
# preprocess() reads, and overwriting it would break re-running the earlier cells.
probs, predicted_labels = predict_review(cleaned_review, model, tokenizer, device)

# Print labels and probabilities
print(predicted_labels, probs)

['Tangibility'] [0.16189757 0.8847536  0.1832639  0.10419562 0.11512625]


  parsed = BeautifulSoup(review, "html.parser").get_text() if "filename" not in str(BeautifulSoup(review, "html.parser")) else None
