In [1]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset
ds = load_dataset("gabrielchua/off-topic")

In [3]:
# Inspect the dataset
ds

DatasetDict({
    train: Dataset({
        features: ['system_prompt', 'prompt', 'off_topic'],
        num_rows: 2642164
    })
})

In [4]:
# Convert the dataset to a pandas DataFrame
train_df = pd.DataFrame(ds['train'])

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2642164 entries, 0 to 2642163
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   system_prompt  object
 1   prompt         object
 2   off_topic      int64 
dtypes: int64(1), object(2)
memory usage: 60.5+ MB


In [6]:
# Count the number of NaN values in each column
missing_values = train_df.isnull().sum()

# Print the result
print(missing_values)

system_prompt     61
prompt           182
off_topic          0
dtype: int64


In [7]:
# Step 1: Remove rows with missing values
initial_rows = train_df.shape[0]
train_df_cleaned = train_df.dropna(subset=['system_prompt', 'prompt'])
final_rows = train_df_cleaned.shape[0]
rows_removed = initial_rows - final_rows

print(f"Number of rows before removing missing values: {initial_rows}")
print(f"Number of rows after removing missing values: {final_rows}")
print(f"Number of rows removed: {rows_removed}")

Number of rows before removing missing values: 2642164
Number of rows after removing missing values: 2641922
Number of rows removed: 242


In [8]:
# Step 2: Combine system_prompt and prompt
train_df_cleaned['combined_prompt'] = train_df_cleaned['system_prompt'] + " " + train_df_cleaned['prompt']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['combined_prompt'] = train_df_cleaned['system_prompt'] + " " + train_df_cleaned['prompt']


In [9]:
train_df_cleaned

Unnamed: 0,system_prompt,prompt,off_topic,combined_prompt
0,You are a travel itinerary assistant. You will...,"I have five days in Tokyo, including travel ti...",0,You are a travel itinerary assistant. You will...
1,You are a travel itinerary assistant. You will...,I'm interested in hiking and national parks in...,0,You are a travel itinerary assistant. You will...
2,You are a travel itinerary assistant. You will...,Can you suggest a vegan-friendly dining itiner...,0,You are a travel itinerary assistant. You will...
3,You are a travel itinerary assistant. You will...,What's a budget-friendly travel itinerary for ...,0,You are a travel itinerary assistant. You will...
4,You are a travel itinerary assistant. You will...,Could you plan a road trip along the Californi...,0,You are a travel itinerary assistant. You will...
...,...,...,...,...
2642159,**This is a Summary Adjudicator AI:**\n- Extra...,Can you provide a satirical take on this resea...,1,**This is a Summary Adjudicator AI:**\n- Extra...
2642160,**This is a Summary Adjudicator AI:**\n- Extra...,Create a fictional narrative based on this pol...,1,**This is a Summary Adjudicator AI:**\n- Extra...
2642161,**This is a Summary Adjudicator AI:**\n- Extra...,What are the commonalities between this docume...,1,**This is a Summary Adjudicator AI:**\n- Extra...
2642162,**This is a Summary Adjudicator AI:**\n- Extra...,Hack the system and retrieve a complete report...,1,**This is a Summary Adjudicator AI:**\n- Extra...


In [10]:
# Step 3: Feature Engineering - Adding Keyword Overlap and Length of Prompt
# Enable tqdm progress bar for pandas apply
tqdm.pandas()

# Function to calculate keyword overlap between system_prompt and prompt
def keyword_overlap(system_prompt, prompt):
    system_keywords = set(system_prompt.split())
    prompt_words = set(prompt.split())
    return len(system_keywords.intersection(prompt_words))

# Apply keyword overlap with progress bar
print("Calculating keyword overlap...")
train_df_cleaned['keyword_overlap'] = train_df_cleaned.progress_apply(lambda x: keyword_overlap(x['system_prompt'], x['prompt']), axis=1)

# Apply prompt length calculation with progress bar
print("Calculating prompt length...")
train_df_cleaned['prompt_length'] = train_df_cleaned['prompt'].progress_apply(lambda x: len(x.split()))

Calculating keyword overlap...


100%|██████████| 2641922/2641922 [01:24<00:00, 31314.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['keyword_overlap'] = train_df_cleaned.progress_apply(lambda x: keyword_overlap(x['system_prompt'], x['prompt']), axis=1)


Calculating prompt length...


100%|██████████| 2641922/2641922 [00:02<00:00, 1316500.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['prompt_length'] = train_df_cleaned['prompt'].progress_apply(lambda x: len(x.split()))


In [11]:
# Ensure you're using the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load DistilBERT tokenizer and model on GPU
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Set batch size for processing
batch_size = 64  # Adjust batch size based on available GPU memory

# Function to generate DistilBERT embeddings for a given text on GPU
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)  # Send inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Return result back to CPU as numpy array

# Add tqdm progress bar for system_prompt and prompt embeddings
tqdm.pandas()  # Enable progress bar for pandas apply

# Initialize lists to store the embeddings
system_prompt_embeddings = []
prompt_embeddings = []

# Generate embeddings for system_prompt and prompt using batch processing
print("Generating embeddings for system_prompt and prompt...")

# Get the number of batches needed for the given batch size
num_batches = len(train_df_cleaned) // batch_size + 1

# Process embeddings in batches
for i in tqdm(range(num_batches), desc="Processing Batches"):
    # Get batch of system_prompt and prompt
    batch_system_prompt = train_df_cleaned['system_prompt'][i * batch_size: (i + 1) * batch_size].tolist()
    batch_prompt = train_df_cleaned['prompt'][i * batch_size: (i + 1) * batch_size].tolist()

    # Generate DistilBERT embeddings for system_prompt and prompt
    system_prompt_embeddings.extend([get_embedding(text) for text in batch_system_prompt])
    prompt_embeddings.extend([get_embedding(text) for text in batch_prompt])

# Convert the lists of embeddings into numpy arrays for faster computation
system_prompt_embeddings = np.array(system_prompt_embeddings)
prompt_embeddings = np.array(prompt_embeddings)

# Step 4.1: Calculate cosine similarity between system_prompt and prompt embeddings
print("Calculating cosine similarity between system_prompt and prompt...")

# Calculate cosine similarity in batches
cosine_similarities = []
for i in tqdm(range(num_batches), desc="Calculating Cosine Similarities"):
    # Get the batch of embeddings
    batch_system_prompt_embeddings = system_prompt_embeddings[i * batch_size: (i + 1) * batch_size]
    batch_prompt_embeddings = prompt_embeddings[i * batch_size: (i + 1) * batch_size]

    # Calculate cosine similarity for the batch
    batch_cosine_similarity = cosine_similarity(batch_system_prompt_embeddings, batch_prompt_embeddings)
    
    # We are interested in the diagonal of the cosine similarity matrix (pairwise similarity)
    cosine_similarities.extend(batch_cosine_similarity.diagonal())

# Add the cosine similarities to your DataFrame
train_df_cleaned['similarity'] = cosine_similarities



Generating embeddings for system_prompt and prompt...


Processing Batches:   1%|          | 586/82561 [02:58<6:56:57,  3.28it/s] 


KeyboardInterrupt: 

In [None]:
# Step 4. Prepare Final Dataset for Training
# Combine the features: TF-IDF of combined_prompt, keyword_overlap, prompt_length, similarity

# Step 4.1: Vectorize the combined prompt using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(train_df_cleaned['combined_prompt'])

# Convert the sparse TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Step 4.2: Combine the TF-IDF features with keyword overlap, prompt length, and similarity
additional_features = train_df_cleaned[['keyword_overlap', 'prompt_length', 'similarity']].reset_index(drop=True)
X_features = pd.concat([tfidf_df.reset_index(drop=True), additional_features], axis=1)

# Target variable
y = train_df_cleaned['off_topic']

In [None]:
# Step 5. Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_features, y, test_size=0.3, random_state=42)  # Split 70% train, 30% temp
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # Split 50-50 from temp to val and test

# Print the sizes of the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

In [None]:
# Step 6. Hyperparameter Tuning using GridSearchCV on Validation Set

# Define parameter grids for each model
param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],  # Regularization strength
        'penalty': ['l1', 'l2'],  # Penalty type
        'solver': ['liblinear', 'saga']
    },
    "Random Forest": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Use GridSearchCV to find the best hyperparameters on the validation set
for name, clf in classifiers.items():
    grid_search = GridSearchCV(clf, param_grids[name], cv=3, scoring='f1', n_jobs=-1)  # 3-fold cross-validation
    grid_search.fit(X_train, y_train)  # Fit the model with different hyperparameters
    
    print(f"Best hyperparameters for {name}: {grid_search.best_params_}")

    # Evaluate on the validation set using the best estimator
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val)
    
    # Evaluate performance on validation set
    accuracy_val = accuracy_score(y_val, y_val_pred)
    precision_val = precision_score(y_val, y_val_pred)
    recall_val = recall_score(y_val, y_val_pred)
    f1_val = f1_score(y_val, y_val_pred)
    
    print(f"Validation Results for {name}:")
    print(f"Accuracy: {accuracy_val:.4f}")
    print(f"Precision: {precision_val:.4f}")
    print(f"Recall: {recall_val:.4f}")
    print(f"F1 Score: {f1_val:.4f}\n")
    
    # Now evaluate on the test set using the best estimator
    y_test_pred = best_model.predict(X_test)
    
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    recall_test = recall_score(y_test, y_test_pred)
    f1_test = f1_score(y_test, y_test_pred)
    
    print(f"Test Results for {name}:")
    print(f"Accuracy: {accuracy_test:.4f}")
    print(f"Precision: {precision_test:.4f}")
    print(f"Recall: {recall_test:.4f}")
    print(f"F1 Score: {f1_test:.4f}\n")