 # Predict at Scale



 This script:

 - Loads the full processed Sentiment140 dataset

 - Randomly samples 100 tweets for prediction

 - Encodes these 100 examples in batches using the all-mpnet-base-v2 sentence transformer

 - Loads the final trained classifier model

 - Predicts probabilities for each example

 - Applies a threshold (0.5) to classify as 'sarcastic' or 'literal'

 - Shows the predicted class distribution and the first few predictions

 - (Optionally) saves predictions to CSV

 - Uses the OpenAI API (gpt-4.1-mini) to classify each tweet in the sample

 - Compares model predictions to LLM labels and reports agreement accuracy plus TP, TN, FP, FN counts

In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from keras.models import load_model
import openai
import json
import time
from google.colab import userdata # Import userdata to access secrets


In [14]:
# Read the processed Sentiment140 CSV into a DataFrame
full_df = pd.read_csv('processed_sentiment140.csv')

# Draw a reproducible (seed 42) 100-row sample and detach it from full_df,
# so columns added later do not touch the full frame
sampled_rows = full_df.sample(random_state=42, n=100)
predict_df = sampled_rows.copy()


In [3]:
# Sentence-transformer checkpoint used to embed the tweets
# (per the original note, the same one used in training)
st_checkpoint_name = 'all-mpnet-base-v2'
ST_model = SentenceTransformer(st_checkpoint_name)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# Encode the examples in batches
batch_size = 32

# Any empty cleaned_text cell is read back from the CSV as NaN (a float),
# which is not valid text input for the encoder; coerce every entry to a
# string so one blank tweet cannot abort the whole run.
texts_to_encode = predict_df['cleaned_text'].fillna('').astype(str)

all_embeddings = []
# Derive the count from the data so the message stays right if the sample size changes.
print(f'Encoding {len(texts_to_encode)} examples in batches...')
for i in tqdm(range(0, len(texts_to_encode), batch_size)):
    batch = texts_to_encode.iloc[i:i+batch_size].tolist()
    batch_embed = ST_model.encode(batch)
    all_embeddings.extend(batch_embed)

# Stack the per-example vectors into a single array for the classifier
all_embeddings = np.array(all_embeddings)


Encoding 100 examples in batches...


100%|██████████| 4/4 [00:00<00:00, 11.72it/s]


In [16]:
# Restore the trained classifier from its saved HDF5 file
classifier_path = 'final_classifier.h5'
final_model = load_model(classifier_path)




In [17]:
# Run the classifier on the embeddings to get one probability per example
probs = final_model.predict(all_embeddings)

# Flatten once and reuse for both the probability and the thresholded label
threshold = 0.5
flat_probs = probs.flatten()
predict_df['predicted_probability'] = flat_probs
predict_df['predicted_label'] = (flat_probs > threshold).astype(int)

# Human-readable names for the integer labels (1 -> sarcastic, 0 -> literal)
label_map = {1: 'sarcastic', 0: 'literal'}
predicted_ints = predict_df['predicted_label']
predict_df['predicted_label_str'] = predicted_ints.map(label_map)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step


In [18]:
# Preview the first rows of the prediction columns
preview_columns = ['text', 'predicted_probability', 'predicted_label', 'predicted_label_str']
preview = predict_df[preview_columns].head()
print(preview)

# Count how many examples fell into each predicted class
print("\nPredicted class distribution:")
class_counts = predict_df['predicted_label_str'].value_counts()
print(class_counts)


                                                     text  \
725031  Was sending my last tweet typed in moon and it...   
352959  In Tha House, Sore From Working Out...Kinda Mi...   
601371  @Queluver55 just wanted to say hi and that her...   
425948  TGIF but then its TGIS then oh man its Sunday....   
567048  in time you will get it young grass hopper@nat...   

        predicted_probability  predicted_label predicted_label_str  
725031               0.431567                0             literal  
352959               0.034415                0             literal  
601371               0.001978                0             literal  
425948               0.804007                1           sarcastic  
567048               0.707501                1           sarcastic  

Predicted class distribution:
predicted_label_str
literal      73
sarcastic    27
Name: count, dtype: int64


In [19]:
# Persist the sample together with its prediction columns (row index omitted)
predictions_path = 'predictions_100.csv'
predict_df.to_csv(predictions_path, index=False)
print('Saved predictions for 100 examples to predictions_100.csv')


Saved predictions for 100 examples to predictions_100.csv


In [20]:
# Pull the OpenAI key out of Colab's secret store and register it on the module
colab_secret_name = 'OpenAI_API_Key'
openai.api_key = userdata.get(colab_secret_name)

# Build the client object that classify_with_llm sends its requests through
client = openai.OpenAI(api_key=openai.api_key)

def classify_with_llm(text, max_retries=3, wait_time=0.5, model="gpt-4.1-mini"):
    """Ask an OpenAI chat model whether a tweet is 'literal' or 'sarcastic'.

    Uses the module-level ``client`` to send one chat-completion request per
    attempt, retrying with a doubling delay when the request raises.

    Args:
        text: Tweet text to classify; interpolated into the prompt.
        max_retries: Maximum number of API attempts before giving up.
        wait_time: Seconds to sleep after the first failed attempt; doubled
            after every further failure.
        model: Name of the chat model to query.

    Returns:
        'literal' if the lower-cased reply contains 'lit', 'sarcastic' if it
        contains 'sar', otherwise the lower-cased reply itself. Returns None
        when every attempt raised an exception.
    """
    prompt = (
        "Classify the following tweet as either 'literal' or 'sarcastic'. "
        "Respond with only one word: literal or sarcastic.\n"
        f"Tweet: {text}\nLabel:"
    )
    delay = wait_time
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that classifies tweets as literal or sarcastic. Respond with only one word: literal or sarcastic."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                # Leave room for the whole word: a 1-token cap may cut the
                # reply down to a fragment that the prefix checks below miss.
                max_tokens=5
            )
            label = response.choices[0].message.content.strip().lower()
            # Substring checks tolerate trailing punctuation or extra words.
            if 'lit' in label:
                return 'literal'
            elif 'sar' in label:
                return 'sarcastic'
            else:
                # Unrecognised reply: hand it back unchanged so the caller can inspect it.
                return label
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed: {str(e)}. Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # exponential backoff
            else:
                print(f"All {max_retries} attempts failed. Last error: {str(e)}")
                return None


In [21]:
# Label every sampled tweet with the LLM, one API call per tweet
# (this consumes tokens and takes a while)
llm_labels = [
    classify_with_llm(tweet)
    for tweet in tqdm(predict_df['text'], desc='Classifying with LLM')
]
predict_df['llm_label'] = llm_labels


Classifying with LLM: 100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


In [22]:
# Compare model predictions with LLM labels
def get_tp_tn(row):
    """Tag one row with its confusion-matrix cell, using the LLM label as reference.

    Reads row['predicted_label_str'] and row['llm_label']. 'sarcastic' is the
    positive class. Returns 'TP', 'TN', 'FP' or 'FN' when both values are one
    of 'sarcastic' / 'literal', and 'Other' for anything else.
    """
    known_labels = ('sarcastic', 'literal')

    predicted = row['predicted_label_str']
    if predicted not in known_labels:
        return 'Other'

    reference = row['llm_label']
    if reference not in known_labels:
        return 'Other'

    predicted_positive = predicted == 'sarcastic'
    if predicted == reference:
        return 'TP' if predicted_positive else 'TN'
    return 'FP' if predicted_positive else 'FN'

# Row-wise pass: tag each example as TP / TN / FP / FN / Other
agreement_tags = predict_df.apply(get_tp_tn, axis=1)
predict_df['llm_agreement'] = agreement_tags


In [23]:
# Report accuracy and confusion matrix
# The LLM labels act as the reference, so "accuracy" here is model/LLM agreement.
# A row whose LLM label is missing or unrecognised never equals the model's
# label, so it counts as a disagreement in the mean below.
accuracy = (predict_df['predicted_label_str'] == predict_df['llm_label']).mean()
tp = (predict_df['llm_agreement'] == 'TP').sum()
tn = (predict_df['llm_agreement'] == 'TN').sum()
fp = (predict_df['llm_agreement'] == 'FP').sum()
fn = (predict_df['llm_agreement'] == 'FN').sum()
# Rows that fit none of the four cells (e.g. the LLM call failed and returned None)
other = (predict_df['llm_agreement'] == 'Other').sum()
print(f'LLM agreement accuracy: {accuracy:.3f}')
print(f'TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}')
# Surface unusable rows; otherwise TP+TN+FP+FN silently falls short of the
# sample size while the accuracy above is still lowered by them.
if other:
    print(f'Rows without a usable LLM label (counted as disagreements): {other}')


LLM agreement accuracy: 0.720
TP: 17, TN: 55, FP: 10, FN: 18
