 # Step 2B: Apply Naive Classifier to Real Data



 Apply the trained classifier to real tweets to identify high-confidence examples:



 1. Process: Load real tweets, generate embeddings (all-MiniLM-L6-v2), predict classes

 2. Extract: Select top 5000 high-confidence examples per class

 3. Save:

    - High-confidence examples to 'high_confidence_real_examples.csv' (written to the current working directory)

    - All examples with embeddings to 'data/all_real_examples_with_embeddings.csv' (note: the cells below do not currently write this file)

 4. Visualize: UMAP projection of high-confidence embeddings



 This step helps identify reliable examples for training our final model.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from keras import models
import joblib
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import umap
import matplotlib.pyplot as plt


In [None]:
# Read the preprocessed real-tweet dataset from the working directory
real_data = pd.read_csv(filepath_or_buffer='processed_sentiment140.csv')


In [None]:
# Restore the fitted label encoder and the trained classifier from disk.
# The two loads are independent of each other.
le = joblib.load('label_encoder.joblib')
model = models.load_model('tweet_classifier.h5')


In [None]:
# Instantiate the all-MiniLM-L6-v2 sentence encoder used to embed the tweets
ST_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


In [None]:
# Generate sentence embeddings for the cleaned tweet text, one batch at a time
print('Generating embeddings for real data...')
batch_size = 32
# pd.read_csv parses empty fields as NaN (a float), which is not valid text
# input for the encoder. Replace missing values with '' and coerce every
# entry to str once, up front, so each batch is a plain list of strings.
texts = real_data['cleaned_text'].fillna('').astype(str).tolist()
embeddings = []
for i in tqdm(range(0, len(texts), batch_size)):
    # Collect this batch's vectors; later cells rely on one vector per row,
    # in the same order as real_data.
    embeddings.extend(ST_model.encode(texts[i:i + batch_size]))
# Stack the per-text vectors into a single array for model.predict
embeddings = np.array(embeddings)


In [None]:
# Score every embedding with the classifier, take the highest-probability
# class index per row, and map those indices back to their label names
print('Predicting class probabilities...')
preds_prob = model.predict(embeddings)
preds = preds_prob.argmax(axis=1)
pred_labels = le.inverse_transform(preds)


In [None]:
# Attach the decoded label and the per-class probabilities to each row.
# Column order is preserved: pred_label, prob_0, prob_1.
real_data['pred_label'] = pred_labels
for class_idx in (0, 1):
    real_data[f'prob_{class_idx}'] = preds_prob[:, class_idx]


In [None]:
# Store each row's embedding vector in an object column, one array per row,
# aligned to real_data's existing index
real_data['embedding'] = pd.Series(list(embeddings), index=real_data.index)


In [None]:
# For each predicted class, keep the (at most) 5000 rows whose probability
# for that class is highest, then stack both selections into one frame.
class_0, class_1 = le.classes_[0], le.classes_[1]
predicted_as_0 = real_data['pred_label'] == class_0
predicted_as_1 = real_data['pred_label'] == class_1
literal_top = real_data.loc[predicted_as_0].nlargest(5000, 'prob_0')
sarcastic_top = real_data.loc[predicted_as_1].nlargest(5000, 'prob_1')
high_conf_examples = pd.concat([literal_top, sarcastic_top], ignore_index=True)
# The high-confidence class is simply the predicted label of the selected row
high_conf_examples = high_conf_examples.assign(
    high_conf_class=high_conf_examples['pred_label']
)


In [None]:
# Write the selected high-confidence rows to CSV (no index column).
# NOTE(review): 'embedding' holds array objects, so to_csv stores their text
# representation; confirm downstream readers parse that format as intended.
output_high_conf = 'high_confidence_real_examples.csv'
export_columns = [
    'text',
    'cleaned_text',
    'pred_label',
    'prob_0',
    'prob_1',
    'high_conf_class',
    'embedding',
]
high_conf_examples.loc[:, export_columns].to_csv(output_high_conf, index=False)
print(f'High-confidence examples saved to {output_high_conf}')


In [None]:
# Project the high-confidence embeddings to 2-D with UMAP and plot them by class

print("Reducing high-confidence embeddings to 2D with UMAP...")
umap_model = umap.UMAP(n_components=2, random_state=42)
# Stack the per-row embedding arrays into one matrix for UMAP
embedding_matrix = np.vstack(high_conf_examples['embedding'].values)
embeddings_2d = umap_model.fit_transform(embedding_matrix)

# Keep the two UMAP coordinates alongside each example
high_conf_examples['umap_x'], high_conf_examples['umap_y'] = (
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
)

# One scatter series per class, in order of first appearance
fig, ax = plt.subplots(figsize=(10, 7))
for class_name in high_conf_examples['high_conf_class'].unique():
    in_class = high_conf_examples['high_conf_class'] == class_name
    ax.scatter(
        high_conf_examples.loc[in_class, 'umap_x'],
        high_conf_examples.loc[in_class, 'umap_y'],
        label=class_name,
        alpha=0.5,
        s=10,
    )
ax.legend()
ax.set_title('UMAP projection of high-confidence all-MiniLM-L6-v2 embeddings')
ax.set_xlabel('UMAP-1')
ax.set_ylabel('UMAP-2')
fig.tight_layout()
plt.show()
