 # Step 2A2: Train Naive Classifier



 Train a neural network classifier on synthetic tweets using a two-phase approach:



 1. Data: Load synthetic dataset, generate embeddings (all-MiniLM-L6-v2), split 80/20

 2. Training:

    - Phase 1: Find optimal epochs with early stopping (10% validation)

    - Phase 2: Full training with best epochs

    - Architecture: Dense(64) → Dropout(0.5) → Dense(32) → Dropout(0.5) → Dense(2)

 3. Evaluation: Classification metrics, confusion matrix, example predictions

 4. Save: Model (`tweet_classifier.h5`) and label encoder (`label_encoder.joblib`) to the current working directory



 The classifier will help identify label inconsistencies in the original dataset.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import keras
from keras import layers, models, optimizers, callbacks
from sentence_transformers import SentenceTransformer


In [None]:
# Read the synthetic tweet dataset from disk into a DataFrame
print("Loading data...")
dataset_path = 'synthetic_tweets.csv'
data = pd.read_csv(dataset_path)


In [None]:
# Instantiate the pretrained sentence-embedding model used to vectorise the text
print("Loading sentence transformer model...")
embedding_model_name = 'all-MiniLM-L6-v2'
ST_model = SentenceTransformer(embedding_model_name)


In [None]:
# Generate embeddings for the text
print("Generating text embeddings...")
batch_size = 32

# encode() already splits its input into batches and can display its own
# progress bar, so a hand-written slicing loop is unnecessary. With
# convert_to_numpy=True it returns a single NumPy array, one row per text.
embeddings = ST_model.encode(
    data['text'].tolist(),
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Feature matrix consumed by the classifier
X = np.asarray(embeddings)

# Convert labels to numeric values; `le` is kept so predictions can be mapped
# back to the original label strings later on
le = LabelEncoder()
y = le.fit_transform(data['label'])

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state makes the partition reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
def build_classifier(input_dim, num_classes):
    """Build and compile the dense classifier shared by both training phases.

    Architecture: Dense(64) -> Dropout(0.5) -> Dense(32) -> Dropout(0.5)
    -> Dense(num_classes, softmax).

    Using one builder guarantees that the epoch count tuned in phase 1 is
    applied to an identically shaped network in phase 2.

    Args:
        input_dim: Number of features in each input vector.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled, untrained Sequential model.
    """
    classifier = models.Sequential([
        # Explicit Input layer instead of passing input_shape= to Dense
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ])
    classifier.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        # Labels are integer-encoded by the LabelEncoder, hence the sparse loss
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return classifier


# Output width follows the fitted label encoder rather than a hard-coded 2
num_classes = len(le.classes_)

# 1. Train with validation and early stopping to find best number of epochs
print("Finding best number of epochs with early stopping...")
model_val = build_classifier(X.shape[1], num_classes)
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_val = model_val.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,  # hold out 10% of the training split for validation
    callbacks=[early_stop],
    verbose=1
)
# Find the best epoch: argmin is 0-based, epoch counts are 1-based
best_epoch = int(np.argmin(history_val.history['val_loss'])) + 1
print(f"Best number of epochs found: {best_epoch}")

# Plot training and validation history after initial model
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history_val.history['accuracy'], label='Training Accuracy')
plt.plot(history_val.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy (Validation Stage)')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_val.history['loss'], label='Training Loss')
plt.plot(history_val.history['val_loss'], label='Validation Loss')
plt.title('Model Loss (Validation Stage)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()

# 2. Retrain on the full training set for the best number of epochs, no validation.
# A fresh model with the SAME architecture as phase 1 is used so that the
# tuned epoch count is meaningful for the network that gets evaluated and saved.
print(f"Retraining on full training set for {best_epoch} epochs...")
model = build_classifier(X.shape[1], num_classes)
history = model.fit(
    X_train, y_train,
    epochs=best_epoch,
    batch_size=32,
    verbose=1
)

# Plot only training metrics after retraining on full set
# (no validation curves exist because no validation split was used here)
plt.figure(figsize=(8, 4))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['loss'], label='Training Loss')
plt.title('Final Model Training Metrics')
plt.xlabel('Epoch')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Print a summary of the final (retrained) model's layers
model.summary()

In [None]:
# Score the held-out test split with the retrained classifier
print("Evaluating model...")
y_pred_prob = model.predict(X_test)
# The predicted class is the column with the highest probability in each row
y_pred = y_pred_prob.argmax(axis=1)

# Per-class metrics, labelled with the original (string) class names
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)


In [None]:
# Visualise test-set predictions as an annotated confusion-matrix heatmap
cm = confusion_matrix(y_test, y_pred)
class_names = le.classes_
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
# Trailing semicolon keeps the notebook from echoing the returned Text object
plt.ylabel('True Label');


In [None]:
# Persist the trained classifier and the fitted label encoder to disk
import joblib

print("Saving model and label encoder...")
model_path = 'tweet_classifier.h5'
encoder_path = 'label_encoder.joblib'
model.save(model_path)
joblib.dump(le, encoder_path)
print("Model and encoder saved successfully!")


In [None]:
# Display some example predictions on rows the model was NOT trained on
print("\nExample Predictions:")

# Recover the positions of the held-out rows. train_test_split chooses its
# indices from the sample count, test_size and random_state, so repeating the
# earlier call on a plain index array reproduces the same test partition.
# NOTE: these arguments must stay in sync with the split that produced X_test.
_, held_out_idx = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)
held_out = data.iloc[held_out_idx]

# Sample at most 5 rows so a very small held-out set does not raise
test_examples = held_out.sample(n=min(5, len(held_out)), random_state=42)

# Run the full pipeline on the raw text: embed -> classify -> decode label
test_embeddings = ST_model.encode(test_examples['text'].tolist())
test_predictions_prob = model.predict(test_embeddings)
test_predictions = np.argmax(test_predictions_prob, axis=1)
test_predictions = le.inverse_transform(test_predictions)

for text, true_label, pred_label, pred_prob in zip(
    test_examples['text'],
    test_examples['label'],
    test_predictions,
    test_predictions_prob
):
    print(f"\nText: {text}")
    print(f"True label: {true_label}")
    print(f"Predicted label: {pred_label}")
    # Probability assigned to the predicted (highest-scoring) class
    print(f"Prediction probability: {pred_prob.max():.3f}")
    print("-" * 80)