 # Step 3: Build Balanced Training Dataset



 Combine synthetic and real data into a balanced dataset:



 1. Positive (Sarcastic):

    - Synthetic sarcastic tweets

    - Real true positives (TP)

    - Real false negatives (FN): tweets the model called literal but the LLM judged sarcastic



 2. Negative (Literal):

    - Synthetic literal tweets

    - Real true negatives (TN)

    - Real false positives (FP): tweets the model called sarcastic but the LLM judged literal



 The notebook warns when the two classes differ in size by more than 20% of the larger class, then shuffles the combined data and saves it to 'balanced_training_dataset.csv'.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np


In [3]:
# Read the synthetic tweets from the CSV file in the working directory.
synthetic = pd.read_csv(filepath_or_buffer='synthetic_tweets.csv')


In [5]:
# Read the real tweets whose labels were refined by the LLM; the
# 'llm_agreement' column of this file carries the TP / TN / FP / FN tag.
llm_refined = pd.read_csv(filepath_or_buffer='llm_refined_labels.csv')


In [6]:
# Split the synthetic tweets by their 'label' value:
# 'sarcastic' rows form the positive class, 'literal' rows the negative class.
synthetic_pos = synthetic.loc[synthetic['label'].eq('sarcastic')]
synthetic_neg = synthetic.loc[synthetic['label'].eq('literal')]


In [7]:
# Partition the real tweets by their LLM-validation tag ('llm_agreement'):
#   TP - true sarcastic
#   TN - true literal
#   FN - missed sarcastic (model said literal, LLM said sarcastic)
#   FP - missed literal (model said sarcastic, LLM said literal)
real_tp, real_tn, real_fn, real_fp = (
    llm_refined.loc[llm_refined['llm_agreement'].eq(tag)]
    for tag in ('TP', 'TN', 'FN', 'FP')
)


In [9]:
# Build the balanced training dataset: tag each group with its provenance and
# final label, stack the groups into a positive and a negative class, report
# the class sizes, warn on imbalance, then shuffle and write the result to CSV.

# Relative size difference (as a fraction of the larger class) above which a
# class-imbalance warning is printed.
IMBALANCE_THRESHOLD = 0.2

# Tag every group with where it came from ('source') and its final training
# label. DataFrame.assign returns a new frame, so the frames these names were
# bound to before this cell are not modified in place.
synthetic_pos = synthetic_pos.assign(source='synthetic-sarcastic', label='sarcastic')
real_tp = real_tp.assign(source='real-sarcastic-TP', label='sarcastic')
real_fn = real_fn.assign(source='real-sarcastic-FN', label='sarcastic')
synthetic_neg = synthetic_neg.assign(source='synthetic-literal', label='literal')
real_tn = real_tn.assign(source='real-literal-TN', label='literal')
real_fp = real_fp.assign(source='real-literal-FP', label='literal')

# Columns kept in the final dataset (every group must provide all of them).
cols = ['text', 'cleaned_text', 'source', 'label']

positive_samples = pd.concat([synthetic_pos[cols], real_tp[cols], real_fn[cols]], ignore_index=True)
negative_samples = pd.concat([synthetic_neg[cols], real_tn[cols], real_fp[cols]], ignore_index=True)

# Report class sizes and composition
n_pos = len(positive_samples)
n_neg = len(negative_samples)
print("\nClass composition summary:\n")
print(f"Positive class (sarcastic):  {n_pos}")
print(f"  Synthetic sarcastic: {len(synthetic_pos)}")
print(f"  Real TP (true sarcastic): {len(real_tp)}")
print(f"  Real FN (missed sarcastic): {len(real_fn)}\n")

print(f"Negative class (literal): {n_neg}")
print(f"  Synthetic literal: {len(synthetic_neg)}")
print(f"  Real TN (true literal): {len(real_tn)}")
print(f"  Real FP (missed literal): {len(real_fp)}")

# Guard the division below: with no samples at all the ratio is undefined and
# there is nothing worth saving, so fail with an explicit message instead of a
# bare ZeroDivisionError.
larger_class = max(n_pos, n_neg)
if larger_class == 0:
    raise ValueError(
        "Both the positive and the negative class are empty; "
        "nothing to balance or save."
    )

# Imbalance is the size gap expressed as a fraction of the larger class.
imbalance = abs(n_pos - n_neg) / larger_class
if imbalance > IMBALANCE_THRESHOLD:
    print(f"WARNING: There is a class imbalance of {imbalance*100:.1f}%.")
    if n_pos < n_neg:
        print(f"Suggest sampling {n_neg - n_pos} more positive examples from real data not in high-confidence set.")
    else:
        print(f"Suggest sampling {n_pos - n_neg} more negative examples from real data not in high-confidence set.")
else:
    print("Classes are reasonably balanced.")

# Combine and shuffle; the fixed random_state makes the row order reproducible.
balanced_df = pd.concat([positive_samples, negative_samples], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the dataset without the index column
output_path = 'balanced_training_dataset.csv'
balanced_df.to_csv(output_path, index=False)
print(f'Clean, formatted balanced training dataset saved to {output_path}')


Class composition summary:

Positive class (sarcastic):  7271
  Synthetic sarcastic: 2754
  Real TP (true sarcastic): 3818
  Real FN (missed sarcastic): 699

Negative class (literal): 7732
  Synthetic literal: 2249
  Real TN (true literal): 4301
  Real FP (missed literal): 1182
Classes are reasonably balanced.
Clean, formatted balanced training dataset saved to balanced_training_dataset.csv
