#### Randomly Select 500 Samples for Human Labeling

In [None]:
import pandas as pd
import numpy as np
import re
import os

# ======================
# 1) Load the classified tweets
# ======================
# Paths are anchored on the current working directory (the notebook's
# folder when run in Jupyter); the data folder is one level up, in "data".
base_dir = os.getcwd()
data_path = os.path.abspath(os.path.join(base_dir, '..', 'data'))
classified_path = os.path.join(data_path, "full_covid_abuse.csv")

# ======================
# 2-3) Parse timestamps, drop unparseable rows, order by time
# ======================
# errors="coerce" turns values that cannot be parsed into NaT, which the
# dropna step then removes; the index is rebuilt so it runs 0..n-1 in
# chronological order.
df = pd.read_csv(classified_path)
df = (
    df.assign(created_at=pd.to_datetime(df["created_at"], errors="coerce"))
    .dropna(subset=["created_at"])
    .sort_values("created_at")
    .reset_index(drop=True)
)

# ======================
# 4) Clean text for filtering
# ======================
# Compiled once at definition time; clean_text is applied to every row.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Strip tweet-specific noise so that only ordinary words remain.

    Anything starting with ``http`` (up to the next whitespace), every
    ``@mention`` and every ``#hashtag`` is replaced by a space; runs of
    whitespace are then collapsed to a single space and the ends trimmed.

    Args:
        text: Raw tweet text. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN) are
            treated as empty text.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Without this guard str() would turn a missing value into the literal
    # word "None"/"nan", which would then look like real tweet content.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    for noise in (_URL_RE, _MENTION_RE, _HASHTAG_RE):
        text = noise.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Cleaned copy of each tweet, used only to decide eligibility below.
df["clean_text"] = df["text"].map(clean_text)

# ======================
# 5) Keep tweets with more than 2 words after cleaning
# ======================
# Words are whitespace-separated tokens of the cleaned text.
df["word_count"] = df["clean_text"].map(lambda cleaned: len(cleaned.split()))
filtered = df.loc[df["word_count"].gt(2)].copy()

print("Available for sampling:", len(filtered))

# ======================
# 6) Split the eligible tweets by classifier label
# ======================
# is_abusive == 1 goes to the abusive pool, == 0 to the non-abusive pool;
# rows with any other value end up in neither.
abusive = filtered.loc[filtered["is_abusive"].eq(1)]
non_abusive = filtered.loc[filtered["is_abusive"].eq(0)]

print("Abusive available:", len(abusive))
print("Non-abusive available:", len(non_abusive))

# ======================
# 7) Balanced draw: up to 250 tweets from each pool
# ======================
# NumPy's global generator is seeded as well; the pandas calls below are
# additionally pinned through their own random_state.
np.random.seed(42)

num_each = 250

# Draw without replacement from each pool. The request is capped at the
# pool size, so a pool smaller than num_each contributes all of its rows.
sample_abusive, sample_non_abusive = (
    pool.sample(n=min(num_each, len(pool)), replace=False, random_state=42)
    for pool in (abusive, non_abusive)
)

# Stack both draws, then shuffle the rows so the two classes are
# interleaved rather than appearing as two contiguous groups.
sampled = (
    pd.concat([sample_abusive, sample_non_abusive], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Final sample size:", len(sampled))

# ======================
# 8) Write the sheet handed to human annotators
# ======================
# Only the tweet id and the original text are exported; the classifier's
# is_abusive column is deliberately left out of the file.
keep_cols = ['tweet_id', 'original_text']
out_path_without_lable = os.path.join(data_path, "labled_by_human.csv")

# NOTE(review): this overwrites any existing file at that path. If
# annotators add their labels to this same file, re-running the cell
# would discard them - confirm the labeling workflow.
sampled.to_csv(out_path_without_lable, columns=keep_cols, index=False)

print("Saved:", out_path_without_lable)
