In [None]:
import pandas as pd
import numpy as np

In [None]:
# Configuration for this run.
# TODO Define a bias value X
X = 0.5  # TODO: Change this value to the desired bias value
L = "ol"  # TODO: Change this value to 'hs' or 'ol' to select the desired label

# Map the short label code to the column holding that label in the CSVs.
LABEL_COLUMNS = {"hs": "hate.speech", "ol": "offensive.language"}
if L not in LABEL_COLUMNS:
    # Fail here with a clear message instead of a NameError on LL further down.
    raise ValueError(f"L must be one of {sorted(LABEL_COLUMNS)}, got {L!r}")
LL = LABEL_COLUMNS[L]

# read the csvs from emnlp paper
# NOTE(review): the original bound `train` to full_test_s.csv and `test` to
# full_train_s.csv. Only their concatenation is used below, so the names are
# aligned with the files here — confirm no other cell relied on the swap.
train = pd.read_csv('data/full_train_s.csv')
test = pd.read_csv('data/full_test_s.csv')

# Combine the training and testing datasets into one DataFrame.
# ignore_index=True gives every row a unique index label; without it each file
# contributes its own 0..n-1 labels, and later label-based operations
# (.drop(index), .loc[index]) would hit several rows per label.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# Sort the combined dataset by 'tweet.id' and 'version' in ascending order
all_data = all_data.sort_values(by=['tweet.id', 'version'], ascending=[True, True])

# Average of the selected label column (p_i) per 'tweet.id', taken over every
# row of that tweet (all annotators and versions; 15 versions per the original note)
all_data['p_i'] = all_data.groupby('tweet.id')[LL].transform('mean')

# Biased probabilities p^A_i and p^B_i: shift p_i down / up by X, then clip so
# the result stays inside the valid probability range [0, 1].
all_data['p^A_i'] = (all_data['p_i'] - X).clip(0, 1)  # biased towards a lower label probability
all_data['p^B_i'] = (all_data['p_i'] + X).clip(0, 1)  # biased towards a higher label probability

In [None]:
# Fixed seed for every random draw in this cell, so that "Restart & Run All"
# regenerates identical CSV files.
SEED = 42
rng = np.random.default_rng(SEED)

# train test split by tweet id: ids 1-2500 for training, ids 2501-3000 (500 ids) for test
df_train = all_data[(all_data['tweet.id'] >= 1) & (all_data['tweet.id'] <= 2500)]
df_test = all_data[(all_data['tweet.id'] >= 2501) & (all_data['tweet.id'] <= 3000)]

df_test.to_csv(f"data/test_{L}_label_{X}.csv")

## Balanced dataset (2 A, 2 B)
# The full training split is used unchanged as the balanced dataset.
df_balanced = df_train
df_balanced.to_csv(f"data/balanced_{L}_label_{X}.csv")

## Unweighted dataset (2 A, 1 B)
# Keep every A row but only half of the B rows.

# Split the training rows by annotator type ('attribute' 0 = A, 1 = B)
df_a_biased = df_train[df_train['attribute'] == 0]  # Labels from annotator type A
df_b_biased = df_train[df_train['attribute'] == 1]  # Labels from annotator type B

# Randomly discard half of the B rows.
# Rows are selected by position rather than by index label: the index of
# all_data may contain repeated labels (it comes from concatenating two CSVs),
# and a label-based .drop() would then remove every row sharing a sampled label.
half_length = len(df_b_biased) // 2
n_b_kept = len(df_b_biased) - half_length
keep_positions = np.sort(rng.choice(len(df_b_biased), size=n_b_kept, replace=False))
df_reduced = df_b_biased.iloc[keep_positions]

# Combine the full A dataset with the reduced B dataset
df_unweighted = pd.concat([df_a_biased, df_reduced], axis=0)
df_unweighted.to_csv(f"data/unweighted_{L}_label_{X}.csv")

## Weighted dataset (2 A, 1 B * 2)
# The retained B rows are duplicated so their count balances the A rows again.

# Repeat each retained B row exactly twice (positional, so repeated index
# labels cannot multiply the number of copies).
df_duplicated = df_reduced.iloc[np.arange(len(df_reduced)).repeat(2)].reset_index(drop=True)

# Combine the full A dataset with the duplicated B dataset
df_weighted = pd.concat([df_a_biased, df_duplicated], axis=0)

df_weighted.to_csv(f"data/weighted_{L}_label_{X}.csv")

## Additional Unweighted dataset (3 A, 1 B)
# Randomly sample half of the A rows ...
half_a_length = len(df_a_biased) // 2
df_a_sampled = df_a_biased.sample(n=half_a_length, random_state=SEED)

# ... and append that sample to the full A set: 2 A + 1 A = 3 A
df_a_tripled = pd.concat([df_a_biased, df_a_sampled]).reset_index(drop=True)

# Combine the enlarged A dataset with the reduced (half-size) B dataset
df_unweighted_3A_1B = pd.concat([df_a_tripled, df_reduced], axis=0)

df_unweighted_3A_1B.to_csv(f"data/unweighted_3A1B_{L}_label_{X}.csv")
