# WNC Splits

In [2]:
import os
import random
from tqdm import tqdm

# Directory the raw WNC corpus files (biased.full, biased.word.*, neutral) are read from.
source_path = "../neutralizing-bias/src/bias_data/WNC"
# Directory below which the generated sentence and label files are written.
target_path = "datasets/WNC"

In [None]:
def remove_duplicates(lines: list[str]) -> list[str]:
    """ Returns a new list holding the first occurrence of every string, in input order. """
    # dict keys are unique and keep insertion order, so building a dict from
    # the lines drops later repeats without disturbing the ordering.
    return list(dict.fromkeys(lines))

#### 1. WNC_biased_word

In [None]:
train_list = []
dev_list = []
test_list = []

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError when the notebook is run on a fresh checkout.
os.makedirs(os.path.join(target_path, "WNC_biased_word"), exist_ok=True)

# One row per split: (list to fill, source file, sentence file, label file).
word_splits = (
    (train_list, "biased.word.train", "WNC_biased_word/train.txt", "WNC_biased_word/train.label"),
    (dev_list, "biased.word.dev", "WNC_biased_word/dev.txt", "WNC_biased_word/dev.label"),
    (test_list, "biased.word.test", "WNC_biased_word/test.txt", "WNC_biased_word/test.label"),
)

for split_list, source_name, text_name, label_name in word_splits:
    # Add sentences to list (tab-separated column 4 of every source row)
    with open(os.path.join(source_path, source_name), "r", encoding="utf8") as in_file:
        split_list.extend(line.split("\t")[4] for line in in_file)

    # Deduplicate in place so train_list / dev_list / test_list end up
    # holding the final, order-preserved data.
    split_list[:] = remove_duplicates(split_list)

    # Write sentences to file, with one "neutral" label line per sentence
    with open(os.path.join(target_path, text_name), "w", encoding="utf8") as out_file:
        with open(os.path.join(target_path, label_name), "w", encoding="utf8") as label_file:
            for sentence in split_list:
                out_file.write("{}\n".format(sentence))
                label_file.write("{}\n".format("neutral"))

#### 2. WNC_biased_full

In [None]:
# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(42)

sentences = []
train_list = []
dev_list = []
test_list = []

# Read sentences (tab-separated column 4 of every row)
with open(os.path.join(source_path, "biased.full"), "r", encoding="utf8") as in_file:
    for line in in_file:
        sentences.append(line.split("\t")[4])

# Make sure that all sentences of biased.word.test are in the test set
with open(os.path.join(source_path, "biased.word.test"), "r", encoding="utf8") as in_file:
    for line in in_file:
        sentence = line.split("\t")[4]
        sentences.append(sentence)
        test_list.append(sentence)

sentences = remove_duplicates(sentences)
num_sentences = len(sentences)
random.shuffle(sentences)

# Sentences already reserved for the test set. A set gives O(1) membership
# tests instead of scanning the whole list for every sentence. A one-off
# snapshot is sufficient: `sentences` is deduplicated, so nothing appended to
# test_list inside the loop can be encountered a second time.
reserved_for_test = set(test_list)
# Test and dev each receive (roughly) 5% of the unique sentences.
split_size = 0.05 * num_sentences

# Add sentences to lists: fill test first, then dev, the rest goes to train
for sentence in tqdm(sentences):
    if sentence in reserved_for_test:
        continue

    if len(test_list) < split_size:
        test_list.append(sentence)
    elif len(dev_list) < split_size:
        dev_list.append(sentence)
    else:
        train_list.append(sentence)

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError when the notebook is run on a fresh checkout.
os.makedirs(os.path.join(target_path, "WNC_biased_full"), exist_ok=True)

# Write sentences to files, with one "neutral" label line per sentence
full_outputs = (
    (train_list, "WNC_biased_full/train.txt", "WNC_biased_full/train.label"),
    (dev_list, "WNC_biased_full/dev.txt", "WNC_biased_full/dev.label"),
    (test_list, "WNC_biased_full/test.txt", "WNC_biased_full/test.label"),
)

for split_list, text_name, label_name in full_outputs:
    with open(os.path.join(target_path, text_name), "w", encoding="utf8") as text_file:
        with open(os.path.join(target_path, label_name), "w", encoding="utf8") as label_file:
            for sentence in split_list:
                text_file.write("{}\n".format(sentence))
                label_file.write("{}\n".format("neutral"))

#### 3. WNC_large

In [None]:
# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(42)

sentences = []
train_list = []
dev_list = []
test_list = []

# Read sentences (tab-separated column 3 of every row in both corpora)
for source_name in ("biased.full", "neutral"):
    with open(os.path.join(source_path, source_name), "r", encoding="utf8") as in_file:
        for line in in_file:
            sentences.append(line.split("\t")[3])

# Make sure that all sentences of biased.word.test are in the test set
with open(os.path.join(source_path, "biased.word.test"), "r", encoding="utf8") as in_file:
    for line in in_file:
        sentence = line.split("\t")[3]
        sentences.append(sentence)
        test_list.append(sentence)

sentences = remove_duplicates(sentences)
num_sentences = len(sentences)
random.shuffle(sentences)

# Sentences already reserved for the test set. A set gives O(1) membership
# tests instead of scanning the whole list for every sentence. A one-off
# snapshot is sufficient: `sentences` is deduplicated, so nothing appended to
# test_list inside the loop can be encountered a second time.
reserved_for_test = set(test_list)
# Test and dev each receive (roughly) 5% of the unique sentences.
split_size = 0.05 * num_sentences

# Add sentences to lists: fill test first, then dev, the rest goes to train
for sentence in tqdm(sentences):
    if sentence in reserved_for_test:
        continue

    if len(test_list) < split_size:
        test_list.append(sentence)
    elif len(dev_list) < split_size:
        dev_list.append(sentence)
    else:
        train_list.append(sentence)

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError when the notebook is run on a fresh checkout.
os.makedirs(os.path.join(target_path, "WNC_large"), exist_ok=True)

# Write sentences to files, with one "neutral" label line per sentence.
# NOTE(review): column 3 of biased.full is named `bias` in the Targets cell,
# yet every sentence here is labelled "neutral" -- confirm that the label is
# meant as a placeholder for this dataset.
large_outputs = (
    (train_list, "WNC_large/train.txt", "WNC_large/train.label"),
    (dev_list, "WNC_large/dev.txt", "WNC_large/dev.label"),
    (test_list, "WNC_large/test.txt", "WNC_large/test.label"),
)

for split_list, text_name, label_name in large_outputs:
    with open(os.path.join(target_path, text_name), "w", encoding="utf8") as text_file:
        with open(os.path.join(target_path, label_name), "w", encoding="utf8") as label_file:
            for sentence in split_list:
                text_file.write("{}\n".format(sentence))
                label_file.write("{}\n".format("neutral"))

#### Targets

In [13]:
# Maps each neutral sentence (column 4 of biased.full) to the biased sentence
# (column 3) of the same row. If a neutral sentence occurs in several rows,
# the last row wins.
target_map = {}

with open(os.path.join(source_path, "biased.full"), "r", encoding="utf8") as in_file:
    for line in in_file:
        columns = line.split("\t")  # split once, reuse for both columns
        bias = columns[3]
        neutral = columns[4]
        target_map[neutral] = bias

# Write one "<biased>\t<neutral>" line per sentence of the WNC_biased_full test set.
with open(os.path.join(target_path, "WNC_biased_full/test.txt"), "r", encoding="utf8") as test_file:
    with open(os.path.join(target_path, "targets.txt"), "w", encoding="utf8") as targets:
        for line in test_file:
            # Remove only the newline the writer appended. A full strip() would
            # also drop whitespace that belongs to the sentence, and the result
            # would no longer match its (unstripped) key in target_map.
            neutral = line.rstrip("\n")
            # A test sentence missing from biased.full raises KeyError here.
            targets.write("\t".join([target_map[neutral], neutral]) + "\n")