# Interface - *style-transfer-paraphrase*

#### Prepare WNC

In [None]:
# Extract biased and neutral from biased.full
# Every row of biased.full is split on tabs; field 3 (0-based) is written to the
# biased file and field 4 to the neutral file, so line i of both outputs comes
# from the same input row.
with open("../neutralizing-bias/src/bias_data/WNC/biased.full", "r", encoding="utf8") as wnc_rows, \
        open("datasets/WNC/wnc_biased.txt", "w", encoding="utf8") as biased_out, \
        open("datasets/WNC/wnc_neutral.txt", "w", encoding="utf8") as neutral_out:
    for row in wnc_rows:
        fields = row.split("\t")
        biased_out.write(fields[3] + "\n")
        neutral_out.write(fields[4] + "\n")

In [2]:
# Extract neutral from biased.full and neutral
# Field 4 (0-based, tab-separated) of every row in both WNC files is appended to
# one combined file, one sentence per output line.
with open("../neutralizing-bias/src/bias_data/WNC/biased.full", "r", encoding="utf8") as source_file_1, \
        open("../neutralizing-bias/src/bias_data/WNC/neutral", "r", encoding="utf8") as source_file_2, \
        open("datasets/WNC/wnc_neutral_large.txt", "w", encoding="utf8") as target_file:
    for source_file in (source_file_1, source_file_2):
        for line in source_file:
            line_split = line.split("\t")
            # Field 4 is presumably followed by further columns in biased.full but is
            # the last column (still carrying the row's "\n") in neutral. Stripping
            # and re-adding the terminator handles both layouts and guarantees that
            # a final row without a newline still yields a properly terminated line.
            target_file.write(line_split[4].rstrip("\n") + "\n")

In [4]:
# Randomly shuffle neutral
import random

# Fixed seed so that the shuffled file, and every split derived from it, can be
# reproduced after a kernel restart.
SHUFFLE_SEED = 42

with open("datasets/WNC/wnc_neutral_large.txt", "r", encoding="utf8") as source:
    # Normalise the terminator: if the last line of the file has no "\n" it would
    # otherwise be fused with whatever line follows it after shuffling.
    data = [line.rstrip("\n") + "\n" for line in source]

# A dedicated Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(data)

with open("datasets/WNC/wnc_neutral_large_shuffle.txt", "w", encoding="utf8") as target:
    target.writelines(data)

#### Split Train-Dev-Test

In [6]:
# 80-10-10 split for biased and neutral from biased.full
# The same index thresholds are applied to wnc_biased.txt and wnc_neutral.txt, so
# the i-th line of each file ends up in the same split (train / dev / test).
for style in ("biased", "neutral"):
    source_path = "datasets/WNC/wnc_{}.txt".format(style)
    split_dir = "datasets/WNC_{}".format(style)

    # Count the sentences instead of hard-coding the corpus size, so the 80/10/10
    # proportions hold even if the extracted file changes. `num_sentences` stays a
    # notebook-level name in case later cells read it.
    with open(source_path, "r", encoding="utf8") as source_file:
        num_sentences = sum(1 for _ in source_file)

    train_end = num_sentences * 0.8
    dev_end = num_sentences * 0.9

    with open(source_path, "r", encoding="utf8") as source_file, \
            open(split_dir + "/train.txt", "w", encoding="utf8") as train_file, \
            open(split_dir + "/dev.txt", "w", encoding="utf8") as dev_file, \
            open(split_dir + "/test.txt", "w", encoding="utf8") as test_file:
        for i, line in enumerate(source_file):
            if i < train_end:
                train_file.write(line)
            elif i < dev_end:
                dev_file.write(line)
            else:
                test_file.write(line)

In [5]:
# 90-5-5 split for neutral from biased.full and neutral
source_path = "datasets/WNC/wnc_neutral_large_shuffle.txt"

# Count the sentences instead of hard-coding the corpus size, so the 90/5/5
# proportions hold for whatever the shuffled file actually contains.
# `num_sentences` stays a notebook-level name in case later cells read it.
with open(source_path, "r", encoding="utf8") as source_file:
    num_sentences = sum(1 for _ in source_file)

train_end = num_sentences * 0.9
dev_end = num_sentences * 0.95

with open(source_path, "r", encoding="utf8") as source_file, \
        open("datasets/WNC_neutral_large/train.txt", "w", encoding="utf8") as train_file, \
        open("datasets/WNC_neutral_large/dev.txt", "w", encoding="utf8") as dev_file, \
        open("datasets/WNC_neutral_large/test.txt", "w", encoding="utf8") as test_file:
    for i, line in enumerate(source_file):
        if i < train_end:
            train_file.write(line)
        elif i < dev_end:
            dev_file.write(line)
        else:
            test_file.write(line)

#### Create Label Files

In [7]:
# Create labels for biased and neutral from biased.full
# One label line is written per sentence found in the matching split file. The
# label count is taken from the .txt file itself rather than from the notebook
# variable `num_sentences`, which the 90-5-5 split cell reassigns; relying on it
# here would produce label files that do not line up with the text files when the
# notebook is run top to bottom.
for style in ("biased", "neutral"):
    for split in ("train", "dev", "test"):
        text_path = "datasets/WNC_{}/{}.txt".format(style, split)
        label_path = "datasets/WNC_{}/{}.label".format(style, split)
        with open(text_path, "r", encoding="utf8") as text_file, \
                open(label_path, "w", encoding="utf8") as label_file:
            for _sentence in text_file:
                # The label text is the style name itself: "biased" or "neutral".
                label_file.write(style + "\n")

In [6]:
# Create labels for neutral from biased.full and neutral
# One "neutral" label is written per sentence found in the matching split file,
# so each .label file always has exactly as many lines as its .txt file and the
# cell no longer depends on the value `num_sentences` happens to hold in the kernel.
for split in ("train", "dev", "test"):
    text_path = "datasets/WNC_neutral_large/{}.txt".format(split)
    label_path = "datasets/WNC_neutral_large/{}.label".format(split)
    with open(text_path, "r", encoding="utf8") as text_file, \
            open(label_path, "w", encoding="utf8") as label_file:
        for _sentence in text_file:
            label_file.write("neutral\n")

#### Explore train data

In [None]:
import pickle

# Load the object stored in datasets/paranmt_filtered/train.pickle for inspection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only point this at a file from a trusted source.
with open("datasets/paranmt_filtered/train.pickle", "rb") as pickle_handle:
    parse_data = pickle.load(pickle_handle)

In [None]:
# Show the first element of the loaded pickle to get a feel for the record layout
# (the layout itself is not visible from this notebook).
print(parse_data[0])

Lines to change for CPU/GPU use:
- inference_utils.py: 53, 95, 96, 97
- utils.py: 15, 16, 52

In [1]:
from style_paraphrase.inference_utils import GPT2Generator

# Paraphrasing demo: for each input sentence print one paraphrase generated with
# top_p = 0.0 (reported below as the "greedy sample") and three paraphrases
# sampled with top_p = top_p_value.
model_dir = "models/paraphraser_gpt2_large"
top_p_value = 0.6

paraphraser = GPT2Generator(model_dir, upper_length="same_5")

inputs = ["This is an example sentence."]

# The loop variable is deliberately not named `input`: that would shadow the
# builtin and leave it overwritten in the kernel namespace after the cell ran.
for sentence in inputs:
    paraphraser.modify_p(top_p=0.0)
    greedy_decoding = paraphraser.generate(sentence)
    print("\ngreedy sample:\n{}\n".format(greedy_decoding))

    paraphraser.modify_p(top_p=top_p_value)
    # generate_batch returns a pair; only the first element (the generated
    # sentences) is used here. The same sentence is submitted three times to
    # obtain three independent samples.
    top_p_samples, _ = paraphraser.generate_batch([sentence] * 3)
    print("top_p = {:.2f} samples:\n{}\n".format(top_p_value, "\n".join(top_p_samples)))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at paraphraser_gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.weight', 'transformer.extra_embedding_project.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  gpt2_sentences=torch.tensor([inst.sentence for inst in instances]),#.to(args.device),



greedy sample:
it's an example sentence.

top_p = 0.60 samples:
here's an example sentence.
it's a short example.
this is an example of the sentence.

