### Setup

In [None]:
# Show the GPU attached to this runtime (driver/CUDA version, memory in use).
!nvidia-smi

Sat Aug 13 18:29:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Utils

In [None]:
%%capture
# Install the project's pinned requirements into the kernel environment;
# the %%capture cell magic above keeps pip's output out of the notebook.
%pip install -r /content/feedback/code/requirements.txt

# Imports

In [None]:
import os
import gc
import glob
import json
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, log_loss
import matplotlib.pyplot as plt
from itertools import chain

from copy import deepcopy
from dataclasses import dataclass

import torch
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint

import math
import shutil

from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from transformers.trainer_pt_utils import get_parameter_names

import pdb
import random
from collections import OrderedDict

from accelerate import Accelerator
from tqdm.auto import tqdm

import re


from datasets import Dataset
from tokenizers import AddedToken
from transformers import AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

from transformers.optimization import  Adafactor


# Config

In [None]:
# Notebook-wide settings: model checkpoint, run hyper-parameters and the
# dataset/model locations (all paths are relative to the working directory).
config = {
    "debug": False,
    "seed": 453,
    "model_checkpoint": "t5-large",

    "batch_size": 2,
    "warmup_pct": 0.025,
    "num_epochs": 5,

    "max_length": 320,
    "model_dir": "../models/unsup_T5_generator",

    "fpe_dataset_dir": "../datasets/feedback-prize-effectiveness",
    "fold_path": "../datasets/processed/cv_map_topics_10_folds.parquet",
    "train_2021_path": "../datasets/feedback-prize-2021/train.csv",
    "train_essay_fpe21_dir": "../datasets/processed/fpe_21_train_essays.parquet",
    "train_essay_fpe22_dir": "../datasets/processed/fpe_22_train_essays.parquet",
    "test_essay_fpe22_dir": "../datasets/processed/fpe_22_test_essays.parquet",
    "n_folds": 10,
}

# Load Data

In [None]:
# 2021 discourse annotations. The position columns are dropped here because
# process_input_df recomputes discourse_start / discourse_end from the text.
df = (
    pd.read_csv(config["train_2021_path"])
    .rename(columns={"id": "essay_id"})
    .drop(columns=["predictionstring", "discourse_start", "discourse_end"])
)

# Essay-level frame, keyed by essay_id after the rename.
essay_df = pd.read_parquet(config["train_essay_fpe21_dir"]).rename(columns={"id": "essay_id"})

# Per-essay prompt / topic lookup; only the three columns used later are kept.
topic_df = pd.read_csv("../datasets/processed/fpe_2021_topics.csv")[
    ["essay_id", "prompt", "topic_num"]
].copy()

In [None]:
def relaxed_search(text, substring, min_length=2, fraction=0.99999):
    """
    Returns the span of the longest findable prefix of ``substring`` in ``text``.

    The full substring is searched first. While it is not found, it is
    shortened to ``int(len * fraction)`` characters (always by at least one
    character) and searched again.

    :param text: text to search in
    :param substring: text to look for
    :param min_length: shortest prefix that is still searched for
    :param fraction: share of the current prefix kept on every retry
    :return: ``[start, end]`` of the first occurrence of the matched prefix,
        or ``[-1, 0]`` when no prefix of at least ``min_length`` characters
        occurs in ``text``
    """
    candidate = substring
    # Iterative on purpose: with the default fraction one character is dropped
    # per step, so recursing once per step would overflow the interpreter's
    # recursion limit for long substrings.
    while True:
        position = text.find(candidate)
        if position != -1:
            return [position, position + len(candidate)]

        # Cap at len - 1 so the loop always makes progress, even for fraction >= 1.
        keep = min(int(len(candidate) * fraction), len(candidate) - 1)
        shorter = candidate[:keep]
        if len(shorter) < min_length:
            return [-1, 0]
        candidate = shorter


def build_span_map(discourse_list, essay_text):
    """
    Maps every discourse text to the list of spans it occupies in the essay.

    Discourses are processed in the given order. For each one, the first exact
    match that ends after the previously accepted match is taken. When no such
    match exists, ``relaxed_search`` supplies an approximate span instead.

    Exactly one span is recorded per entry of ``discourse_list``, so a text
    that appears several times in the list gets one span per occurrence.

    :param discourse_list: discourse texts of one essay
    :param essay_text: full text of that essay
    :return: dict ``discourse text -> list of spans``; exact matches are
        ``(start, end)`` tuples, fallback spans are ``[start, end]`` lists
    """
    reading_head = 0
    span_map = dict()

    for cur_discourse in discourse_list:
        spans = span_map.setdefault(cur_discourse, [])

        exact_span = None
        for match in re.finditer(re.escape(cur_discourse), essay_text):
            # Ignore occurrences that end at or before the last accepted span.
            if match.end() > reading_head:
                exact_span = match.span()
                break

        if exact_span is None:
            # Fall back per occurrence, so a repeated discourse text whose
            # later occurrence cannot be matched still gets its own span.
            print("resorting to relaxed search...")
            spans.append(relaxed_search(essay_text, cur_discourse))
        else:
            spans.append(exact_span)
            reading_head = exact_span[1]

    return span_map


def get_substring_span(texts, mapping):
    """
    Returns one span per text, consuming spans from ``mapping`` in order.

    Each lookup removes the first remaining span of that text from
    ``mapping``, so the mapping is modified in place.
    """
    return [mapping[text].pop(0) for text in texts]


def process_input_df(anno_df, notes_df):
    """Locate every discourse inside its essay and add the character spans.

    :param anno_df: discourse annotations with the columns ``discourse_id``,
        ``essay_id``, ``discourse_text`` and ``discourse_type``
        (``discourse_effectiveness`` is kept when present)
    :type anno_df: pd.DataFrame
    :param notes_df: essay-level frame with ``essay_id`` and ``essay_text``
    :type notes_df: pd.DataFrame
    :return: the selected annotation columns plus ``discourse_start`` and
        ``discourse_end``; the inputs are not modified
    :rtype: pd.DataFrame
    """
    notes_df = deepcopy(notes_df)
    anno_df = deepcopy(anno_df)

    #------------------- Pre-Process Essay Text --------------------------#
    anno_df["discourse_text"] = anno_df["discourse_text"].apply(lambda x: x.strip())  # pre-process
    keep_cols = ["discourse_id", "essay_id", "discourse_text", "discourse_type"]
    if "discourse_effectiveness" in anno_df.columns:
        keep_cols.append("discourse_effectiveness")
    anno_df = anno_df[keep_cols].copy()

    # One row per essay, holding the lists of its discourse ids and texts.
    tmp_df = anno_df.groupby("essay_id")[["discourse_id", "discourse_text"]].agg(list).reset_index()
    tmp_df = pd.merge(tmp_df, notes_df, on="essay_id", how="left")

    # Rows are read by column label: positional access such as row[0] on a
    # string-labelled Series is deprecated in pandas.
    tmp_df["span_map"] = tmp_df[["discourse_text", "essay_text"]].apply(
        lambda row: build_span_map(row["discourse_text"], row["essay_text"]), axis=1)
    tmp_df["span"] = tmp_df[["discourse_text", "span_map"]].apply(
        lambda row: get_substring_span(row["discourse_text"], row["span_map"]), axis=1)

    # Flatten the per-essay lists back to one entry per discourse.
    all_discourse_ids = list(chain.from_iterable(tmp_df["discourse_id"].values))
    all_discourse_spans = list(chain.from_iterable(tmp_df["span"].values))
    span_df = pd.DataFrame({
        "discourse_id": all_discourse_ids,
        "discourse_start": [span[0] for span in all_discourse_spans],
        "discourse_end": [span[1] for span in all_discourse_spans],
    })

    anno_df = pd.merge(anno_df, span_df, on="discourse_id", how="left")
    return anno_df


In [None]:
# Compute discourse spans, then attach the essay columns and the
# prompt/topic columns to every discourse row.
df = process_input_df(df, essay_df)
df = (
    df
    .merge(essay_df, on="essay_id", how="left")
    .merge(topic_df, on="essay_id", how="left")
)

resorting to relaxed search...


In [None]:
# Show up to 500 characters per cell so long text columns are readable.
pd.set_option("display.max_colwidth", 500)

In [None]:
def get_model_input_text(prompt, left_context, discourse_type):
    """
    Builds the generator input: task line, prompt line and context line,
    separated by " || " followed by a newline.
    """
    separator = " || \n"
    return (
        f"Generate {discourse_type}" + separator
        + f"Prompt: {prompt}" + separator
        + f"Context: {left_context}"
    )

def get_model_output_text(discourse_text):
    """
    Returns the generation target for a discourse, which is the discourse
    text itself, unchanged.
    """
    target_text = discourse_text
    return target_text


# Everything in the essay that precedes the discourse.
# Columns are iterated directly instead of row-wise apply with x[0]/x[1]:
# positional access on a string-labelled Series is deprecated in pandas.
# NOTE(review): relaxed_search reports a start of -1 when a discourse cannot be
# located; slicing with -1 then yields the essay minus its last character
# rather than an empty context. Confirm whether such rows should be filtered.
df["left_context"] = [
    essay_text[:discourse_start]
    for essay_text, discourse_start in zip(df["essay_text"], df["discourse_start"])
]

# Generator input for every discourse.
df["model_input"] = [
    get_model_input_text(prompt, left_context, discourse_type)
    for prompt, left_context, discourse_type in zip(
        df["prompt"], df["left_context"], df["discourse_type"]
    )
]

# Generation target for every discourse.
df["model_output"] = df["discourse_text"].apply(get_model_output_text)

In [None]:
# Eyeball one random input/target pair.
df.loc[:, ["model_input", "model_output"]].sample(n=1)

Unnamed: 0,model_input,model_output
142504,"Generate Evidence || \nPrompt: Should you ask multiple people for advice? || \nContext: We make decisions everyday, whether its what to eat for breakfast or a bigger decision like asking somebody on a date. People often consult other people such as their friends, family, and trusted adults before making a decision on something. We do this because getting multiple opinions can help us make the right choice by giving us a broader perspective on the situation. Some people may be biased if the q...","Bias is something that can affect a situation for the worse by giving an opinion that benefits one party or person in the situation. People may cover up or hide their bias to make it look like they are giving an answer from a neutral standpoint. When making a big decision in life it is important when getting advice to avoid getting a single biased answer by consulting more than one person. You may not realize it, but by only getting the opinions of people that you think will agree with you o..."


# Model

In [None]:
# Build the architecture and tokenizer from the base checkpoint named in config.
model = T5ForConditionalGeneration.from_pretrained(config["model_checkpoint"])
tokenizer = T5Tokenizer.from_pretrained(config["model_checkpoint"], model_max_length=config["max_length"])

# Overwrite the weights with the state dict stored in the local checkpoint.
# map_location="cpu" keeps the loaded tensors on the CPU, so loading works on
# hosts without CUDA and does not allocate a second copy on the GPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
ckpt = torch.load("../models/T5_generator/fpe_model_unsupervised.pth.tar", map_location="cpu")
model.load_state_dict(ckpt["state_dict"])

# Release the checkpoint copy before the model is prepared for inference.
del ckpt
gc.collect()
torch.cuda.empty_cache()

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

In [None]:
# Hand the model to Hugging Face Accelerate. generate_text relies on the
# prepared model already sitting on the device used for inference.
accelerator = Accelerator()
model = accelerator.prepare(model)

In [None]:
# Put the model in evaluation mode. The trailing print() makes the cell end on
# a statement, so the notebook does not display the value returned by eval().
model.eval()
print()




# Inference

In [None]:
def generate_text(input_text):
    """
    Generates one text for ``input_text`` with the notebook-level ``model``.

    The input is tokenized with the notebook-level ``tokenizer``, truncated to
    ``config["max_length"]`` tokens, and decoded with beam search combined
    with sampling, so repeated calls may return different texts.

    :param input_text: generator input string
    :return: decoded text of the first returned sequence, without special tokens
    """
    test_tokenized = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=True,
        max_length=config["max_length"],
        padding=False,
    )

    # Send the inputs to whatever device the model lives on, rather than a
    # hard-coded "cuda", so the function also works on CPU-only hosts.
    device = model.device

    beam_outputs = model.generate(
        input_ids=test_tokenized["input_ids"].to(device),
        attention_mask=test_tokenized["attention_mask"].to(device),
        max_length=config["max_length"],
        early_stopping=True,
        num_beams=15,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=2.0,
        do_sample=True,
    )

    # num_return_sequences=1, so only the first sequence needs decoding.
    generated = tokenizer.decode(
        beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    torch.cuda.empty_cache()
    return generated

In [None]:
# Register tqdm's pandas integration.
# NOTE(review): no progress_apply call is visible in this notebook; confirm
# this cell is still needed.
tqdm.pandas()

In [None]:
def generate_aumentation(essay_id, input_df):
    """
    Rewrites one essay by regenerating each of its discourses in order.

    The discourses of ``essay_id`` are sorted by ``discourse_start``. The
    first one is generated from its stored ``model_input``. Every later one is
    generated from a running context made of the previous context, the
    previously generated discourse and the original text between the two
    discourses.

    :param essay_id: id of the essay to augment
    :param input_df: frame with one row per discourse, including the columns
        ``essay_id``, ``essay_text``, ``discourse_start``, ``discourse_end``
        and ``model_input``
    :return: tuple ``(result_df, essay_text, current_essay)``: the discourse
        rows with a ``t5_generated`` column (helper columns dropped), the
        original essay text, and the essay rebuilt from generated discourses
        and original filler text
    """
    example_df = (
        input_df[input_df["essay_id"] == essay_id]
        .sort_values(by="discourse_start")
        .reset_index(drop=True)
    )

    essay_text = example_df["essay_text"].values[0]
    starts = example_df["discourse_start"].tolist()
    ends = example_df["discourse_end"].tolist()

    # fillers[0] is the text before the first discourse; fillers[i + 1] is the
    # original text between discourse i and the next one. The last filler runs
    # to the end of the essay: len(essay_text) is used as the final boundary
    # because a -1 sentinel would cut off the essay's last character.
    next_starts = starts[1:] + [len(essay_text)]
    fillers = [essay_text[:starts[0]]]
    fillers.extend(essay_text[end:nxt] for end, nxt in zip(ends, next_starts))

    generated_texts = []
    prompt = None
    running_context = None

    for idx, (row_id, row) in enumerate(example_df.iterrows()):
        # maxsplit=2 keeps a context that itself contains "||" in one piece.
        components = row.model_input.split("||", 2)
        onset = components[0]

        if idx == 0:
            prompt = components[1]
            running_context = components[2]
        else:
            # Extend the context with the previous generation and the
            # original text that separated it from the current discourse.
            running_context = running_context + f"{generated_texts[-1]}" + f"{fillers[idx]}"

        model_input = " || \n".join([onset, prompt, running_context])
        generated_texts.append(generate_text(model_input))

    result_df = example_df.assign(t5_generated=generated_texts)
    result_df = result_df.drop(columns=["essay_text", "topic_num", "left_context", "model_input", "model_output"])

    # Interleave generated discourses with the original filler text.
    current_essay = fillers[0]
    for generated, filler in zip(generated_texts, fillers[1:]):
        current_essay += generated
        current_essay += filler
    return result_df, essay_text, current_essay

In [None]:
# Distinct essay ids in random order (the shuffle is not seeded); the first
# id is displayed as a quick check.
all_essay_ids = df["essay_id"].drop_duplicates().tolist()
random.shuffle(all_essay_ids)
essay_id = all_essay_ids[0]
essay_id

'E91C501B7AFA'

In [None]:
# Output directory for the files written by the augmentation loop;
# exist_ok=True makes re-running this cell safe.
os.makedirs("../datasets/augmented_data/worker_3", exist_ok=True)

In [None]:
from IPython.core.debugger import set_trace

In [None]:
# Display the working directory; the relative "../datasets" and "../models"
# paths used in this notebook resolve against it.
os.getcwd()

'/content/drive/MyDrive/FPE/working'

In [None]:
# For every essay: regenerate its discourses, then write the per-discourse
# table as CSV and the rebuilt essay as a one-entry JSON file.
for essay_num, essay_id in enumerate(tqdm(all_essay_ids)):
    result_df, original_essay, generated_essay = generate_aumentation(essay_id, df)

    result_df.to_csv(f"../datasets/augmented_data/worker_3/df_{essay_id}.csv", index=False)

    with open(f"../datasets/augmented_data/worker_3/{essay_id}_augmented.json", "w") as f:
        json.dump({f"{essay_id}": generated_essay}, f)

    # Release cached GPU memory between essays.
    torch.cuda.empty_cache()

  0%|          | 0/15594 [00:00<?, ?it/s]