#### Data pipeline for https://www.kaggle.com/c/feedback-prize-2021
* Run POS tagging on the text
* Label each word of the text with a BIO tag, using the tag-to-id mapping in the dict below (mapping taken from https://www.kaggle.com/abhishek/two-longformers-are-better-than-1)

In [105]:
# Discourse types in label-id order. Each type owns two consecutive ids:
# an even id for its "B-" (begin) tag and the following odd id for its "I-" (inside) tag.
DISCOURSE_TYPES = [
    "Lead",
    "Position",
    "Evidence",
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Rebuttal",
]

target_id_map = {
    f"{prefix}-{discourse}": 2 * position + offset
    for position, discourse in enumerate(DISCOURSE_TYPES)
    for offset, prefix in enumerate("BI")
}
# "O" marks words outside any discourse span; "PAD" is the padding label id.
target_id_map.update({"O": 14, "PAD": -100})

In [72]:
#nltk.help.upenn_tagset()

In [109]:
import numpy as np
import pandas as pd
import nltk
import random
import os
from tqdm import tqdm
import glob

# Fetch the NLTK resources this notebook relies on (no-op when already present).
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'tagsets'):
    nltk.download(nltk_package)

# Penn Treebank tag descriptions, keyed by tag name.
tagdict = nltk.data.load('help/tagsets/upenn_tagset.pickle')

# Map every tag name to a stable integer id: its rank in alphabetical order.
tagmap = {tag_name: tag_id for tag_id, tag_name in enumerate(sorted(tagdict))}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajiv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rajiv\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\rajiv\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [94]:
# Labelled discourse spans of the training essays (one row per span).
df = pd.read_csv("../../data/train.csv")

# Distinct numbered discourse labels present in the training data.
df["discourse_type_num"].unique()

array(['Lead 1', 'Position 1', 'Evidence 1', 'Evidence 2', 'Claim 1',
       'Evidence 3', 'Evidence 4', 'Claim 2', 'Evidence 5',
       'Concluding Statement 1', 'Counterclaim 1', 'Rebuttal 1',
       'Claim 3', 'Claim 4', 'Claim 5', 'Claim 6', 'Claim 7',
       'Counterclaim 2', 'Rebuttal 2', 'Counterclaim 3', 'Rebuttal 3',
       'Evidence 6', 'Lead 2', 'Counterclaim 4', 'Counterclaim 5',
       'Counterclaim 6', 'Evidence 7', 'Claim 8', 'Evidence 8',
       'Concluding Statement 2', 'Rebuttal 4', 'Rebuttal 5', 'Claim 9',
       'Position 2', 'Claim 10', 'Claim 11', 'Claim 12', 'Evidence 9',
       'Concluding Statement 3', 'Concluding Statement 4', 'Evidence 10',
       'Evidence 11', 'Rebuttal 6', 'Evidence 12'], dtype=object)

### Append 2 columns for (a) POS tags and (b) BIO tags

Create a lookup from (file name, word index) to `discourse_type_num` (e.g. `Lead 1`).

In [92]:
# Lookup: (essay id, word index) -> numbered discourse label (e.g. "Lead 1").
# `predictionstring` holds the space-separated word indices covered by a span.
# Iterating the three columns directly avoids transposing the whole frame into
# a dict of dicts just to walk its rows.
l_dict = {
    (str(essay_id), int(word_idx)): label
    for essay_id, label, word_indices in zip(
        df["id"], df["discourse_type_num"], df["predictionstring"]
    )
    for word_idx in word_indices.split()
}


Read in all files in the training set and generate POS tags. Generate a dataframe with columns `id, discourse_type, discourse_text, predictionstring, pos_tags, bio_tags`:
* go through each essay id in the labeled dataset and read in the corresponding file
* run POS tagging using NLTK
* for each file read in, split it into words and check whether each word index exists in the labeled data
* create a list of dicts with dict keys = columns

In [96]:
def segment_essay(f_name, tokens, pos_ids, label_lookup, tag_ids):
    """Split one essay into contiguous runs of words that share a discourse label.

    Args:
        f_name: essay id; first component of the ``label_lookup`` key.
        tokens: whitespace-split words of the essay.
        pos_ids: POS-tag id of each word (same length as ``tokens``).
        label_lookup: dict mapping ``(essay id, word index)`` to a numbered
            label such as ``"Lead 1"``. Missing word indices are treated as ``"O"``.
        tag_ids: dict mapping BIO tag names (``"B-Lead"``, ``"I-Lead"``, ``"O"``, ...)
            to integer ids.

    Returns:
        A list of row dicts with keys ``id``, ``discourse_type``,
        ``discourse_text``, ``predictionstring``, ``pos_tags`` and ``bio_tags``.
        Every key except ``id`` and ``discourse_type`` holds a list with one
        entry per word of the run. Returns ``[]`` when ``tokens`` is empty.
    """
    rows = []
    current = None      # row for the run currently being accumulated
    prev_class = None   # numbered label of the previous word
    for idx, token in enumerate(tokens):
        c_class = label_lookup.get((f_name, idx), "O")
        # Drop the trailing counter: "Concluding Statement 1" -> "Concluding Statement".
        disc_type = "O" if c_class == "O" else c_class.rpartition(" ")[0]

        if current is None or c_class != prev_class:
            # The label changed (or this is the first word): close the previous
            # run and open a new one whose first word gets the "B-" tag.
            if current is not None:
                rows.append(current)
            bio_name = "O" if c_class == "O" else "B-" + disc_type
            current = {
                "id": f_name,
                "discourse_type": disc_type,
                "discourse_text": [token],
                "predictionstring": [idx],
                "pos_tags": [pos_ids[idx]],
                "bio_tags": [tag_ids[bio_name]],
            }
        else:
            # Same run continues: later words get the "I-" tag.
            bio_name = "O" if c_class == "O" else "I-" + disc_type
            current["discourse_text"].append(token)
            current["predictionstring"].append(idx)
            current["pos_tags"].append(pos_ids[idx])
            current["bio_tags"].append(tag_ids[bio_name])
        prev_class = c_class

    # Flush the final run. It carries the plain discourse type like every other
    # row (previously the last row stored the numbered label, e.g. "Lead 1").
    if current is not None:
        rows.append(current)
    return rows


e_data = []
for f_name in tqdm(df.id.unique()):
    f_name = str(f_name)
    # Explicit encoding so tokenisation does not depend on the platform default
    # (assumes the essay files are UTF-8 -- TODO confirm against the dataset).
    with open(f"../../data/train/{f_name}.txt", encoding="utf-8") as t_file:
        f_content = t_file.read().split()
    if not f_content:
        # Nothing to tag; avoids emitting a row with no words and no type.
        continue
    # Unknown tags (not in the Penn tagset table) are mapped to -1.
    pos_tags = [tagmap.get(tag, -1) for _, tag in nltk.pos_tag(f_content)]
    e_data.extend(segment_essay(f_name, f_content, pos_tags, l_dict, target_id_map))


100%|██████████| 15594/15594 [02:55<00:00, 88.97it/s] 


In [97]:
# One dataframe row per entry of `e_data`; list-valued fields are kept as Python lists.
ldata_df = pd.DataFrame(data=e_data)

In [None]:
# Spot-check the generated rows for a single essay.
sample_rows_mask = ldata_df["id"] == "4C471936CD75"
ldata_df[sample_rows_mask]

In [None]:
# The original labelled spans of the same essay, for comparison.
sample_source_mask = df["id"] == "4C471936CD75"
df[sample_source_mask]

In [104]:
# Persist the span-level table (the dataframe index is written as the first column).
ldata_df.to_csv(path_or_buf="../../data/train_v2.csv")

In [None]:
# Remove previously generated label files. Pure Python instead of the
# Windows-only `del` shell command, so the cell runs on any platform.
# NOTE: unlike `del /f`, os.remove does not force-delete read-only files.
for gt_path in glob.glob("../../data/train/*.gt"):
    os.remove(gt_path)

In [119]:
# Write one "<essay id>.gt" file per essay containing its BIO tag ids, each
# followed by a single space, in word order.
#
# The tags of an essay are collected first and the file is written once in "w"
# mode. The previous version appended row by row even when .gt files already
# existed, so re-running the cell duplicated the labels in every file.

# Not used below; kept (guarded against empty data) in case other cells read it.
c_file = e_data[0]['id'] if e_data else None

if glob.glob("../../data/train/*.gt"):
    print("labeled files exist. They will be overwritten")

# dict preserves insertion order, so essays and their tags keep the e_data order.
tags_by_essay = {}
for row in e_data:
    tags_by_essay.setdefault(row['id'], []).extend(row["bio_tags"])

for essay_id, bio_tags in tags_by_essay.items():
    f_name = essay_id + ".gt"
    with open(f"../../data/train/{f_name}", "w") as f:
        f.write("".join(f"{tag} " for tag in bio_tags))

