In [1]:
import numpy as np
import pandas as pd
import nltk
import random
from transformers import BertTokenizer
import torch
import os
from sklearn import preprocessing

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajiv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv("../data/train.csv")
le = preprocessing.LabelEncoder()
le.fit(df.id)
df['f_id'] = le.transform(df.id)
df.head()
f_mapping = df[["id", "f_id"]].drop_duplicates()
f_mapping.to_csv("../inputs/f_mapping.csv")

In [3]:
df[df.discourse_id=="0FB0700DAF44"]

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,f_id


In [4]:
df.head()
discource_types = {dtype: idx for idx, dtype in enumerate( sorted(df.discourse_type.unique().tolist()))}
num_types = max(discource_types.values())+1
discource_types["None"] = num_types

In [12]:
discource_types

{'Claim': 0,
 'Concluding Statement': 1,
 'Counterclaim': 2,
 'Evidence': 3,
 'Lead': 4,
 'Position': 5,
 'Rebuttal': 6,
 'None': 7}

### Go through train.csv, split each file into sentences and identify percentage of sentences that have 2 or more classes

In [5]:
from collections import defaultdict, OrderedDict
idx_map = dict()
#df = df.query(' id == "423A1CA112E2" ')
for _, row in df.iterrows():
    f_id, f_name, discourse_type, p_string = row["f_id"], row["id"], row["discourse_type"], row["predictionstring"]
    preds = [int(p.strip()) for p in p_string.split()]
    for p in preds:
        idx_map[(f_id, p)] = discourse_type

sent_labels = OrderedDict()
line_idxs = dict()
tot_sents = 0
for _, row in df.iterrows():
    f_id, f_name, discourse_type, p_string = row["f_id"], row["id"], row["discourse_type"], row["predictionstring"]
    with open(f"../data/train/{f_name}.txt") as f:
        txt = f.read()
        w_index = 0
        wind_start, wind_end = 0, 0
        for s_num, line in enumerate(nltk.sent_tokenize(txt)):
            counts = defaultdict(int)
            tot_sents += 1
            words = line.split()
            wind_end += len(words) -1 #we want to save the end word index prior to truncate
            if len(words) > 440: # truncate very long sentences so that they can fit into 512 tokens limit
                words = line.split()[:440]
                line = " ".join(words)
            for i, w in enumerate(words):
                if (f_id, w_index) in idx_map:
                    counts[idx_map[(f_id, w_index)]] += 1
                else:
                    counts["None"] += 1
                w_index += 1
            sent_labels[(f_id, s_num, line)] = [(d_type, count) for d_type, count in counts.items()]
            line_idxs[(f_id, s_num, line)] = (wind_start, wind_end)
            wind_start = wind_end+1


In [6]:
sent_labels[list(sent_labels.keys())[0]]

[('None', 1), ('Lead', 8)]

### Less than 1% of sentences have multiple classes. 99% belong to a single class

In [7]:
print("Total percentage of sentences that have multiple classes ", len([s for s, v in sent_labels.items() if len(v) >= 2]) / tot_sents)
sent_labels2 = defaultdict(list) # key = filename, value = list of (sentence, label)
for s, labels in sent_labels.items():
    totcnt = sum([count for d_type, count in labels])
    pcts = {d_type:count/totcnt for d_type, count in labels}
    maj_dtype = None
    mval = None    
    for dtype, pct in pcts.items():
        if mval is None or mval < pct:
            maj_dtype = dtype
    sent_labels2[s[0]].append((s[2], maj_dtype, *line_idxs[s]))

Total percentage of sentences that have multiple classes  0.008408270973207461


In [8]:
#sent_labels2[list(sent_labels2.keys())[0]]
#print(list(sent_labels2.keys())[0])

## Basic model - identify sentences that have some discourse type vs the ones that have none

99% of sentences belong to a single class. We will split the text by sentence and build a classifier that looks at each sentence independently as a start



In [9]:
# find average number of lines in each file
import numpy as np

linecnts = []
wordcnts = []
for f_name in list(sent_labels2.keys())[:10]:
    linecnt = len(sent_labels2[f_name])
    w_cnt = 0
    for line in sent_labels2[f_name]:
        w_cnt += len(line[0].split())
    linecnts.append(linecnt)
    wordcnts.append(w_cnt)
print("median number of lines", np.median(np.array(linecnts)))
print("median number of words", np.median(np.array(wordcnts)))

median number of lines 24.0
median number of words 387.0


#### Tokenize input per BERT

In [10]:
class DataProcessor:
    def __init__(self, writings):
        klist = list(writings.keys())
        random.shuffle(klist)
        self.writings = [(k, writings[k]) for k in  klist]
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

        self.train_out = []
        self.validation_out = []

    def split(self):
        '''
        train, validation and test split by meeting
        '''
        # split 90-10 train/validation
        tr_len = 0.9*len(self.writings)
        tr_len = int(round(tr_len))

        self.train_list = self.writings[:tr_len]
        self.validation_list = self.writings[tr_len:]

    
    def extract(self, tokens, labels, wbounds):
        last_cutoff = None
        labs = []
        wbd = []
        l_idx = 0
        for i, v in enumerate(tokens):
            if i == 512:
                return tokens[:last_cutoff+1], labs, wbd
            if v == "[SEP]":
                last_cutoff = i
                labs.append(labels[l_idx])
                wbd.append(wbounds[l_idx])
                l_idx += 1
        return tokens, labs, wbd

    def format(self, f_id, chunk, labels, s_wbounds):
        cls_vid = self.tokenizer.vocab["[CLS]"]
        sep_vid = self.tokenizer.vocab["[SEP]"]

        input_ids = self.tokenizer.convert_tokens_to_ids(chunk)
        attn_masks = [1]*len(input_ids)
        cls_ids = [i for i, t in enumerate(input_ids) if t == cls_vid ]
        mask_cls = [1 for _ in range(len(cls_ids))]
        wstarts = [st for st, _ in s_wbounds]
        wends = [en for _, en in s_wbounds]
        oh_labels = [[0]*(num_types+1) for _ in range(512)]
        for lnum, lab in enumerate(labels):
            oh_labels[lnum][lab] = 1

        [attn_masks.append(0) for _ in range(len(attn_masks), 512)]
        [input_ids.append(0) for _ in range(len(input_ids), 512)]
        [cls_ids.append(0) for _ in range(len(cls_ids), 512)]
        [mask_cls.append(0) for _ in range(len(mask_cls), 512)]

        _segs = [-1] + [i for i, t in enumerate(input_ids) if t == sep_vid]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segments_ids = []
        for i, s in enumerate(segs):
            if (i % 2 == 0):
                segments_ids += s * [0]
            else:
                segments_ids += s * [1]

        #[cur_labels.append(0) for _ in range(len(cur_labels), 512)]
        [segments_ids.append(0) for _ in range(len(segments_ids), 512)]
        [wstarts.append(0) for _ in range(len(wstarts), 512)]
        [wends.append(0) for _ in range(len(wends), 512)]

        b_data_dict = {"id": f_id, "src": input_ids, "labels": oh_labels, "segs": segments_ids, 
                    'clss': cls_ids, "attn": attn_masks, "mask_cls":mask_cls,
                    "wstarts": wstarts, "wends":wends}
        return b_data_dict


    def format_to_bert(self, args=None):

        for ds, ds_out in ((self.train_list, self.train_out), (self.validation_list, self.validation_out)):
            for f_id, lines in ds:
                tokens = "".join(["[CLS]".__add__(l).__add__("[SEP]") for l, _, _, _ in lines])
                labels = [discource_types[lab] for _, lab, _, _ in lines]
                    
                w_bounds = [(wstart, wend) for _, _, wstart, wend in lines]
                remain = self.tokenizer.tokenize(tokens)
                while len(remain) != 0:
                    tmp_tokens, tmp_labels, tmp_wbounds = self.extract(remain, labels, w_bounds)
                    remain = remain[len(tmp_tokens):]
                    labels = labels[len(tmp_labels):]
                    w_bounds = w_bounds[len(tmp_wbounds):]
                    ds_out.append(self.format(f_id, tmp_tokens, tmp_labels, tmp_wbounds))

    def save(self, out_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        for batch, ds in (("train", self.train_out), ("validation", self.validation_out)):
            out = dict()
            for k, v in ds[0].items():
                out[k] = []

            for sample in ds:
                for key, val in sample.items():
                    out[key].append(val)
            for k, v in out.items():
                out[k] = torch.LongTensor(v)
            for k, v in out.items():
                torch.save(v, out_dir+"/"+k+"_"+batch+".pt")


dp = DataProcessor(sent_labels2)
dp.split()
dp.format_to_bert()

In [11]:
dp.save("C:/Users/rajiv/dev/evalwriting/inputs")