In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import os
import json
import re

In [None]:
# Directory holding one JSON document per publication, named "<Id>.json".
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'

# Training table: the cells below index it by `Id` and read `dataset_label`.
df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Hand-labelled dataset names: the cells below read its `labels` column.
handlabeled = pd.read_csv('../input/coleridgehandlabeled/labels.csv')

In [None]:
def chunk_text(full_text, length=250, overlap=25):
    """Split text into overlapping windows of whitespace-separated words.

    Args:
        full_text: The text to split; it is tokenised with ``str.split()``.
        length: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunks, each a single-space-joined string of at most
        ``length`` words. Consecutive chunks start ``length - overlap`` words
        apart, so the final chunk may be shorter than ``length``. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``overlap >= length``. The window would then never
            advance, which previously resulted in an endless loop.
    """
    step = length - overlap
    if step <= 0:
        raise ValueError(
            'overlap (%r) must be smaller than length (%r)' % (overlap, length)
        )
    words = full_text.split()
    return [
        ' '.join(words[start:start + length])
        for start in range(0, len(words), step)
    ]

In [None]:
def clean_label_text(txt):
    """Normalise a dataset label and drop a trailing parenthesised token.

    Every run of characters other than ASCII letters, digits and parentheses
    is replaced by a single space. If the last remaining token contains
    ``(`` (for example an acronym such as ``(ADNI)``), that token is removed.

    Args:
        txt: The label; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned label with single spaces between tokens, or ``''`` when
        nothing is left (empty or punctuation-only input, or a label made up
        of a single parenthesised token).
    """
    # Raw string: avoids invalid-escape warnings for regex patterns.
    tokens = re.sub(r'[^A-Za-z0-9()]+', ' ', str(txt)).split()
    # Guard against an empty token list before looking at the last token.
    if tokens and '(' in tokens[-1]:
        tokens.pop()
    # split()/join already guarantees single spaces and no outer whitespace.
    return ' '.join(tokens)

def clean_text(txt):
    """Reduce text to ASCII letters, digits and parentheses.

    Every run of any other character (punctuation, whitespace, non-ASCII
    characters) is replaced by a single space, and leading/trailing spaces
    are removed.

    Args:
        txt: The text; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, with tokens separated by exactly one space.
    """
    # Whitespace is outside the allowed character class, so this single pass
    # already collapses it; no separate whitespace-normalising pass is needed.
    return re.sub(r'[^A-Za-z0-9()]+', ' ', str(txt)).strip()

In [None]:
# Build the list of normalised hand-labelled dataset names that will be
# searched for in every document.
# dropna(): clean_label_text() applies str() to its input, so a missing value
# would otherwise turn into the literal label 'nan'.
labels = handlabeled.labels.dropna().unique().tolist()

# A set removes labels that become identical once cleaned.
unique_cleaned = {clean_label_text(label) for label in labels}
# An empty label is a substring of every text, so it can never be meaningful.
unique_cleaned.discard('')

# Longest first, so longer names are tried before the shorter names they may
# contain; ties are broken alphabetically to keep the order reproducible
# (plain set iteration order for strings can differ between runs).
cleaned_labels = sorted(unique_cleaned, key=lambda name: (-len(name), name))

In [None]:
def is_subset(element, items):
    """Tell whether ``element`` is contained in at least one entry of ``items``.

    Containment is evaluated with the ``in`` operator against each entry, so
    for strings this is a substring test (and an entry equal to ``element``
    counts as containing it).

    Args:
        element: The value to look for.
        items: An iterable of containers (e.g. strings) to search.

    Returns:
        True if any entry contains ``element``, otherwise False.
    """
    return any(element in candidate for candidate in items)

In [None]:
# Label every document and every overlapping chunk of its text.
# Produces, for the cells below:
#   y_true          - {document Id: labels from the training table plus any
#                     hand-labelled names found in the document text}
#   freq            - {label: number of chunks it was assigned to}
#   results         - one (Id, '|'-joined chunk labels, chunk text) per chunk
#   is_actual_label - every label assigned to at least one chunk
is_actual_label = set()

# Index the table by Id once. The guard keeps this cell re-runnable: after the
# first run 'Id' is the index and no longer a column.
if 'Id' in df.columns:
    df = df.set_index('Id')
id_input = df.index.unique().to_numpy()

y_true = {}
freq = {}
results = []
for _id in tqdm(id_input):
    # Selecting with a list always yields a Series, whether the document has
    # one row or several, so no str-vs-Series special case is needed.
    doc_labels = df.loc[[_id], 'dataset_label'].tolist()
    y_true[_id] = doc_labels

    # Read the document and release the file handle before processing it.
    json_path = os.path.join(train_files_path, _id + '.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # Concatenate the title and body of every section, then normalise.
    full_text = clean_text(' '.join(
        section['section_title'] + ' ' + section['text'] for section in sections
    ))

    # Add hand-labelled names that occur in the text. is_subset() also covers
    # exact duplicates (a string contains itself), so names already present,
    # or contained in a longer label already present, are skipped.
    # NOTE(review): labels taken from `dataset_label` are used as-is, while
    # the text has gone through clean_text(); a raw label containing characters
    # that clean_text() removes cannot match a chunk below - confirm intended.
    for label in cleaned_labels:
        if label and label in full_text and not is_subset(label, doc_labels):
            doc_labels.append(label.strip())

    for chunk in chunk_text(full_text):
        chunk_labels = []
        # Longest first, so a label nested inside a longer label that was
        # already matched in this chunk is not recorded a second time.
        for candidate in sorted(doc_labels, key=len, reverse=True):
            if candidate in chunk and not is_subset(candidate, chunk_labels):
                # Use the same stripped key for the lookup and the update;
                # testing one form and writing the other resets the count.
                key = candidate.strip()
                chunk_labels.append(key)
                freq[key] = freq.get(key, 0) + 1
        results.append((_id, '|'.join(chunk_labels), chunk))
        is_actual_label.update(chunk_labels)

In [None]:
# Save the chunk table: one row per chunk with its document id, its
# '|'-joined labels and the chunk text.
chunk_columns = ['ids', 'labels', 'chunks']
result_df = pd.DataFrame(data=results, columns=chunk_columns)
result_df.to_csv('data.csv', index=False)

In [None]:
# Save the per-label chunk counts and the per-document label lists as JSON.
for out_name, payload in (('frequencies.json', freq), ('y_true.json', y_true)):
    with open(out_name, 'w') as out_file:
        json.dump(payload, out_file)

In [None]:
# Save every label that was assigned to at least one chunk, one per row under
# a 'labels' header.
# Sorting makes the file reproducible (set iteration order for strings can
# differ between runs), and to_csv() quotes any label containing a comma,
# quote or newline, which hand-written lines would not.
pd.DataFrame({'labels': sorted(is_actual_label)}).to_csv('labels.csv', index=False)