### In this notebook, I sorted sections by their title score.

- Each title is scored by the number of documents in which a section with that title contained the first occurrence of a dataset name.

- The idea is, when you sort sections like this, uppermost sections have a higher chance of having a dataset name in them.

- existing_labels: code from https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/232964

In [None]:
import json
import pandas as pd
import numpy as np
import glob
import os
import re
from tqdm import tqdm
import nltk
import random
from nltk.tokenize import word_tokenize,sent_tokenize

# Label metadata shipped with the competition data.
metadata = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Paths of the training JSON files.
train_example_paths = glob.glob('../input/coleridgeinitiative-show-us-the-data/train/*.json')

# Document ids: each directory entry's name up to its first dot.
train_example_names = [
    entry.partition('.')[0]
    for entry in os.listdir('../input/coleridgeinitiative-show-us-the-data/train')
]

# Independent copy of the ids; this is the list of documents that gets processed.
docIdx = list(train_example_names)

In [None]:
def load_train_example_by_name(name):
    """Load one training document from the competition's train directory.

    Args:
        name: File name of the document without the ``.json`` extension.

    Returns:
        The parsed JSON content of ``<train dir>/<name>.json``.

    Raises:
        FileNotFoundError: If no such file exists.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    doc_path = os.path.join('../input/coleridgeinitiative-show-us-the-data/train', name + '.json')
    # Read as UTF-8 explicitly: without it open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII text in the documents.
    with open(doc_path, encoding='utf-8') as f:
        data = json.load(f)
    return data

def text_cleaning(text):
    """Normalise a piece of text so sentences and dataset labels can be compared.

    Steps, in order:
      1. bracketed numeric citations such as ``[12]`` become ``specialreference``;
      2. URLs (``http://``, ``https://`` or ``www.`` prefixes) become
         ``specialwebsite``;
      3. every run of non-alphanumeric characters becomes a single space;
      4. four-digit sequences starting with 19 or 20 become ``specialyear``;
      5. remaining digits are dropped and whitespace runs are collapsed;
      6. any leftover token containing ``http`` or ``www`` becomes
         ``specialwebsite``;
      7. the result is lower-cased.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string. A leading or trailing single space can
        remain when the text starts or ends with a year or a number.
    """
    text = str(text)

    # Citations and URLs are recognised by their punctuation, so they have to be
    # handled BEFORE punctuation is stripped; afterwards "[12]" is just "12"
    # and a URL is a handful of unrelated word fragments.
    text = re.sub(r'\[[0-9]+\]', ' specialreference ', text)
    text = re.sub(r'(?:https?://|www\.)\S+', ' specialwebsite ', text, flags=re.IGNORECASE)

    # remove unnecessary literals
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text).strip()

    # Remove years
    text = re.sub(r'(19|20)[0-9][0-9]', ' specialyear ', text)

    # remove other digits
    text = re.sub(r'\d+', ' ', text)

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Catch website-like tokens that were not written as a full URL
    text = ' '.join('specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' '))

    return text.lower()

In [None]:
import string

# Clean every known spelling of each dataset name with the same routine that is
# applied to sentences, so labels and sentences can be compared directly.
temp_1 = [text_cleaning(x) for x in metadata['dataset_label']]
temp_2 = [text_cleaning(x) for x in metadata['dataset_title']]
temp_3 = [text_cleaning(x) for x in metadata['cleaned_label']]

# De-duplicate. Labels that are empty or whitespace-only after cleaning are
# dropped: they would be a substring of practically every sentence.
existing_labels = {
    label.lower() for label in temp_1 + temp_2 + temp_3 if label.strip()
}

# Sort labels by length in descending order. Ties are broken alphabetically:
# set iteration order changes between interpreter runs, so sorting on length
# alone would make the matching order non-reproducible.
existing_labels = sorted(existing_labels, key=lambda label: (-len(label), label))

## Extract Information

In [None]:
# Module-level accumulators, each starting out as its own empty list.
# process_doc() appends one entry per processed document to
# doc_label_section_idx, doc_label_sentence_idx, doc_label_list,
# first_label_sec_name, first_labels and n_secs.
pos_sentences, neg_sentences = [], []
doc_label_section_idx, doc_label_sentence_idx, doc_label_list = [], [], []
first_label_sec_name, first_labels = [], []
n_secs = []

def process_doc(doc_id):
    """Find every known dataset label in one training document.

    The document is loaded with load_train_example_by_name(), each section's
    'text' is split into sentences with sent_tokenize(), and every cleaned
    sentence is searched for each entry of existing_labels (plain substring
    test, in the order of that list).

    Results are appended to the module-level accumulators, one entry per call:
      * n_secs                 - number of sections in the document
      * doc_label_section_idx  - list of section indices, one per label hit
      * doc_label_sentence_idx - list of document-wide sentence indices per hit
      * doc_label_list         - list of the matched (cleaned) labels
      * first_label_sec_name   - 'section_title' of the section holding the
                                 first hit, or 'NOT FOUND'
      * first_labels           - the first matched label, or 'NOT FOUND'

    Args:
        doc_id: Document name as accepted by load_train_example_by_name().

    Returns:
        None.
    """
    doc_json = load_train_example_by_name(doc_id)
    this_doc_label_section_idx = []
    this_doc_label_sentence_idx = []
    this_doc_label_list = []
    i_doc_sent = -1  # incremented before use, so the first sentence gets index 0
    doc_first_label = True
    n_secs.append(len(doc_json))

    for i_sec, section in enumerate(doc_json):
        for sentence in sent_tokenize(section['text']):
            i_doc_sent += 1
            clean_sentence = text_cleaning(sentence)

            for clean_label in existing_labels:
                if clean_label not in clean_sentence:
                    continue

                if doc_first_label:
                    first_label_sec_name.append(section['section_title'])
                    first_labels.append(clean_label)
                    doc_first_label = False

                this_doc_label_section_idx.append(i_sec)
                this_doc_label_sentence_idx.append(i_doc_sent)
                this_doc_label_list.append(clean_label)
                # Cut the matched label out of the sentence so that labels
                # tried later cannot match inside the text it occupied.
                clean_sentence = clean_sentence.replace(clean_label, '')

    doc_label_section_idx.append(this_doc_label_section_idx)
    doc_label_sentence_idx.append(this_doc_label_sentence_idx)
    doc_label_list.append(this_doc_label_list)
    if doc_first_label:
        # Keep the first_* lists the same length as the other accumulators.
        first_label_sec_name.append('NOT FOUND')
        first_labels.append('NOT FOUND')

In [None]:
# process_doc() only ever appends, so running this cell a second time would
# duplicate every entry and break the positional alignment with docIdx.
# Empty the per-document accumulators first so the cell can be re-run safely.
for _acc in (doc_label_section_idx, doc_label_sentence_idx, doc_label_list,
             first_label_sec_name, first_labels, n_secs):
    _acc.clear()

for doc_id in tqdm(docIdx):
    process_doc(doc_id)

## Get Section Title Info

In [None]:
def process_sec_name(text):
    """Normalise a section title for counting and matching.

    Everything that is not an ASCII letter is turned into a space, single-letter
    tokens are dropped, and the result is lower-cased.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The normalised title; an empty string if nothing usable is left.
    """
    # remove unnecessary literals
    text = re.sub(r'[^A-Za-z]+', ' ', str(text)).strip()

    # remove extra spaces (raw string: '\s' is not a valid str escape sequence)
    text = re.sub(r'\s+', ' ', text)

    # single letters are dropped as well
    text = ' '.join([t for t in text.split(' ') if len(t) > 1])

    return text.lower()

# Tally how often each raw section title held a document's first label hit.
# process_doc() records the placeholder 'NOT FOUND' for documents without any
# hit; it is not a section title, so it is kept out of the tally.
section_order = pd.Series(
    [title for title in first_label_sec_name if title != 'NOT FOUND'],
    dtype=object,
).value_counts().to_frame().reset_index()
section_order.columns = ['sec_name', 'cnt']

# Normalise the titles and merge the counts of titles that become identical.
section_order.sec_name = section_order.sec_name.apply(process_sec_name)
section_order = section_order.groupby('sec_name')['cnt'].sum().to_frame().reset_index()
section_order.columns = ['sec_name', 'cnt']
# Titles that normalise to an empty string carry no information.
section_order = section_order.loc[section_order.sec_name.str.len() > 0]

# Consolidate rows that contain 'data' or 'sample' into a single 'data' row
sec_cons = section_order.sec_name.str.contains('data') | section_order.sec_name.str.contains('sample')
count_sum_data = section_order.loc[sec_cons, 'cnt'].sum()
section_order = section_order.loc[~sec_cons].reset_index(drop = True)
section_order.loc[len(section_order)] = ['data', count_sum_data]

# Consolidate rows that contain 'study' into a single 'study' row
sec_cons = section_order.sec_name.str.contains('study')
count_sum_data = section_order.loc[sec_cons, 'cnt'].sum()
section_order = section_order.loc[~sec_cons].reset_index(drop = True)
section_order.loc[len(section_order)] = ['study', count_sum_data]

# Keep only titles seen more than 10 times, highest count first.
section_order = section_order.loc[section_order.cnt > 10]
section_order = section_order.sort_values(by = 'cnt', ascending= False).reset_index(drop = True)

section_order.to_csv('section_order.csv', index = False)

In [None]:
# Display the title-score table: one row per normalised section title
# (sec_name) with its count (cnt), restricted to cnt > 10, highest first.
section_order

## Sort Sections

In [None]:
def sort_doc_sections(doc_secs, scores=None):
    """Order a document's sections so the highest-scoring titles come first.

    A section's score is the sum of 'cnt' over every row of the score table
    whose 'sec_name' contains the section's normalised title or is contained
    in it. Titles that normalise to fewer than 4 characters score 0.

    Args:
        doc_secs: List of dicts, each with a 'section_title' field. Every dict
            is modified in place: a 'score' field is added (or overwritten).
        scores: Optional table with 'sec_name' and 'cnt' columns. Defaults to
            the module-level ``section_order``.

    Returns:
        A new list holding the same dicts sorted by 'score', highest first.
        Sections with equal scores keep their original relative order.
    """
    if scores is None:
        scores = section_order

    # The table does not change while scoring, so read it once instead of
    # filtering the DataFrame again for every section.
    scored_titles = list(zip(scores['sec_name'], scores['cnt']))

    for sec in doc_secs:
        section_title = process_sec_name(sec['section_title'])
        if len(section_title) < 4:
            # Too short to match meaningfully: it would be a substring of
            # many unrelated titles.
            sec['score'] = 0
            continue

        # sum scores of all matches; plain int so the value has one type
        # regardless of whether anything matched
        sec['score'] = int(sum(
            cnt for name, cnt in scored_titles
            if section_title in name or name in section_title
        ))

    return sorted(doc_secs, key=lambda sec: sec['score'], reverse=True)

## Example

In [None]:
# Position (within train_example_names) of the document used for the demo.
ex_i = 100

# Only the section titles are needed for scoring, so drop every other field.
doc_json = [
    {'section_title': sec['section_title']}
    for sec in load_train_example_by_name(train_example_names[ex_i])
]

# What was recorded for this document: where its first label hit occurred.
print(f'Section: {first_label_sec_name[ex_i]}')
print(f'Dataset Name: {first_labels[ex_i]}')

# The cell's output: the sections with their scores, best first.
sort_doc_sections(doc_json)