# purpose

- The dataset label must appear somewhere in the article, so let's find out where it is located. Intuitively, the label is likely to appear in sections like "dataset", "methodology", .... Thus, we may be able to ignore the irrelevant sections.

- In addition, I will give a statistical EDA of the sections. Is there a common section structure across these papers?



# functions

In [None]:
# from https://www.kaggle.com/jamesmcguigan/coleridge-string-literals
import numpy as np 
import pandas as pd
import simplejson
import re
import pydash
import sys
import os
from collections import defaultdict
from typing import *
from joblib import Parallel, delayed
from glob import glob


def clean_text(text: str) -> str:
    """Lower-case *text* and reduce it to alphanumeric words separated by single spaces.

    The input is passed through ``str()`` first, so non-string values are
    cleaned via their string form. Every run of characters outside
    ``A-Za-z0-9`` becomes one space, and leading/trailing whitespace is removed.
    """
    lowered = str(text).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()
def clean_texts(texts: List[str]) -> List[str]:
    """Return a new list holding ``clean_text(t)`` for every ``t`` in *texts*, in order."""
    return list(map(clean_text, texts))

def read_json(index: str, test_train="test") -> List[Dict[str, Any]]:
    """Load the parsed JSON document of one publication.

    Args:
        index: Publication id; the file read is ``<index>.json``.
        test_train: Sub-folder of the competition data directory to read from
            (``"test"`` by default).

    Returns:
        The decoded JSON content. Callers in this notebook treat it as a list
        of section records with ``"section_title"`` and ``"text"`` keys.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    filename = f"../input/coleridgeinitiative-show-us-the-data/{test_train}/{index}.json"
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, encoding="utf-8") as f:
        return simplejson.load(f)
        
def json2text(index: str, test_train="test") -> str:
    """Return the whole publication as one cleaned, space-separated string.

    Each section contributes its title followed by its text; every such piece
    is cleaned with ``clean_texts`` and the pieces are joined with single spaces.
    """
    pieces = []
    for section in read_json(index, test_train):
        pieces.append(section["section_title"] + " " + section["text"])
    return " ".join(clean_texts(pieces))


def extract_label(text: str, lookup: Dict[str, Set[str]]) -> str:
    """Return the single most common label whose lookup values occur in *text*.

    Args:
        text: Text to search; matching is plain substring containment.
        lookup: Maps each label to the set of strings that indicate it.

    Returns:
        The whitespace-stripped label that matched most often. A label can
        score more than once only when several lookup keys strip to the same
        string; ties go to the label encountered first in ``lookup`` order.
        Returns ``""`` when no value matches.
    """
    # Import the real Counter explicitly: at file level the name is only
    # available through the `from typing import *` wildcard.
    from collections import Counter

    # A generator lets any() stop at the first matching value.
    labels = [
        label.strip()
        for label, values in lookup.items()
        if any(value in text for value in values)
    ]
    if not labels:
        return ""
    # NOTE: for multi-label output, "|".join(set(labels)) could be returned instead.
    return Counter(labels).most_common(1)[0][0]

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)


import plotly.graph_objects as go
import plotly.express as px

# prepare dataset

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm
# Root directory of the competition data; train.csv is read from here.
datapath = Path('../input/coleridgeinitiative-show-us-the-data/') 

## training set label and index

In [None]:
# Load the training index; later cells read its 'Id', 'pub_title' and
# 'cleaned_label' columns.
train_df = pd.read_csv(datapath / 'train.csv')
print(train_df.shape)  # (number of rows, number of columns)
train_df.head()

## read json text

In [None]:
# Build `sections_df`: one row per (train.csv row, section of that paper) with
#   id               - publication id
#   section          - cleaned section title ('title' for the publication title)
#   label_in_section - whether the row's cleaned label occurs in the cleaned text
#   word_count       - number of words in the cleaned text
if Path('./sections_df.pickle').exists():
    # Reuse the cached table written by a previous run.
    sections_df = pd.read_pickle('./sections_df.pickle')
else:
    # Slow path (the original note estimated around 30 minutes): parse every paper.
    # Rows are gathered in a plain list and turned into a DataFrame once;
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    section_records = []
    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        paper_df = pd.DataFrame(read_json(row['Id'], 'train'))
        # Put the publication title in front as a pseudo-section named 'title'.
        sections = ['title'] + clean_texts(paper_df['section_title'].to_list())
        texts = clean_texts([row['pub_title']] + paper_df['text'].to_list())
        for sect, text in zip(sections, texts):
            section_records.append({
                'id': row['Id'],
                'section': sect,
                'label_in_section': row['cleaned_label'] in text,
                'word_count': count_words(text),
            })
    # Explicit columns keep the schema stable even when no rows were collected.
    sections_df = pd.DataFrame(
        section_records,
        columns=['id', 'section', 'label_in_section', 'word_count'],
    )

In [None]:
# Cache the section table on disk so later runs can load it instead of
# re-parsing every paper.
sections_df.to_pickle('./sections_df.pickle')

### statistics of sections info

In [None]:
# number of distinct section titles vs. total number of section rows
print(f"number of unique sections {sections_df['section'].nunique()} / {len(sections_df)}")

# how often each cleaned section title occurs (value_counts sorts most frequent first)
section_counts = sections_df['section'].value_counts()
print('the top 20 sections')
section_counts.head(20)


- Common sections like "abstracts", "introduction", "results", "methods", "discussions", "conclusions" are indeed popular

In [None]:
# the last 20 entries of the section-title counts
section_counts.tail(20)

In [None]:
# Probability that the dataset label appears in each section title
# (mean of `label_in_section` per title), joined with how often the title occurs.
prob_labels = (
    sections_df
    .groupby('section')[['label_in_section']]
    .mean()
    .join(section_counts)
)
prob_labels.columns = ['prob label in section', 'section counts']

In [None]:
# display the full per-section table
prob_labels

- There are indeed many mis-formatted sections.

In [None]:
# the 20 most frequent section titles, with their label probability
prob_labels.sort_values('section counts', ascending=False).head(20)


- dataset label seems to appear in many sections

In [None]:
# the 20 section titles with the highest label probability, regardless of how often they occur
prob_labels.sort_values('prob label in section', ascending=False).head(20)

- I should look at the same trend restricted to sections that occur often enough (a minimum section count).

In [None]:
# same ranking, keeping only section titles that occur more than 20 times
prob_labels[prob_labels['section counts'] > 20].sort_values('prob label in section', ascending=False).head(20)