In [None]:
# Upper bound used to slice the rows read from train.csv (``train[:MAX_SAMPLE]``);
# ``None`` keeps every row, a small integer gives a quick debugging run.
MAX_SAMPLE = None

In [None]:
# !pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
# !pip install -q ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
# !pip install -q ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
# !pip install -q ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline, AutoConfig

from IPython.display import clear_output

# Seed Python's and NumPy's global generators so random draws are repeatable.
random.seed(123)
np.random.seed(456)

# Apply seaborn's default plot theme, then wipe any output produced by this cell.
sns.set()
clear_output()

In [None]:
# model_checkpoint = "bert-base-cased"

# MAX_LENGTH = 64
# OVERLAP = 20

# DATASET_SYMBOL = '$' # this symbol represents a dataset name
# NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

In [None]:
# train
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Read the label file, optionally truncate it to the first MAX_SAMPLE rows
# (MAX_SAMPLE=None keeps everything) and keep a single row per publication:
# the last one in file order. Built as a chain that returns a new frame rather
# than de-duplicating a slice in place.
train = (
    pd.read_csv(train_path)
    .iloc[:MAX_SAMPLE]
    .drop_duplicates(subset='Id', keep='last')
)
# Alternative kept for reference:
# Group by publication, training labels should have the same form as expected output.
# train = train.groupby('Id').agg({
#     'pub_title': 'first',
#     'dataset_title': '|'.join,
#     'dataset_label': '|'.join,
#     'cleaned_label': '|'.join
# }).reset_index()

print('train size: ', len(train))
# Never request more rows than exist: sample() raises when n exceeds the
# population, which happens when MAX_SAMPLE is set to a small debugging value.
train.sample(min(5, len(train)))

In [None]:
def clean_training_text(txt):
    """
    Case-preserving variant of ``clean_text``.

    ``txt`` is converted with ``str()``, every run of characters other than
    ASCII letters and digits becomes a single space, and leading/trailing
    whitespace is removed.
    """
    as_text = str(txt)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return spaced.strip()

def clean_text(txt):
    """
    Normalise ``txt`` for matching: ``str()`` conversion, lower-casing, every
    run of characters other than ASCII letters and digits replaced by one
    space, and surrounding whitespace stripped.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()

def totally_clean_text(txt):
    """Clean ``txt`` with ``clean_text`` and then squeeze any run of spaces into one."""
    return re.sub(' +', ' ', clean_text(txt))

def text_cleaning(text):
    '''
    Converts all text to lower case and replaces every run of characters that
    are not ASCII letters or digits (punctuation, whitespace, emojis, any other
    non-ASCII symbol) with a single space, then strips the ends.
    text - Sentence that needs to be cleaned (converted with str() first)
    Returns the cleaned string.
    '''
    # A single substitution is sufficient: it already collapses consecutive
    # separators into one space and removes every non-ASCII character, so a
    # separate space-squeezing pass or emoji-stripping pass could never match
    # anything and would only re-scan the (potentially very long) text.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
def read_json_pub(filename, train_data_path=paper_train_folder, output='text'):
    """
    Load one publication JSON file and flatten it to a single string.

    The file ``<train_data_path>/<filename>.json`` is expected to hold a list of
    section objects, each read through its ``section_title`` and ``text`` keys.

    filename        - file name without the ``.json`` extension
    train_data_path - folder containing the publication JSON files
    output          - 'text': section texts joined by a space;
                      'head': section titles joined by a space;
                      anything else: title, text, title, text, ... joined by '. '

    A section whose title or text is missing or null contributes an empty
    string instead of making the join fail.
    Raises whatever ``open`` / ``json.load`` raise for a missing or invalid file.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    def _field(section, key):
        # ``.get`` yields None for an absent key; str.join only accepts strings.
        value = section.get(key)
        return '' if value is None else str(value)

    # Build only the representation that was requested.
    if output == 'text':
        return ' '.join(_field(section, 'text') for section in sections)
    if output == 'head':
        return ' '.join(_field(section, 'section_title') for section in sections)

    combined = []
    for section in sections:
        combined.append(_field(section, 'section_title'))
        combined.append(_field(section, 'text'))
    return '. '.join(combined)

In [None]:
# Manually annotated dataset mentions, keyed by publication Id.
annotated_datasets = {
    '2100032a-7c33-4bff-97ef-690822c43466': [
        'Cardiovascular Health Study (CHS)',
    ],
    '2f392438-e215-4169-bebf-21ac4ff253e1': [
        "OECD's online Education Database",
    ],
    '3f316b38-1a24-45a9-8d8c-4e05a42257c6': [
        'North Carolina Emergency Management Spatial Data Download',
        'Sea, Lake, and Overland Surges from Hurricanes (SLOSH) basin models',
    ],
    '8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60': [
        '2010 Nielsen Homescan Survey',
    ],
}

# Flatten the per-publication lists into one list, in insertion order.
all_annotated_datasets = [
    title
    for titles in annotated_datasets.values()
    for title in titles
]

print(len(all_annotated_datasets))
all_annotated_datasets[:5]

In [None]:
# External dataset list; only its 'title' column is used by the following cell.
tmp2 = pd.read_csv('../input/coleridge-additional-gov-datasets-22000popular/data_set_800_with20000popular.csv')
# The competition's train.csv read again in full (independent of the `train`
# frame); its 'cleaned_label' and 'dataset_title' columns are used by the following cell.
tmp3 = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
# NOTE: tmp2 and tmp3 enter this cell as DataFrames and leave it as lists, so the
# loading cell has to be re-run before this one can be executed again.

# External titles: keep only those that are 3 to 7 words long.
tmp2 = [x for x in tmp2['title'].unique() if 2 < len(str(x).split()) < 8]

# Training labels: every cleaned label with at least one word, followed by
# every raw dataset title; then de-duplicate and normalise with clean_text.
tmp3_ = [x for x in tmp3['cleaned_label'].unique() if str(x).split()]
tmp3_.extend(tmp3['dataset_title'].unique())
tmp3 = list(map(clean_text, np.unique(tmp3_)))

In [None]:
# Second external dataset list: take its distinct titles and keep only those
# that are 3 to 7 words long.
bigger_list_titles = pd.read_csv('../input/bigger-govt-dataset-list/data_set_26897.csv')['title'].unique()
tmp4 = [title for title in bigger_list_titles if 2 < len(str(title).split()) < 8]

In [None]:
# Pool every candidate name (both external lists, the training labels and the
# manual annotations), de-duplicate, normalise with clean_text and de-duplicate
# again, since distinct raw names can share one cleaned form.
candidate_names = tmp2 + tmp3 + tmp4 + all_annotated_datasets
distinct_names = np.unique(candidate_names)
all_datasets = np.unique([clean_text(name) for name in distinct_names])

print(len(all_datasets))
all_datasets[:5]

In [None]:
def _collect_labels(clean_string, candidates):
    """
    Return the '|'-joined cleaned names from ``candidates`` that occur in ``clean_string``.

    clean_string - cleaned full text of one publication
    candidates   - iterable of dataset names, scanned in order

    A candidate counts as found when it is a plain substring of ``clean_string``.
    A found name is skipped when its cleaned form is already contained (as a
    substring) in the labels joined so far. Returns '' when nothing is found.
    """
    joined = ''
    for query_string in candidates:
        if query_string not in clean_string:
            continue
        label = clean_text(query_string)
        if joined == '':
            joined = label
        elif label not in joined:
            joined = joined + '|' + label
    return joined


# Label every training publication by literal matching against all_datasets.
# The labels are gathered in row order and written to the frame in a single
# assignment instead of locating each row with a boolean mask per paper.
ext_cleaned_labels = []
for paper_id in tqdm(train['Id'].values):
    clean_string = text_cleaning(str(read_json_pub(paper_id)))
    ext_cleaned_labels.append(_collect_labels(clean_string, all_datasets))
train['ext_cleaned_label'] = ext_cleaned_labels

In [None]:
# Write the labelled training frame to the working directory without the row index.
train.to_csv('./train_ext_data.csv', index=False)
# Read the file back and display its first rows as a check on what was written.
# NOTE(review): with pandas' default NA handling, empty 'ext_cleaned_label' cells
# are presumably read back as NaN rather than '' — confirm before consuming the file.
pd.read_csv('./train_ext_data.csv').head()