In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Description
The objective of the competition is to identify the mention of datasets within scientific publications. Your predictions will be short excerpts from the publications that appear to note a dataset. Predictions that more accurately match the precise words used to identify the dataset within the publication will score higher. Predictions should be cleaned using the clean_text function from the Evaluation page to ensure proper matching.

Publications are provided in JSON format, broken up into sections with section titles.

The goal in this competition is not just to match known dataset strings but to generalize to datasets that have never been seen before using NLP and statistical techniques. A percentage of the public test set publications are drawn from the training set - not all datasets have been identified in train, so these unidentified datasets have been used as a portion of the public test labels. These should serve as guides for the difficult task of labeling the private test set.

Note that the hidden test set has roughly 8,000 publications, many times the size of the public test set. Plan your compute time accordingly.

# Files
- train - the full text of the training set's publications in JSON format, broken into sections with section titles
- test - the full text of the test set's publications in JSON format, broken into sections with section titles
- train.csv - labels and metadata for the training set
- sample_submission.csv - a sample submission file in the correct format

# Columns
- Id - publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets
- pub_title - title of the publication (a small number of publications have the same title)
- dataset_title - the title of the dataset that is mentioned within the publication
- dataset_label - a portion of the text that indicates the dataset
- cleaned_label - the dataset_label, as passed through the clean_text function from the Evaluation page

In [None]:
import pandas as pd

# Labels and metadata for the training publications.
train_df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')

# Fixed seed so the preview shows the same rows on every Restart & Run All.
train_df.sample(5, random_state=42)

In [None]:
import json
# Load the parsed sections of one example publication (second row of train_df).
example_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/train/{}.json'.format(train_df['Id'].iloc[1])
# Context manager closes the handle as soon as parsing finishes.
with open(example_path, encoding='utf-8') as file:
    text_json = json.load(file)

In [None]:
# text_json  # [{'section_title':'', 'text':''}]
# len(text_json)
# text_json[8].get('section_title')

In [None]:
# file = open('coleridgeinitiative-show-us-the-data/test/2100032a-7c33-4bff-97ef-690822c43466.json')
# text_json = json.load(file)
# text_json

In [None]:
def get_all_text(json_file, data_dir='/kaggle/input/coleridgeinitiative-show-us-the-data/train'):
    """Return the body text of one publication as a single string.

    Opens ``<data_dir>/<json_file>.json``, which is expected to hold a JSON
    list of section dicts (``[{'section_title': ..., 'text': ...}, ...]``),
    and joins the ``text`` of every section with single spaces.

    Parameters
    ----------
    json_file : str
        Publication id, i.e. the file name without the ``.json`` extension.
    data_dir : str, optional
        Directory containing the publication files. Defaults to the
        competition's training directory.

    Returns
    -------
    str
        Section texts in file order, space-separated; ``''`` when the file
        contains no sections.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    path = os.path.join(data_dir, '{}.json'.format(json_file))
    # Context manager guarantees the handle is closed even if parsing fails;
    # this function is applied once per row, so leaked handles add up quickly.
    with open(path, encoding='utf-8') as file:
        sections = json.load(file)

    # A missing or null 'text' becomes '' so str.join never receives None.
    return ' '.join(section.get('text') or '' for section in sections)

def get_all_section_titles(json_file, data_dir='/kaggle/input/coleridgeinitiative-show-us-the-data/train'):
    """Return every section title of one publication as a single string.

    Opens ``<data_dir>/<json_file>.json``, which is expected to hold a JSON
    list of section dicts (``[{'section_title': ..., 'text': ...}, ...]``),
    and joins the ``section_title`` of every section with single spaces.

    Parameters
    ----------
    json_file : str
        Publication id, i.e. the file name without the ``.json`` extension.
    data_dir : str, optional
        Directory containing the publication files. Defaults to the
        competition's training directory.

    Returns
    -------
    str
        Section titles in file order, space-separated; ``''`` when the file
        contains no sections.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    path = os.path.join(data_dir, '{}.json'.format(json_file))
    # Context manager guarantees the handle is closed even if parsing fails;
    # this function is applied once per row, so leaked handles add up quickly.
    with open(path, encoding='utf-8') as file:
        sections = json.load(file)

    # A missing or null 'section_title' becomes '' so str.join never receives None.
    return ' '.join(section.get('section_title') or '' for section in sections)

In [None]:
# Per the column notes, some publications appear on several rows (one per
# mentioned dataset). Read each publication once per distinct Id and map the
# result back onto every row instead of re-parsing the same file repeatedly.
unique_ids = train_df['Id'].unique()
train_df['text'] = train_df['Id'].map({pub_id: get_all_text(pub_id) for pub_id in unique_ids})
train_df['section_titles'] = train_df['Id'].map({pub_id: get_all_section_titles(pub_id) for pub_id in unique_ids})

In [None]:
# Rebind instead of mutating in place, and tolerate already-removed columns,
# so re-running this cell does not raise KeyError.
train_df = train_df.drop(columns=['dataset_label', 'Id'], errors='ignore')

In [None]:
import re

def clean_text(txt):
    """Normalise a value to lowercase alphanumeric words separated by single spaces.

    The input is converted with ``str()`` and lower-cased, every run of
    characters outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by one space,
    and leading/trailing whitespace is stripped.
    """
    lowered = str(txt).lower()
    normalised = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return normalised.strip()

In [None]:
# Apply the same normalisation to every free-text column of the training frame.
for column_name in ('section_titles', 'text', 'dataset_title', 'pub_title'):
    train_df[column_name] = train_df[column_name].apply(clean_text)

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
train_df.sample(5, random_state=42)

In [None]:
from collections import Counter

# Count the distinct values in the dataset_title and cleaned_label columns.
title_counts = Counter(train_df['dataset_title'])
label_counts = Counter(train_df['cleaned_label'])
print('No of Data Titles:', len(title_counts))
print('No of Data Labels:', len(label_counts))

In [None]:
import os

TEST_DIR = '/kaggle/input/coleridgeinitiative-show-us-the-data/test/'

# One [Id, text, section_titles] record per test publication.
test_data = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of test_df reproducible across runs.
for file_name in sorted(os.listdir(TEST_DIR)):
    # Context manager closes each handle as soon as the file is parsed.
    with open(os.path.join(TEST_DIR, file_name), encoding='utf-8') as file:
        sections = json.load(file)

    # Join once per publication, after all sections are known. A publication
    # with no sections yields empty strings rather than reusing values left
    # over from the previous file. Missing/null fields become ''.
    text = ' '.join(section.get('text') or '' for section in sections)
    section_titles = ' '.join(section.get('section_title') or '' for section in sections)

    # splitext removes only the final extension, so ids containing dots survive.
    test_data.append([os.path.splitext(file_name)[0], text, section_titles])

test_df = pd.DataFrame(columns=['Id', 'text', 'section_titles'], data=test_data)

In [None]:
# Apply the same normalisation to both free-text columns of the test frame.
for column_name in ('text', 'section_titles'):
    test_df[column_name] = test_df[column_name].apply(clean_text)

In [None]:
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so re-running this cell does not raise KeyError.
test_df = test_df.drop(columns=['Id'], errors='ignore')

In [None]:
# Preview the first rows only; each row holds a full publication's text.
test_df.head()