# About Coleridge Initiative

The Coleridge Initiative is a not-for-profit organization originally established at New York University. It was set up in order to inform the decision-making of the Commission on Evidence-based Policymaking and has since worked with dozens of government agencies at the federal, state, and local levels to ensure that data are more effectively used for public decision-making.

It achieves this goal by working with the agencies to create value for the taxpayer from the careful use of data by building new technologies to enable secure access to and sharing of confidential microdata and by training agency staff to acquire modern data skills.

# Competition Challenge

The objective of the competition is to identify the mention of datasets within scientific publications. Your predictions will be short excerpts from the publications that appear to note a dataset. Predictions that more accurately match the precise words used to identify the dataset within the publication will score higher.

# Evaluation Metric

Submissions are evaluated on a Jaccard-based FBeta score between predicted texts and ground truth texts, with Beta = 0 (an F0 or precision score). Multiple predictions are delineated with a pipe (|) character in the submission file.

# Code Requirements

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 9 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named submission.csv

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tqdm
from tqdm.auto import tqdm as tqdmp
tqdmp.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from plotly.offline import iplot
#to link plotly to pandas
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

import plotly.express as px #Plotly express

# Global defaults for the static matplotlib/seaborn figures drawn in this notebook.
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-whitegrid') — confirm this name exists in the installed version.
plt.style.use('seaborn-whitegrid')
sns.set_palette('Set3')

import json
import collections

import itertools
import collections
from collections import Counter

from nltk.corpus import stopwords

import re
from wordcloud import WordCloud

import gc

import os
print(os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/'))

import warnings
warnings.simplefilter('ignore')

from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

In [None]:
# Root folder of the competition data; every path below is built by appending to it,
# so the trailing slash is required.
base_dir = '../input/coleridgeinitiative-show-us-the-data/'

In [None]:
# Read the training labels, report the frame's dimensions and preview the first rows.
train_csv_path = base_dir + 'train.csv'
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

In [None]:
# Read the sample submission (one row per test publication) and display it in full.
sample_submission_path = base_dir + 'sample_submission.csv'
sub = pd.read_csv(sample_submission_path)
print(sub.shape)
sub

In [None]:
# Count the publication JSON files available in each split.
# len() is the idiomatic spelling; calling __len__() directly bypasses the built-in.
n_train_files = len(os.listdir(base_dir + 'train/'))
n_test_files = len(os.listdir(base_dir + 'test/'))
print(f"Number of train json files: {n_train_files}")
print(f"Number of test json files: {n_test_files}")

# Merge json with DF

In [None]:
# List the training JSON file names and peek at the first five.
train_dir = base_dir + 'train/'
train_files = os.listdir(train_dir)
print(train_files[0:5])

Each JSON file is named after the corresponding 'Id' value in the train dataframe.

In [None]:
# Show the Id of the first training row; it doubles as the JSON file stem.
train['Id'].iloc[0]

In [None]:
# Load the JSON document belonging to the first training row and preview it.
first_id = train['Id'].values[0]
train_json = pd.read_json(base_dir + 'train/' + first_id + '.json')
print(train_json.shape)
train_json.head()

__The JSON file contains the publication text split up by section title, so we have to join these per-section texts into a single text.__

The competition organizers have already cleaned the labels with the code below and expect submitted labels to be cleaned the same way.

In [None]:
def clean_text(txt):
    """Normalise ``txt`` the way the competition labels were normalised.

    The value is converted with ``str``, lower-cased, every run of characters
    other than ASCII letters and digits is replaced by a single space, and
    surrounding whitespace is stripped.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
def extract_json(x, split = 'train'):
    """Return the cleaned, concatenated text of one publication JSON file.

    Parameters
    ----------
    x : str
        Publication Id; the file read is ``<base_dir><split>/<x>.json``.
    split : str, default 'train'
        Sub-folder of ``base_dir`` to read from. The default keeps the original
        behaviour; pass ``'test'`` to read test publications with the same code.

    Returns
    -------
    str
        Every cell of the loaded frame, cast to ``str`` and joined with single
        spaces row by row, then normalised with ``clean_text``.
    """
    df = pd.read_json(base_dir + split + '/' + x + '.json')
    # Join the cells of each row first, then join the rows into one document.
    text = ' '.join(' '.join(each) for each in df.astype(str).values)
    return clean_text(text)

In [None]:
# Attach the cleaned full text of every training publication (with a progress bar).
train['text'] = train['Id'].progress_apply(extract_json)
train.head()

In [None]:
# Persist the training frame, now including the joined publication text, without the index column.
train.to_csv('./train_publication.csv', index = False)

In [None]:
# Summary statistics of the training frame, transposed so each described column becomes a row.
train.describe().T

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

# Publication Title
__Let's explore the titles provided in the train set__

In [None]:
# Normalised publication title plus its length in characters and in words.
train['clean_pub_title'] = train['pub_title'].apply(clean_text)
train['pub_title_len'] = train['clean_pub_title'].apply(
    lambda title: len(str(title))
)
train['pub_title_word_len'] = train['clean_pub_title'].apply(
    lambda title: len(str(title).split())
)
train.head(2)

In [None]:
# Mean and interactive histogram of the title length in characters.
mean_title_len = np.mean(train['pub_title_len'])
print(f"Mean pub title length: {mean_title_len}")
train['pub_title_len'].iplot(
    kind = 'hist',
    bins = 100,
    xTitle = 'Pub Title Length',
    yTitle = 'Count',
    title = 'Pub Title Length Distribution',
)

In [None]:
# Mean and interactive histogram of the title length in words.
mean_title_words = np.mean(train['pub_title_word_len'])
print(f"Mean pub title word length: {mean_title_words}")
train['pub_title_word_len'].iplot(
    kind = 'hist',
    bins = 100,
    xTitle = 'Pub Title Word Length',
    yTitle = 'Count',
    title = 'Pub Title Word Length Distribution',
)

__Plot Pub Title Word Cloud__

In [None]:
def plot_wordcloud(data, col, text = None):
    """Draw a word cloud of the most frequent words in ``data[col]``.

    Each value of the column is split on whitespace; NLTK English stop words
    and one-character tokens are discarded before counting.

    Parameters
    ----------
    data : mapping of column name to iterable of str (e.g. a DataFrame)
        Source of the strings; every value must support ``.split()``.
    col : str
        Name of the column to visualise.
    text : str, optional
        Title shown above the figure.
    """
    # A set gives O(1) membership tests; scanning the stop-word list for every
    # token is needlessly slow on the full publication text.
    stop = set(stopwords.words('english'))
    word_freq = Counter(
        word
        for each in data[col]
        for word in each.split()
        if len(word) > 1 and word not in stop
    )

    wordcloud = WordCloud(width = 900,
                          height = 500,
                          max_words = 200,
                          max_font_size = 100,
                          relative_scaling = 0.5,
                          background_color = "rgba(255, 255, 255, 0)", 
                          mode = "RGBA",
                          normalize_plurals = True).generate_from_frequencies(word_freq)
    plt.figure(figsize = (16, 12))
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.title(text, fontsize = 16)
    plt.axis("off")
    plt.show()
    
    

In [None]:
# Word cloud over the cleaned publication titles.
plot_wordcloud(train, 'clean_pub_title', 'WordCloud of Train Pub_title')

In [None]:
def plot_most_frequent(df, top, col, title = None):
    """Show a bar chart of the ``top`` most frequent words in ``df[col]``.

    Each value of the column is split on whitespace; NLTK English stop words
    and one-character tokens are discarded before counting.

    Parameters
    ----------
    df : mapping of column name to iterable of str (e.g. a DataFrame)
        Source of the strings. The function now reads from this argument; it
        previously ignored it and always used the global ``train`` frame.
    top : int
        Number of most common words to plot.
    col : str
        Name of the column to analyse.
    title : str, optional
        Chart title.
    """
    # Set membership is O(1); the raw NLTK list would be scanned per token.
    stop = set(stopwords.words('english'))
    word_freq = Counter(
        word
        for each in df[col]
        for word in each.split()
        if len(word) > 1 and word not in stop
    )
    # Rank once and reuse the result for both chart axes.
    top_words = word_freq.most_common(top)
    freq = {
        'words': [w for w, _ in top_words],
        'counts': [c for _, c in top_words],
    }
    fig = px.bar(freq, 
                 x = 'words', 
                 y = 'counts', 
                 title = title
                )
    fig.show()

In [None]:
# Bar chart of the 20 most common words in the cleaned publication titles.
plot_most_frequent(train, 20, 'clean_pub_title', 'Top 20 Frequent words used in Pub_title')

- __It looks like most of the publications are related to the medical domain.__

#  Cleaned Label

In [None]:
# Work on a copy of the label column so the derived length features stay out of `train`.
temp = train[['cleaned_label']].copy()
n_unique_labels = train['cleaned_label'].nunique()
print(f"There are {n_unique_labels} unique cleaned labels provided in the train set")

In [None]:
# Character length of every cleaned label, its mean, and an interactive histogram.
temp['cleaned_label_len'] = temp['cleaned_label'].apply(
    lambda label: len(str(label))
)
mean_label_len = np.mean(temp['cleaned_label_len'])
print(f"Mean cleaned_label length: {mean_label_len}")
temp['cleaned_label_len'].iplot(
    kind = 'hist',
    xTitle = 'Cleaned Label Length',
    yTitle = 'Count',
    title = 'Cleaned Label Length Distribution',
)

In [None]:
# Word count of every cleaned label, its mean, and an interactive histogram.
temp['cleaned_label_wordlen'] = temp['cleaned_label'].apply(lambda x: len(str(x).split()))
print(f"Mean cleaned_label word length: {np.mean(temp['cleaned_label_wordlen'])}")
# The x-axis shows word counts, so it is labelled "Word Length"
# (it previously reused the character-length axis title).
temp['cleaned_label_wordlen'].iplot(kind = 'hist', 
                            xTitle = 'Cleaned Label Word Length',
                            yTitle = 'Count',
                            title = 'Cleaned Label Word Length Distribution'
                            )
# Drop the scratch frame and ask the garbage collector to reclaim it.
del temp
gc.collect()

In [None]:
# Word cloud over the cleaned dataset labels.
plot_wordcloud(train, 'cleaned_label', 'WordCloud of Train Label')

In [None]:
# Bar chart of the 20 most common words in the cleaned dataset labels.
plot_most_frequent(train, 20, 'cleaned_label', 'Top 20 Frequent words used in Cleaned Label')

In [None]:
# Number of training rows per distinct cleaned label.
train['cleaned_label'].value_counts()

# Dataset Title

In [None]:
# Work on a copy of the title column so the derived length features stay out of `train`.
temp = train[['dataset_title']].copy()
n_unique_titles = train['dataset_title'].nunique()
print(f"There are {n_unique_titles} unique dataset_title provided in the train set")

In [None]:
# Character length of every dataset title after clean_text normalisation
# (clean_text already coerces its argument with str()).
temp['dataset_title_len'] = temp['dataset_title'].apply(
    lambda raw_title: len(clean_text(raw_title))
)
mean_title_chars = np.mean(temp['dataset_title_len'])
print(f"Mean dataset_title length: {mean_title_chars}")
temp['dataset_title_len'].iplot(
    kind = 'hist',
    xTitle = 'Dataset Title Length',
    yTitle = 'Count',
    title = 'Dataset Title Length Distribution',
)

In [None]:
# Word count of every dataset title after clean_text normalisation.
temp['dataset_title_wordlen'] = temp['dataset_title'].apply(lambda x: len(clean_text(str(x)).split()))
# This statistic is a word count, so the message says "word length"
# (it previously repeated the character-length wording).
print(f"Mean dataset_title word length: {np.mean(temp['dataset_title_wordlen'])}")
temp['dataset_title_wordlen'].iplot(kind = 'hist', 
                            xTitle = 'Dataset Title Word Length',
                            yTitle = 'Count',
                            title = 'Dataset Title Word Length Distribution'
                            )

In [None]:
# Word cloud over the raw dataset titles.
# NOTE(review): these titles are not passed through clean_text here, so capitalised
# stop words presumably survive the stop-word filter — confirm.
plot_wordcloud(train, 'dataset_title', 'WordCloud of Train Dataset Title')

In [None]:
# Bar chart of the 20 most common words in the raw dataset titles.
plot_most_frequent(train, 20, 'dataset_title', 'Top 20 Frequent words used in Dataset Title')

In [None]:
# Number of training rows per distinct dataset title.
train['dataset_title'].value_counts()

__Dataset Titles associated with Dataset Labels__

In [None]:
# For every distinct dataset title, collect the distinct labels it appears under.
labels, n_labels = [], []
titles = []

for title in train['dataset_title'].unique():
    titles.append(title)
    label = train.loc[train['dataset_title'] == title, 'dataset_label'].unique()
    labels.append(label)
    # Count the labels of THIS title; len(labels) would merely count the
    # titles processed so far (1, 2, 3, ...).
    n_labels.append(len(label))
titles_labels = pd.DataFrame({'Dataset Title': titles, 'Dataset Label': labels, 'Num Labels': n_labels})
titles_labels

# Publication Text

In [None]:
# Length of every cleaned publication text in characters and in whitespace-separated words.
train['text_len'] = train['text'].progress_apply(len)
train['text_wordlen'] = train['text'].progress_apply(
    lambda doc: len(doc.split())
)
train[['Id', 'text_len', 'text_wordlen']].head()

In [None]:
# Mean and interactive histogram of the cleaned text length in characters.
mean_text_len = np.mean(train['text_len'])
print(f"Mean text length: {mean_text_len}")
train['text_len'].iplot(
    kind = 'hist',
    bins = 100,
    xTitle = 'Clean Text Length',
    yTitle = 'Count',
    title = 'Clean Text Length Distribution',
)

In [None]:
# Mean and interactive histogram of the cleaned text length in words.
# The message says "word length" so it can be told apart from the
# character-length mean printed for 'text_len'.
print(f"Mean text word length: {np.mean(train['text_wordlen'])}")
train['text_wordlen'].iplot(kind = 'hist', 
                            bins = 100,
                            xTitle = 'Clean Text Word Length',
                            yTitle = 'Count',
                            title = 'Clean Text Word Length Distribution'
                            )

In [None]:
# Word cloud over the cleaned full publication texts.
plot_wordcloud(train, 'text', 'WordCloud of Publication Text')

In [None]:
# Bar chart of the 50 most common words in the cleaned full publication texts.
plot_most_frequent(train, 50, 'text', 'Top 50 Frequent words used in Cleaned Publication Text')

In [None]:
def extract_test_json(x):
    """Return the cleaned, concatenated text of one test publication.

    Reads ``<base_dir>test/<x>.json``, casts every cell to ``str``, joins the
    cells of each row and then all rows with single spaces, and normalises
    the result with ``clean_text``.
    """
    json_path = base_dir + 'test/' + x + '.json'
    rows = pd.read_json(json_path).astype(str).values
    joined_rows = [' '.join(row) for row in rows]
    return clean_text(' '.join(joined_rows))

In [None]:
# Attach the cleaned full text of every test publication (with a progress bar).
sub['text'] = sub['Id'].progress_apply(extract_test_json)
sub.shape

In [None]:
# Preview the first two test rows, now including their text.
sub.head(2)

In [None]:
# Persist the test frame with the joined publication text, without the index column.
sub.to_csv('./test_publication.csv', index = False)

In [None]:
# Baseline prediction: report every dataset title/label known from the training
# set that occurs verbatim in a test publication.
labels = []

# Normalise the candidates with the same clean_text() that produced sub['text'].
# Lower-casing alone leaves punctuation in the candidate, which can never match
# the punctuation-free publication text. The set removes candidates that collapse
# to the same cleaned string (they would become duplicate predictions), the empty
# string is dropped because '' is a substring of everything, and sorting makes the
# prediction order reproducible across runs.
known_names = set(train['dataset_title'].unique()).union(train['dataset_label'].unique())
datasets_titles = sorted({clean_text(name) for name in known_names} - {''})

# Walk Id and text together instead of re-filtering the frame for every Id.
for index, pub_text in zip(sub['Id'], sub['text']):
    print(index)
    pub_text = pub_text.lower()
    label = [d_title for d_title in datasets_titles if d_title in pub_text]
    labels.append('|'.join(label))

sub['PredictionString'] = labels

In [None]:
# Store the predictions, write the two submission columns to disk and display them.
sub['PredictionString'] = labels
submission = sub[['Id', 'PredictionString']]
submission.to_csv('./submission.csv', index = False)

sub[['Id', 'PredictionString']]

# WIP....

In [None]:
# Report the wall-clock time since `start` was recorded, formatted as HH:MM:SS.
finish = time()
elapsed = gmtime(finish - start)
print(strftime("%H:%M:%S", elapsed))