In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Competition Objective**
This competition (The Coleridge Competition) asks us to predict which datasets are mentioned in scientific articles. The goal of these predictions is to connect the use of datasets to data-driven policymaking.

Submissions for this competition need to include the id of the article and a string justifying the prediction (cleaned using the provided clean_text() function). Our model needs to predict this string.

In [None]:
# Imports
import json
import os
import glob

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

import re
import string

from wordcloud import WordCloud, STOPWORDS

In [None]:
# Take a look at the training data
# One publication's JSON file, loaded as an example (train_0 is not referenced again in the visible cells)
train_0 = pd.read_json('/kaggle/input/coleridgeinitiative-show-us-the-data/train/f8b03c87-9d1a-4f20-b76b-cb6c69d447b2.json')
# Label table; later cells read its Id, pub_title, dataset_title and cleaned_label columns
train_csv = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
# Preview the label table loaded from train.csv
train_csv

In [None]:
# Publication title stored under index label 0 of the label table
train_csv['pub_title'].loc[0]

# **Notes about the Training data**
**train.csv** links each publication to the datasets it references. The publication texts are in the individual .json files. As for the test data, we have 4 articles to parse and link to datasets. As I currently understand it, we have to find strings within the articles' text that are likely to be references to datasets.

# **Exploratory Data Analysis (EDA)**

In [None]:
# Report the number of distinct publication ids and distinct cleaned dataset labels
article_count = len(train_csv.Id.unique())
dataset_count = len(train_csv.cleaned_label.unique())
print(f'There are {article_count} different articles and {dataset_count} different datasets.')

Distribution of publications and datasets

In [None]:
# Number of articles (bar height) for each count of linked datasets (x axis).
# Explicit labels so the figure can be read on its own; wording matches the
# histogram cell further down.
ax = sns.countplot(x=train_csv.Id.value_counts())
ax.set_xlabel("# of linked datasets by article")
ax.set_ylabel("# of articles")
plt.show()

In [None]:
# Side-by-side histograms: rows per article id (left) and rows per cleaned
# dataset label (right). A single plt.subplots call creates the figure and
# both axes together.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(13, 6))
fig.suptitle('Distribution of articles and datasets', fontsize=20)

ax0.hist(train_csv.Id.value_counts())
ax0.set_xlabel("# of linked datasets by article")
ax0.set_ylabel("# of articles")

ax1.hist(train_csv.cleaned_label.value_counts())
ax1.set_xlabel("# of occurrences of datasets in articles")  # spelling fixed
ax1.set_ylabel("# of datasets")
plt.show()

In [None]:
# Kernel density estimate of the number of label rows per article id.
# Labelled explicitly because this plot is compared with the next one.
ax = sns.kdeplot(x=train_csv.Id.value_counts())
ax.set_xlabel("# of linked datasets by article")
ax.set_ylabel("Density")
plt.show()

In [None]:
# Kernel density estimate of the number of rows per cleaned dataset label.
# Labelled explicitly because this plot is compared with the previous one.
ax = sns.kdeplot(x=train_csv.cleaned_label.value_counts())
ax.set_xlabel("# of occurrences of datasets in articles")
ax.set_ylabel("Density")
plt.show()

**Conclusion:** The distribution of linked datasets per article is much less smooth than the distribution of articles per dataset.

Let's look at the format of the articles

In [None]:
# Load a single training publication's JSON file into a DataFrame for inspection
article0 = pd.read_json('/kaggle/input/coleridgeinitiative-show-us-the-data/train/d0fa7568-7d8e-4db9-870f-f9c6f668c17b.json')

In [None]:
# Show the parsed publication as a table (its section_title and text columns are used below)
article0

We can see above that this first article has 18 "section titles". The text of each section is listed above, to the left of its section title.

In [None]:
# Display the text stored in the first row of this article
# (presumably one row per section, so this is the first section's text rather than the whole article)
article0.text[0]

In [None]:
# Print the section titles one per line.
# Each title is printed directly: joining a string's characters with '' only
# rebuilds the same string, and would raise on a non-string value.
for section_heading in article0.section_title:
    print(section_heading)

Let's visualize the training data

In [None]:
# Re-read the label table via a relative path
# NOTE(review): presumably the same train.csv loaded earlier from /kaggle/input, assuming the working directory is /kaggle/working - confirm
train_csv = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
# Use a dedicated name for the word-cloud stop words: rebinding `stopwords`
# here would shadow the `nltk.corpus.stopwords` object imported in the
# imports cell.
wordcloud_stopwords = set(STOPWORDS)

wordcloud = WordCloud(background_color='white',
                      stopwords=wordcloud_stopwords,
                      max_words=200,
                      max_font_size=30,
                      scale=3,
                      random_state=1)

# Feed the titles as plain space-separated text. str() of the array would hand
# the array's printed representation (brackets, quote characters and, for
# large arrays, an elided middle) to the word-cloud tokenizer.
titles_text = " ".join(map(str, train_csv['dataset_title'].unique()))
cloud = wordcloud.generate(titles_text)

fig = plt.figure(figsize=(15, 15))
plt.axis('off')
plt.imshow(cloud)
plt.show()

The above wordcloud shows up to 200 of the most frequent words in the unique dataset titles after stopwords (very common words such as "the" or "of") have been removed. We can see a wide variety of words, both in frequency and in topic. This is what we might expect to find.

# Data Retrieval and Cleaning

In [None]:
# Imports, which repeats some of the previous imports, in case we remove that section
import json
import os
import re

import numpy
import pandas
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Retrieve the training data and sample submission
directory = r"../input/coleridgeinitiative-show-us-the-data/"

# os.path.join avoids the doubled separator that `directory + "/train.csv"`
# produced (directory already ends with "/") and matches how retrieve_text
# builds its paths.
train_csv = pandas.read_csv(os.path.join(directory, "train.csv"))
sample_submission = pandas.read_csv(os.path.join(directory, "sample_submission.csv"))

In [None]:
def retrieve_text(filename, type):
    """Load one publication's JSON file and return its cleaned text.

    Parameters
    ----------
    filename : str
        Publication id; the file read is ``<directory>/<type>/<filename>.json``,
        where ``directory`` is the module-level data path.
    type : str
        Sub-folder of ``directory`` to read from (the notebook passes
        ``'train'`` or ``'test'``). The name shadows the ``type`` builtin
        but is kept so existing callers keep working.

    Returns
    -------
    str
        Every record's ``section_title`` and ``text``, interleaved in file
        order, joined with spaces and passed through ``data_cleaning``.
    """
    json_path = os.path.join(directory, type, filename + ".json")

    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(json_path, mode='r', encoding='utf-8') as json_file:
        sections = json.load(json_file)

    contents = []
    for section in sections:
        # A missing or null field contributes an empty string instead of
        # making the join below fail on None.
        contents.append(section.get('section_title') or "")
        contents.append(section.get('text') or "")

    return data_cleaning(" ".join(contents))

In [None]:
def data_cleaning(text):
    """Reduce text to lower-case ASCII letters and digits separated by single spaces.

    Every maximal run of characters outside ``[A-Za-z0-9]`` (punctuation,
    whitespace, emoji, accented letters, ...) is replaced by one space, then
    the result is lower-cased. The text is not stripped, so a leading or
    trailing space remains when the input starts or ends with such a run.

    The previous version also collapsed repeated spaces and removed emoji
    afterwards; neither step could ever match, because the first substitution
    already leaves only alphanumerics and isolated single spaces.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised, lower-cased text.
    """
    return re.sub('[^A-Za-z0-9]+', " ", text).lower()

In [None]:
def load_json():
    """Add a 'json-content' column of cleaned article text to both frames.

    Works on the module-level ``train_csv`` and ``test_set`` frames in place:
    each id is passed to ``retrieve_text`` together with the sub-folder name
    ('train' or 'test'). The test ids are taken from ``sample_submission``.
    Returns nothing.
    """
    train_ids = train_csv['Id']
    test_ids = sample_submission['Id']

    train_csv['json-content'] = train_ids.apply(retrieve_text, args=('train',))
    test_set['json-content'] = test_ids.apply(retrieve_text, args=('test',))

In [None]:
# Build the test frame from the sample-submission ids, then attach the article texts
test_set = pandas.DataFrame({'Id': sample_submission['Id']})
load_json()

Preprocess the data for NLP

In [None]:
# Define the preprocessing function
def preprocess_data(dataframe):
    """Build acronym and title lookup tables from the unique dataset titles.

    The result containers are created inside the function, so the output
    depends only on ``dataframe`` and on the module-level ``stop_words``
    list; repeated calls do not accumulate entries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``dataset_title`` column.

    Returns
    -------
    tuple
        ``(acronyms_dict, titles_dict, titles_prior1, titles_prior2)``:

        * ``acronyms_dict`` maps a lower-case acronym to title text. For a
          title containing "(", the key is each space-separated word that
          contains "(" with its first and last character removed, and the
          value is the lower-cased title with parentheses dropped and
          non-letters turned into spaces. Otherwise the key is the initials
          of the title's non-stop-words and the value is the lower-cased title.
        * ``titles_dict`` maps a cleaned title without its parenthesised
          words to the cleaned full title.
        * ``titles_prior1`` is the set of cleaned full titles.
        * ``titles_prior2`` is the set of cleaned titles without their
          parenthesised words, limited to those not already in
          ``titles_prior1`` when they were processed.
    """
    acronyms_dict = {}
    titles_dict = {}
    titles_prior1 = set()
    titles_prior2 = set()

    for dataset_title in dataframe['dataset_title'].unique():
        try:
            raw_title = str(dataset_title)

            if '(' in raw_title:
                expansion = raw_title.replace("(", "").replace(")", "").lower()
                expansion = re.sub('[^A-Za-z]+', " ", expansion)

                for word in raw_title.split(" "):
                    if '(' in word:
                        # Assumes the word has the form "(ACRONYM)": the
                        # first and last characters are dropped.
                        acronyms_dict[word[1:-1].lower()] = expansion
            else:
                # Built once per title instead of once per word.
                stop_word_set = set(stop_words)
                letters_only = re.sub('[^A-Za-z]+', " ", raw_title).lower()
                initials = [token[0] for token in letters_only.split()
                            if token not in stop_word_set]
                acronyms_dict["".join(initials)] = raw_title.lower()

            full_title = re.sub('[^A-Za-z0-9]+', " ", raw_title).lower().strip()

            # Join the remaining words directly. Going through str(list)
            # would let the escaped form of non-printable characters leak
            # letters and digits into the cleaned title.
            words_without_parens = [word for word in raw_title.lower().split(" ")
                                    if '(' not in word]
            bare_title = re.sub('[^A-Za-z0-9]+', " ",
                                " ".join(words_without_parens)).lower().strip()

            titles_prior1.add(full_title)

            if bare_title not in titles_prior1:
                titles_prior2.add(bare_title)
                titles_dict[bare_title] = full_title

        except Exception as error:
            # Best effort: report the title and the cause, then move on.
            # Unlike a bare except, KeyboardInterrupt and SystemExit still
            # propagate.
            print("exception occurred for title: ", dataset_title, "-", repr(error))

    return acronyms_dict, titles_dict, titles_prior1, titles_prior2

In [None]:
# English stop words from NLTK; preprocess_data reads this list when building acronyms from initials
stop_words = stopwords.words('english')
# Placeholder; rebound to acronyms_dict.keys() in the prediction cell
acronyms = set()
# Lookup containers; each is rebound below to the value preprocess_data returns
titles_prior1 = set()
titles_prior2 = set()
acronyms_dict = {}
titles_dict = {}
acronyms_dict, titles_dict, titles_prior1, titles_prior2 = preprocess_data(train_csv)

In [None]:
# Display the acronym -> title-text lookup returned by preprocess_data
acronyms_dict

In [None]:
# Longest titles first, so a longer title is tried before any shorter title
# it contains. Ties are broken alphabetically: the inputs are sets of strings,
# whose iteration order can differ between kernel sessions, and a length-only
# key would carry that order into the prediction strings.
titles_prior1 = sorted(titles_prior1, key=lambda title: (-len(title), title))
titles_prior2 = sorted(titles_prior2, key=lambda title: (-len(title), title))
unique_cleaned_matches = train_csv['cleaned_label'].unique()

In [None]:
# Inspect the cleaned full titles, sorted longest-first by the previous cell
print(titles_prior1)

# Predict the results, and save them to a submission file

In [None]:
acronyms = acronyms_dict.keys()
match_out = []

for json_data in test_set['json-content']:
    article_text = str(json_data)
    article_tokens = set(json_data.split())
    match = ''

    # Phrase candidates in priority order: known cleaned labels, then the
    # cleaned full titles, then the titles without parenthesised words.
    phrase_candidates = list(unique_cleaned_matches)
    phrase_candidates += [str(title).lower() for title in titles_prior1]
    phrase_candidates += [str(title).lower() for title in titles_prior2]

    for phrase in phrase_candidates:
        # Keep a phrase when it occurs in the article and is not already a
        # substring of what has been collected so far.
        if phrase in article_text and phrase not in match:
            match = phrase if not match else match + '|' + phrase

    # Acronyms must be longer than three characters and appear as a whole
    # whitespace-separated token of the article.
    for acronym in acronyms:
        if len(acronym) > 3 and acronym in article_tokens and acronym not in match:
            match = acronym if not match else match + '|' + acronym

    match_out.append(match)

In [None]:
# Inspect the '|'-separated prediction string built for each test article
print(match_out)

In [None]:
# Test set results
# Assemble both submission columns in one step and write them without the index column
result = pandas.DataFrame({
    'Id': test_set['Id'],
    'PredictionString': match_out,
})
result.to_csv('submission.csv', index=False)

In [None]:
# Training set results, not required for this competition
# result = pandas.DataFrame()
# result['Id'] = train_csv['Id']
# result['title'] = train_csv['dataset_title']
# result['clean'] = train_csv['cleaned_label']
# result['PredictionString'] = match_out
# result.to_csv('submission.csv', index=False)