# Coleridge Initiative - Show US the Data

* The objective of the competition is to identify the mention of datasets within scientific publications.
* Predictions that more accurately match the precise words used to identify the dataset within the publication will score higher.
* Predictions should be cleaned using the clean_text function from the Evaluation page to ensure proper matching.
* The goal in this competition is not just to match known dataset strings but to generalize to datasets that have never been seen before using NLP and statistical techniques.
* The hidden test set has roughly 8,000 publications, many times the size of the public test set.

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import glob
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import cv2
from wordcloud import WordCloud, STOPWORDS
#Text Processing
import re
import nltk
nltk.download('popular')

<h2>Files</h2>
<ul>
<li><strong>train</strong> - the full text of the training set's publications in JSON format, broken into sections with section titles</li>
<li><strong>test</strong> - the full text of the test set's publications in JSON format, broken into sections with section titles</li>
<li><strong>train.csv</strong> - labels and metadata for the training set</li>
<li><strong>sample_submission.csv</strong> - a sample submission file in the correct format</li>
</ul>

In [None]:
# Competition inputs: the sample submission, the training labels table and
# the entries found in the train/ and test/ publication directories.
submittion_csv = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
train_csv = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
# NOTE: despite the *_dir names, these are lists of paths (one per entry in
# the directory). glob returns matches in arbitrary order, so sort them to
# keep any index-based access reproducible between runs.
train_dir = sorted(glob.glob("../input/coleridgeinitiative-show-us-the-data/train/*"))
test_dir = sorted(glob.glob("../input/coleridgeinitiative-show-us-the-data/test/*"))

<h2>Columns</h2>
<ul>
<li><code>id</code> - publication <code>id</code> - note that there are multiple rows for some training documents, indicating multiple mentioned datasets</li>
<li><code>pub_title</code>&nbsp;- title of the publication (a small number of publications have the same title)</li>
<li><code>dataset_title</code> - the title of the dataset that is mentioned within the publication</li>
<li><code>dataset_label</code> - a portion of the text that indicates the dataset</li>
<li><code>cleaned_label</code> - the <code>dataset_label</code>, as passed through the <code>clean_text</code> function from the <a rel="nofollow" href="https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation">Evaluation page</a></li>
</ul>

In [None]:
# Preview the first rows of the training labels table.
train_csv.head()

In [None]:
# Per-column summary statistics of the training labels table.
train_csv.describe()

In [None]:
# Total rows: 19661
# Unique Ids: 14316
# Rows repeating an already-seen Id: 5345 (19661 - 14316)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
id_counts = train_csv["Id"].value_counts()
# value_counts() sorts by descending frequency, so the first 20 entries are
# the 20 most frequent Ids.
fig = px.bar(x=id_counts.values[:20], y=id_counts.index[:20])
x_axis = dict(tickmode="linear", tick0=0, dtick=2)
# Reversed so the first (most frequent) Id ends up at the top of the chart.
y_axis = dict(autorange="reversed")
fig.update_layout(
    title="Id count",
    xaxis_title="Count",
    yaxis_title="Id",
    xaxis=x_axis,
    yaxis=y_axis,
)
fig.show()

In [None]:
# Total rows: 19661
# Rows repeating an already-seen publication title: 5390
# Unique publication titles: 14271 (19661 - 5390) -- fewer than the 14316
# unique Ids because a few publications share a title; verify with
# train_csv["pub_title"].nunique().
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pub_counts = train_csv["pub_title"].value_counts()
# value_counts() sorts by descending frequency: keep the 20 most frequent.
top_pub_counts = pub_counts.head(20)
fig = px.bar(x=top_pub_counts.values, y=top_pub_counts.index)
x_axis = dict(tickmode="linear", tick0=0, dtick=2)
y_axis = dict(
    autorange="reversed",
    tickmode="array",
    # One tick position per plotted bar, matching ticktext entry for entry
    # (previously there was one position per unique title in the dataset).
    tickvals=list(range(len(top_pub_counts))),
    # Titles are long; show only their first 40 characters as tick labels.
    ticktext=[pub_title[:40] for pub_title in top_pub_counts.index],
)
fig.update_layout(
    title="Pub Title Count",
    xaxis_title="Count",
    yaxis_title="Publication title",
    xaxis=x_axis,
    yaxis=y_axis,
)
fig.show()

In [None]:
# Total rows: 19661
# Unique dataset titles: 45
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dataset_title_counts = train_csv["dataset_title"].value_counts()
# value_counts() sorts by descending frequency: keep the 20 most frequent.
top_dataset_titles = dataset_title_counts.head(20)
fig = px.bar(x=top_dataset_titles.values, y=top_dataset_titles.index)

y_axis = dict(
    autorange="reversed",
    tickmode="array",
    # Sized from the series plotted here (it used to be sized from the
    # unrelated pub_counts series), one position per bar.
    tickvals=list(range(len(top_dataset_titles))),
    # Show only the first 40 characters of each dataset title.
    ticktext=[dataset_title[:40] for dataset_title in top_dataset_titles.index],
)
fig.update_layout(
    title="Dataset Title Count",
    xaxis_title="Count",
    yaxis_title="Dataset title",
    yaxis=y_axis,
)
fig.show()

In [None]:
# Total rows: 19661
# Number of distinct cleaned labels: len(clean_counts)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
clean_counts = train_csv["cleaned_label"].value_counts()
# value_counts() sorts by descending frequency: keep the 20 most frequent.
top_clean_labels = clean_counts.head(20)
fig = px.bar(x=top_clean_labels.values, y=top_clean_labels.index)
y_axis = dict(
    autorange="reversed",
    tickmode="array",
    # One tick position per plotted bar, matching ticktext entry for entry.
    tickvals=list(range(len(top_clean_labels))),
    # Show only the first 40 characters of each label.
    ticktext=[label[:40] for label in top_clean_labels.index],
)
fig.update_layout(
    title="Cleaned Label Count",
    xaxis_title="Count",
    yaxis_title="Cleaned label",
    yaxis=y_axis,
)
fig.show()

In [None]:
def plot_wordcloud(column, title):
    """Draw a word cloud built from all the text values in ``column``.

    Args:
        column: Iterable of text values, e.g. a DataFrame column. Missing
            values are skipped and remaining values are converted with
            ``str`` before being joined into one document.
        title: Text appended to ``'Wordcloud: '`` to form the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # ' '.join() raises TypeError on NaN/None or other non-string values, so
    # drop missing entries and stringify the rest first.
    texts = pd.Series(column).dropna().astype(str)
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
        stopwords=stopwords,
    ).generate(' '.join(texts))
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout() so the layout pass can reserve room
    # for it; added afterwards, with pad=0 there is no margin left for it.
    plt.title('Wordcloud: ' + title, fontsize=20)
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Word cloud of the raw (unprocessed) publication titles.
plot_wordcloud(column = train_csv['pub_title'], title = 'Publication Title')

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words("english"))


@lru_cache(maxsize=1)
def _wordnet_lemmatizer():
    """Return a single shared WordNet lemmatizer instance."""
    return nltk.stem.wordnet.WordNetLemmatizer()


def preprocess_text(text):
    """Normalise ``text`` into lower-case, stop-word-free, lemmatized words.

    Steps: convert to ``str``, lower-case and strip; remove every character
    that is neither a word character nor whitespace; split on whitespace;
    drop English stop words; lemmatize each remaining word; re-join with
    single spaces.

    Args:
        text: Value to clean. Non-string values are converted with ``str``
            first (so a missing value becomes the literal word ``"nan"``).

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # The stop words and the lemmatizer are cached: this function is applied
    # once per row, and a set makes each membership test O(1).
    stopwords = _english_stopwords()
    lemmatizer = _wordnet_lemmatizer()

    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    words = [word for word in text.split() if word not in stopwords]
    return " ".join(lemmatizer.lemmatize(word) for word in words)

In [None]:
# Derive simple length features from the preprocessed publication titles.
train_csv["clean_pub_title"] = train_csv["pub_title"].apply(preprocess_text)
# Total number of characters, spaces included.
train_csv["clean_pub_title_len"] = train_csv["clean_pub_title"].str.len()
# Splitting on " " yields (number of spaces + 1) pieces, so an empty title
# still counts as one (empty) word.
train_csv["clean_pub_title_word_count"] = (
    train_csv["clean_pub_title"].str.count(" ") + 1
)
# Sum of the piece lengths == total length minus the separating spaces.
train_csv["clean_pub_title_char_count"] = (
    train_csv["clean_pub_title_len"] - train_csv["clean_pub_title"].str.count(" ")
)
train_csv["clean_pub_title_avg_word_length"] = train_csv[
    "clean_pub_title_char_count"
].div(train_csv["clean_pub_title_word_count"])

In [None]:
# Preview the table again, now including the clean_pub_title* columns.
train_csv.head()

In [None]:
# Word cloud of the preprocessed titles; titled differently from the
# raw-title cloud so the two figures can be told apart.
plot_wordcloud(column = train_csv['clean_pub_title'], title = 'Cleaned Publication Title')

In [None]:
# Word cloud of the dataset titles mentioned in the publications.
plot_wordcloud(column = train_csv['dataset_title'], title = 'Dataset Title')

In [None]:
# Word cloud of the cleaned dataset labels.
plot_wordcloud(column = train_csv['cleaned_label'], title = 'Cleaned Label')

In [None]:
def plot_distribution(x, title):
    """Show an interactive histogram of the values in ``x``.

    Args:
        x: Values to bin, e.g. a numeric DataFrame column.
        title: Figure title (previously accepted but never applied).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.histogram(x, title=title)
    fig.show()

In [None]:
# (feature column, plot title) pairs for each derived publication-title feature.
pub_title_list = [
    ("clean_pub_title_len", "Publication Title: Length Distribution"),
    ("clean_pub_title_word_count", "Publication Title: Word Count Distribution"),
    ("clean_pub_title_char_count", "Publication Title: Character Count Distribution"),
    ("clean_pub_title_avg_word_length", "Publication Title: Average Word Length Distribution"),
]
# Draw one histogram per feature.
for feature_column, plot_title in pub_title_list:
    plot_distribution(train_csv[feature_column], plot_title)

References: https://www.kaggle.com/ishandutta/coleridge-complete-eda-in-one-notebook