In [None]:
import os
import json
import re
import nltk

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS
from spacy.lang.en import English
from collections import defaultdict

EDA on the datalabels

https://www.kaggle.com/narendra/datalabel-eda

In [None]:
# Let pandas print up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)

In [None]:
# Shared spaCy English pipeline used by the tokenising helpers below.
nlp = English()

# One WordCloud renderer reused for every cloud in this notebook.
# (max_font_size / max_words are intentionally left at their defaults.)
wordcloud_options = {
    'stopwords': STOPWORDS,
    'width': 600,
    'height': 300,
    'background_color': 'white',
}
wordcloud = WordCloud(**wordcloud_options)

In [None]:
%%time
# Location of the per-publication CSV inside the Kaggle input directory.
PUBLICATION_CSV = '../input/publication-datasets/publication_dataset.csv'
pub_df = pd.read_csv(PUBLICATION_CSV)
pub_df.head()

Positive Samples-sentences on which the dataset labels match.

Negative Samples-sentences on which the dataset labels did not match.

# Let's see how many positive samples there are in the publications

In [None]:
# Summary statistics of the positive-sentence count column.
pub_df['num_positive_samples'].describe()

In [None]:
# The 0.9 quantile is an upper bound for 90% of the rows, so the message says
# "at most" rather than implying every publication has exactly this count.
positive_q90 = pub_df.num_positive_samples.quantile(q=0.9)
print("90% of the publications had at most {} sentences which mention the datalabels".format(positive_q90))


In [None]:
# Count plot: one bar per distinct num_positive_samples value, bar height = number of rows.
plt.figure(figsize=(12, 5))
# The tick rotation is requested before seaborn draws; keep this ordering as-is.
plt.xticks(rotation=45)
sns.countplot(data=pub_df, x='num_positive_samples')
plt.show()

# Let us see the ratio of positive to negative samples

In [None]:
# Positive-to-negative sentence ratio per row. Rows with zero negatives do not
# raise here; they are inspected separately below.
pub_df['pos_neg_ratio'] = pub_df['num_positive_samples'] / pub_df['num_negative_samples']
pub_df.head()

In [None]:
# Rows that have no negative sentences at all.
pub_df.loc[pub_df['num_negative_samples'] == 0]

one publication had all the sentences as positive samples

In [None]:
# Describe the ratio only where the denominator is non-zero.
has_negatives = pub_df['num_negative_samples'] > 0
pub_df.loc[has_negatives, 'pos_neg_ratio'].describe()

In most cases number of positive sentences is <1% compared to the negative sentences.

In [None]:
# Preview the first five rows again (now including pos_neg_ratio).
pub_df.iloc[:5]

Now that we have the positive samples, let us see the words that appear in the context of the positive samples.

In [None]:
# Flatten the 'pos_sents' list of every row into one list of sentences.
# NOTE(review): eval() executes whatever is stored in the CSV cell; only run
# this on a trusted file (consider ast.literal_eval if cells are pure literals).
pos_sentences = []
for raw_cell in pub_df.sentences.values:
    parsed_cell = eval(raw_cell)
    pos_sentences.extend(parsed_cell['pos_sents'])

print(len(pos_sentences))

In [None]:
%%time
# Build the word-cloud input with one join instead of growing a string inside
# the loop (repeated += can copy the whole text on every iteration).
# Each lower-cased sentence is followed by a single space, including the last.
all_sentence_text = ''.join(sent.lower() + " " for sent in pos_sentences)

In [None]:
# Render the word cloud of all positive sentences on the current axes.
wordcloud_image = wordcloud.generate(all_sentence_text)
sentence_axes = plt.gca()
sentence_axes.imshow(wordcloud_image)
plt.show()

In [None]:
def get_sent_tokens(sent):
    """Return the lower-cased content tokens of one sentence.

    The sentence is tokenised with the module-level ``nlp`` pipeline. Tokens
    flagged as stop words, punctuation or digits are dropped, and so is any
    token whose stripped lower-case text is two characters or shorter.
    """
    kept = []
    for tok in nlp(sent):
        is_noise = tok.is_stop or tok.is_punct or tok.is_digit
        if is_noise:
            continue
        text = tok.lower_.strip()
        if len(text) > 2:
            kept.append(text)
    return kept

In [None]:
# One row per positive sentence, plus its filtered token list.
pos_sent_df = pd.DataFrame({'sentence': pos_sentences})
pos_sent_df['tokens'] = pos_sent_df['sentence'].map(get_sent_tokens)

pos_sent_df.head()

# Unigrams


In [None]:
# Frequency of every token across all positive sentences.
unigrams = defaultdict(int)
for token_list in pos_sent_df['tokens']:
    for tok in token_list:
        unigrams[tok] += 1

# Word/frequency table, most frequent first.
unigram_df = pd.DataFrame({
    'word': list(unigrams),
    'freq': list(unigrams.values())
}).sort_values('freq', ascending=False)
unigram_df.head()

In [None]:
# Vocabulary size overall and above two frequency cut-offs.
n_over_2 = len(unigram_df[unigram_df['freq'] > 2])
n_over_10 = len(unigram_df[unigram_df['freq'] > 10])
print("Number Of Unigrams:", len(unigrams))
print("Number Of Unigrams >2 freq:", n_over_2)
print("Number Of Unigrams >10 freq:", n_over_10)

In [None]:
# Frequency statistics for unigrams seen more than five times.
unigram_df.loc[unigram_df['freq'] > 5, 'freq'].describe()

In [None]:
# Histogram of frequencies for unigrams seen more than five times.
frequent_unigram_freqs = unigram_df.loc[unigram_df['freq'] > 5, 'freq']
plt.hist(frequent_unigram_freqs, bins=100)
plt.show()

# word freq > 10

In [None]:
# Keep only unigrams seen more than ten times (overwrites unigram_df).
frequent_mask = unigram_df['freq'] > 10
unigram_df = unigram_df.loc[frequent_mask].copy()
unigram_df.head()

In [None]:
# Twenty most frequent remaining unigrams.
unigram_df.iloc[:20]

In [None]:
# Twenty least frequent remaining unigrams.
unigram_df.iloc[-20:]

The above unigrams represent the topics discussed in the sentences,
but they seem noisy when the context is considered.

To get the context, let's mask the dataset labels in the sentences and then collect the context words within a window around them.

# Get only the context words around the datalabel (window size=4 on each side)

In [None]:
def get_context_tokens(sentence, direction, w=4):
    """Return up to ``w`` alphabetic tokens from one end of a text fragment.

    The fragment is stripped and tokenised with the module-level ``nlp``
    pipeline; only tokens whose ``is_alpha`` flag is set are kept.

    direction == -1 returns the last ``w`` kept tokens (text to the left of a
    label); any other value returns the first ``w`` (text to the right).
    """
    alpha_tokens = [tok.text for tok in nlp(sentence.strip()) if tok.is_alpha]
    return alpha_tokens[-w:] if direction == -1 else alpha_tokens[:w]


def get_context(row):
    """Collect the words surrounding every dataset-label mention in a row.

    Parameters
    ----------
    row : mapping
        Must provide 'dataset_label' (string form of an iterable of label
        strings) and 'sentences' (string form of a dict with a 'pos_sents'
        list of sentences).

    Returns
    -------
    dict
        {'left': [...], 'right': [...]}: for each label occurrence, the token
        list returned by get_context_tokens for the text before the match
        ('left') and after it ('right'). Both lists have the same length.
    """
    # NOTE(review): eval() executes whatever is stored in the cell; only use
    # on trusted data (consider ast.literal_eval if cells are pure literals).
    dataset_labels=eval(row['dataset_label'])
    sentences=eval(row['sentences'])['pos_sents']

    # Labels are literal text, not regular expressions: escape them so that
    # characters such as "(", ")", "." or "+" match themselves and cannot
    # raise re.error. Empty labels are skipped because an empty pattern
    # matches at every position of every sentence.
    label_patterns=[re.compile(re.escape(dl)) for dl in dataset_labels if dl]

    context={
        'left': [],
        'right': []
    }
    for sentence in sentences:
        for pattern in label_patterns:
            for match in pattern.finditer(sentence):
                left_sentence=sentence[:match.start()]
                right_sentence=sentence[match.end():]

                context['left'].append(get_context_tokens(left_sentence, -1))
                context['right'].append(get_context_tokens(right_sentence, 1))
    return context

In [None]:
# Per-row context dictionaries, then split into left/right columns.
pub_df['context'] = pub_df.apply(get_context, axis=1)
context_df = pd.DataFrame({
    'left_context': pub_df['context'].map(lambda ctx: ctx['left']),
    'right_context': pub_df['context'].map(lambda ctx: ctx['right']),
})

pub_df.head()

In [None]:

# First ten rows of the left/right context lists.
context_df.iloc[:10]

In [None]:
def _tally_context_words(context_column):
    """Count lower-cased words in one context column and build its text.

    ``context_column`` is an iterable whose items are lists of token lists.
    Returns ``(counts, text)`` where ``counts`` is a defaultdict(int) keyed by
    word in first-seen order and ``text`` is every word followed by one space.
    """
    counts = defaultdict(int)
    lowered_words = []
    for contexts in context_column:
        for words in contexts:
            for word in words:
                word = word.lower()
                counts[word] += 1
                lowered_words.append(word)
    # Single join instead of repeated string += inside the loops.
    text = ''.join(word + " " for word in lowered_words)
    return counts, text


def _word_freq_frame(counts):
    """Return a word/freq DataFrame sorted by descending frequency."""
    frame = pd.DataFrame.from_dict({
        'word': list(counts.keys()),
        'freq': list(counts.values())
    })
    return frame.sort_values('freq', ascending=False)


left_context_words, left_context_text = _tally_context_words(context_df.left_context.values)
right_context_words, right_context_text = _tally_context_words(context_df.right_context.values)

# Combined view: left-side words first, then right-side words, so key order
# and text order match processing the left column before the right one.
context_words = defaultdict(int)
for side_counts in (left_context_words, right_context_words):
    for word, freq in side_counts.items():
        context_words[word] += freq
all_context_text = left_context_text + right_context_text

print("Number Of Context Words", len(context_words))
print('Number Of Left Context Words:', len(left_context_words))
print('Number Of Right Context Words:', len(right_context_words))

context_word_df = _word_freq_frame(context_words)
left_context_word_df = _word_freq_frame(left_context_words)
right_context_word_df = _word_freq_frame(right_context_words)
context_word_df.head(20)

It looks like the top unigrams are mostly prepositions, plus words from the related areas of interest.

Let's remove the most frequent and the least frequent unigrams, keeping only those with a frequency
>10 and <800

In [None]:
# Keep words whose frequency lies strictly between 10 and 800.
in_freq_band = (context_word_df['freq'] > 10) & (context_word_df['freq'] < 800)
context_word_df = context_word_df.loc[in_freq_band].copy()
context_word_df['freq'].describe()

In [None]:
# Twenty most frequent context words after filtering.
context_word_df.iloc[:20]

In [None]:
# Word cloud over every left and right context word.
all_context_wc = wordcloud.generate(all_context_text)

plt.figure(figsize=(15, 4))
context_axes = plt.gca()
context_axes.set_title('All Context')
context_axes.imshow(all_context_wc)
plt.show()


In the above word cloud we can observe that most words are relevant to the search for datasets, but the cloud is dominated by some of the words that appear most often in the publications.

Dominated Words from publications:
1. alzheimer
2. education
3. neuroimaging

Context words
1. dataset
2. sample
3. et al
4. cohort
5. database
6. obtained, study, using etc.

In [None]:
def get_context_bigrams(context):
    """Return lower-cased bigrams from a list of context token lists.

    Parameters
    ----------
    context : list of list of str
        One token list per label occurrence.

    Returns
    -------
    list of str
        Every pair of adjacent tokens within each token list, joined by a
        single space and lower-cased, in order of appearance. Token lists
        with fewer than two tokens contribute nothing.
    """
    ctx_bigrams = []
    for ctx_list in context:
        # zip pairs each token with its successor and yields nothing for
        # lists shorter than two, so no explicit length guard is needed.
        for pair in zip(ctx_list, ctx_list[1:]):
            ctx_bigrams.append(' '.join(pair).lower())
    return ctx_bigrams

In [None]:
# Derive a bigram column from each context column.
for bigram_col, context_col in (('left_bigrams', 'left_context'),
                                ('right_bigrams', 'right_context')):
    context_df[bigram_col] = context_df[context_col].apply(get_context_bigrams)
context_df.head()


In [None]:
# Frequency of every bigram, left-side columns first and then right-side.
context_bigrams = defaultdict(int)
for bigram_col in ('left_bigrams', 'right_bigrams'):
    for row_bigrams in context_df[bigram_col].values:
        for bigram in row_bigrams:
            context_bigrams[bigram] += 1

# Bigram/frequency table, most frequent first.
context_bigrams_df = pd.DataFrame({
    'bigram': list(context_bigrams),
    'freq': list(context_bigrams.values())
}).sort_values('freq', ascending=False)

print('Number Of Context Bigrams:', len(context_bigrams))
context_bigrams_df.head(30)

In [None]:
# Most frequent bigrams containing the substring 'sample'.
contains_sample = context_bigrams_df['bigram'].str.contains('sample', regex=False)
context_bigrams_df[contains_sample].head()

In [None]:
# Most frequent bigrams containing the substring 'taken'.
contains_taken = context_bigrams_df['bigram'].str.contains('taken', regex=False)
context_bigrams_df[contains_taken].head()

taking a look at some common bigrams

1. taken from (may be from left)
2. were taken (may be from right)
3. sampled from 
4. from the
5. on the.
6. et al, etc...

We can see from the above that we need to focus on the terms/phrases that are specific to datasets, like
extraction, sampling etc.

Whereas some of the phrases are highly specific to the content of the publication.

By combining the context-level features and the lexical-level features (word shapes, capital letters etc.) we can get the candidate phrases of the dataset.