In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import matplotlib.pyplot as plt


from collections import Counter, defaultdict
from spacy.lang.en import English

In [None]:
%%time
# Instantiate spaCy's English language class; `nlp` is the callable that
# get_tokens() later uses to split each dataset label into tokens.
nlp=English()
# Load the training table from a relative path
# (presumably the Kaggle input directory layout; adjust when running elsewhere).
train_df=pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
# Last expression of the cell: preview the first rows.
train_df.head()

In [None]:
# One row per distinct dataset label, with the number of `Id` rows that
# mention it stored under 'Number Of Publications'.
datalabel_df = (
    train_df
    .groupby('dataset_label')[['Id']]
    .count()
    .reset_index()
    .rename(columns={'Id': 'Number Of Publications'})
)

datalabel_df.head()

# Top 30 most frequently appearing dataset labels

In [None]:
# The theme must be set BEFORE the figure/axes are created: seaborn themes
# work through matplotlib rcParams, which are read when the axes are built.
# Setting it afterwards left this first figure in the default style.
sns.set_theme(style="dark")

# The 30 labels referenced by the largest number of publications.
top_30_labels = datalabel_df.sort_values('Number Of Publications', ascending=False).head(30)

fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(data=top_30_labels,
            x='dataset_label',
            y='Number Of Publications',
            ci=False,  # one row per label, so there is nothing to draw error bars from
            ax=ax
           )
# Labels are long strings; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Top 30 dataset labels by number of publications')
plt.show()

In [None]:
# Tabular view of the 30 most referenced dataset labels.
datalabel_df.sort_values(by='Number Of Publications', ascending=False).head(n=30)

# Bottom 30 least-referenced dataset labels

In [None]:
# The 30 labels referenced by the fewest publications (ascending sort).
bottom_30_labels = datalabel_df.sort_values(by='Number Of Publications').head(30)

plt.figure(figsize=(15, 5))
plt.xticks(rotation='vertical')
sns.set_theme(style="dark")
sns.barplot(
    x='dataset_label',
    y='Number Of Publications',
    data=bottom_30_labels,
    ci=False,
)
plt.show()

In [None]:
# Tabular view of the 30 least referenced dataset labels.
datalabel_df.sort_values(by='Number Of Publications').head(n=30)

Looking at most of the dataset labels, their topics appear to be:
1. Alzheimer's
2. Education
3. COVID
4. Weather

In [None]:
def get_tokens(dl):
    """Tokenise the label text `dl` with the module-level `nlp` pipeline.

    Returns a list holding every token of the resulting document, in order.
    """
    return list(nlp(dl))

def get_word_shape(tokens):
    """Return the `shape_` attribute of each token, in the same order."""
    return [tok.shape_ for tok in tokens]

def get_num_stopwords(tokens):
    """Count the tokens whose `is_stop` flag is truthy."""
    return sum(1 for tok in tokens if tok.is_stop)

def get_stop_words(tokens):
    """Return the tokens flagged as stop words (`is_stop`), preserving order."""
    return [tok for tok in tokens if tok.is_stop]

def get_num_punctuations(tokens):
    """Count the tokens whose `is_punct` flag is truthy."""
    return sum(1 for tok in tokens if tok.is_punct)

def get_punct(tokens):
    """Return the tokens flagged as punctuation (`is_punct`), preserving order."""
    return [tok for tok in tokens if tok.is_punct]

def is_title(tokens):
    """Return each token's `is_title` flag, one entry per token, in order."""
    flags = []
    for tok in tokens:
        flags.append(tok.is_title)
    return flags

def is_upper(tokens):
    """Return each token's `is_upper` flag, one entry per token, in order."""
    flags = []
    for tok in tokens:
        flags.append(tok.is_upper)
    return flags

In [None]:
# Tokenise every label once, then derive all per-label features from the
# stored token list so the tokenizer is not re-run for each feature.
datalabel_df['tokens']=datalabel_df.dataset_label.apply(get_tokens)
datalabel_df['word_shape']=datalabel_df.tokens.apply(get_word_shape)
datalabel_df['token_len']=datalabel_df.tokens.apply(len)
datalabel_df['num_punct']=datalabel_df.tokens.apply(get_num_punctuations)
datalabel_df['puncts']=datalabel_df.tokens.apply(get_punct)
datalabel_df['is_title']=datalabel_df.tokens.apply(is_title)
datalabel_df['is_upper']=datalabel_df.tokens.apply(is_upper)
datalabel_df['num_stopwords']=datalabel_df.tokens.apply(get_num_stopwords)
datalabel_df['stopwords']=datalabel_df.tokens.apply(get_stop_words)


# The first character of the first token's shape tells how the label starts
# ('X' is compared for a capital letter, 'd' for a digit).
# `bool(ws) and ...` makes a label with no tokens evaluate to False instead of
# raising IndexError on `ws[0]`.
datalabel_df['is_start_capital']=datalabel_df.word_shape.apply(lambda ws: bool(ws) and ws[0].startswith('X'))
datalabel_df['is_start_digit']=datalabel_df.word_shape.apply(lambda ws: bool(ws) and ws[0].startswith('d'))


# Keep the most referenced labels at the top for the previews that follow.
datalabel_df=datalabel_df.sort_values('Number Of Publications', ascending=False)
datalabel_df.head()

In [None]:
# nunique() counts how many DIFFERENT token lengths occur across labels,
# not a number of tokens, so the printed label says exactly that.
print('Number of distinct token lengths:', datalabel_df.token_len.nunique() )

# Set the theme before creating the axes so it applies to this figure.
sns.set_theme(style="dark")
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=datalabel_df.sort_values('token_len'),
              x='token_len',
              ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Number of dataset labels per token count')

plt.show()

In [None]:
# Labels that consist of exactly one token.
datalabel_df.loc[datalabel_df['token_len'].eq(1)]

It looks like labels with `token_len == 1` are acronyms.

# Starts with Capital Letter

In [None]:
# How many labels do / do not start with a capital letter.
plt.title('Distribution of Labels Starting With Capital Letters')
sns.countplot(x='is_start_capital', data=datalabel_df)
plt.show()


# Share of labels whose first token shape starts with an uppercase letter.
n_capital_start = datalabel_df.is_start_capital.sum()
n_labels_total = len(datalabel_df)
print("Percent of Datasets starting with capital letters:", 100 * n_capital_start/n_labels_total)

In [None]:
# Absolute counts behind the percentage: total labels, and labels whose first
# character is a capital letter or a digit.
label_total = len(datalabel_df)
capital_start_total = datalabel_df['is_start_capital'].sum()
digit_start_total = datalabel_df['is_start_digit'].sum()

print('Total Number Of datasets:', label_total)

print('Number Of Datasets starting with Capital Letter:', capital_start_total)
print('Number Of Datasets starting with Digits:', digit_start_total)

In [None]:
# Labels that start with neither a capital letter nor a digit.
not_capital_start = datalabel_df['is_start_capital'].eq(False)
not_digit_start = datalabel_df['is_start_digit'].eq(False)
datalabel_df[not_capital_start & not_digit_start]


In [None]:
# How many labels contain 0, 1, 2, ... stop words.
datalabel_df['num_stopwords'].value_counts()

In [None]:
# Pass the column by keyword, like the other countplot calls in this notebook.
# A positional vector is deprecated in seaborn 0.11 and is interpreted as
# `data` (not `x`) from seaborn 0.12 on, which no longer counts the values.
sns.countplot(data=datalabel_df, x='num_stopwords')

In [None]:
# Preview the first five rows of the enriched label table.
datalabel_df.head(n=5)

# Most Common Word Shapes

In [None]:
# Tally every word shape over all labels. Counter keeps keys in first-seen
# order, so the resulting table rows appear in the same order as a manual tally.
wshape_freq=Counter(
    wshape
    for ws_list in datalabel_df.word_shape.values
    for wshape in ws_list
)

# One row per distinct word shape with its frequency.
wshape_df=pd.DataFrame({
    'word_shape': list(wshape_freq.keys()),
    'freq': list(wshape_freq.values()),
})

# Length of the shape string itself, then order from rarest to most common.
wshape_df['wshape_len']=wshape_df.word_shape.map(len)
wshape_df=wshape_df.sort_values(by='freq')
wshape_df.head()

In [None]:
# Bar chart of how often each word shape occurs; bars follow the row order of
# `wshape_df`.
plt.figure(figsize=(12, 5))
plt.title('Word Shape Frequencies.')
plt.xticks(rotation='vertical')
sns.set_theme(style="dark")
sns.barplot(x='word_shape', y='freq', data=wshape_df)
plt.show()

In [None]:
# Word shapes from most to least frequent.
wshape_df.sort_values(by='freq', ascending=False)

Most common unigram word-shape features:
1. Xx(m) --> starts with a capital letter followed by lowercase letters
2. X(m) --> all letters are uppercase
3. x(m) --> all letters are lowercase (this will occur in conjunction with the other shapes, since we observe that ~93% of labels start with a capital letter)

In [None]:
# Distribution of word-shape string lengths over the distinct shapes.
plt.title('Word Shape Length:')
sns.countplot(x='wshape_len', data=wshape_df)

In [None]:
# Word shapes whose shape string is longer than six characters.
wshape_df.loc[wshape_df['wshape_len'].gt(6)]

# Further Work:
1. Bigrams of word shapes
2. Extracting acronyms from word phrases
3. Overlapping datasets