In [None]:
#Libraries

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import re
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm.autonotebook import tqdm
from functools import partial
from wordcloud import WordCloud, STOPWORDS
import nltk
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
nlp.max_length = 4000000
from nltk.probability import FreqDist


<b>Train Data Exploration</b>

In [None]:
# Read the competition's training labels and preview the first five rows.
train = pd.read_csv(
    '../input/coleridgeinitiative-show-us-the-data/train.csv'
)
train.head(n=5)

In [None]:
# Display the column labels of the training table.
train.columns

<h3><b>Data Description</b></h3>
<p></p>
<ul>
    <li><b>id-</b> publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets.</li>
    <li><b>pub_title-</b>title of the publication (a small number of publications have the same title).</li>
    <li><b>dataset_title-</b>the title of the dataset that is mentioned within the publication.</li>
    <li><b>dataset_label-</b>a portion of the text that indicates the dataset.</li>
    <li><b>cleaned_label-</b>the dataset_label, as passed through the clean_text function from the Evaluation page.</li>
</ul>

In [None]:
# Print the per-column summary of `train` (used below to check for missing values).
train.info()

<b>So we have no 'NULL' values in the train data</b>

In [None]:
# Print "<column>:<number of distinct values>" for every column of `train`.
for col_name, col_values in train.items():
    distinct = col_values.unique()
    print(col_name + ":" + str(len(distinct)))

<h4>Inference</h4>


- The Training Dataset has 19,661 samples but only 14,316 unique IDs in the dataset. This means that some publications include a multitude of datasets. 


- The pub_title unique count is also lower than the Id unique count. This points to the presence of several cases where two separate publications, each with a unique ID, share the exact same title.


- Also, there are a total of 45 unique dataset_title values and 130 unique dataset_label values. This means that a single dataset can have multiple labels across different publications.

<b>Sample Submission</b>

In [None]:
# Read the sample submission file and preview the first five rows.
sample_sub = pd.read_csv(
    '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
)
sample_sub.head(n=5)

<h3><b>Data Description</b></h3>
<p></p>
<ul>
    <li><b>id-</b> publication id </li>
    <li><b>PredictionString-</b> to be filled with the equivalent of the train data's cleaned_label.</li>
    
</ul>

<h3><b>Data Processing</b></h3>

<b>Now we will create a function to get the text from the JSON file and append it to the new column in table</b>

In [None]:
# Directories from which json_to_text loads the "<Id>.json" publication files.
# train_files_path is json_to_text's default; test_files_path is passed
# explicitly when loading the sample-submission publications.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
def json_to_text(filename, train_files_path=train_files_path, output='text'):
    """Load one publication JSON file and flatten it into a single string.

    The file is expected to hold a list of section objects, each with a
    'section_title' and a 'text' entry.

    Parameters
    ----------
    filename : str
        Publication id; the file read is ``<train_files_path>/<filename>.json``.
    train_files_path : str
        Directory containing the JSON files (pass the test directory to read
        test publications).
    output : str
        'text' -> all section bodies joined by a single space (default);
        'head' -> all section titles joined by a single space;
        anything else -> titles and bodies interleaved, joined by '. '.

    Returns
    -------
    str
        The flattened publication text in the requested form.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing or null field becomes '' so that str.join cannot raise
    # TypeError on a None entry.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave as title, body, title, body, ... in document order.
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)

In [None]:
tqdm.pandas()
# `train` has one row per (publication, dataset) pair, so the same Id can occur
# on several rows. Parse each publication's JSON file once and map the result
# back onto every row instead of re-reading the file for each duplicate.
unique_ids = train['Id'].drop_duplicates()
id_to_text = dict(zip(unique_ids, unique_ids.progress_apply(json_to_text)))
train['text'] = train['Id'].map(id_to_text)

<b>Let's see the Train Data now</b>

In [None]:
# Preview `train` again, now including the 'text' column added above.
train.head()

<b>Now apply the function to submission Data</b>

In [None]:
tqdm.pandas()
# Same loader as for the training rows, but pointed at the test directory.
sample_sub['text'] = sample_sub['Id'].progress_apply(
    lambda pub_id: json_to_text(pub_id, train_files_path=test_files_path)
)

In [None]:
# Preview `sample_sub`, now including the 'text' column added above.
sample_sub.head()

<b>Create a function to preprocess the data using basic NLP filters (converts all text to lower case and removes special characters, emojis and multiple spaces)</b>

In [None]:
def text_cleaning(text):
    """Normalize text to lower-case ASCII alphanumerics separated by single spaces.

    Every run of characters outside ``[A-Za-z0-9]`` (punctuation, emojis,
    other symbols, any whitespace) is replaced by ONE space, then the result
    is stripped. This is the same rule as the competition's ``clean_text``
    that produced ``cleaned_label``, so cleaned text and cleaned labels are
    tokenized alike: "well-known" becomes "well known", not "wellknown".

    Parameters
    ----------
    text : object
        Text to clean. ``None`` and float NaN (missing values) yield ``''``;
        any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # One substitution removes punctuation and emojis and collapses repeated
    # spaces, so no separate emoji or multi-space pass is needed.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
tqdm.pandas()
train['text'] = train['text'].progress_apply(text_cleaning)
# Clean the test publications the same way: the similarity cells further down
# compare train['text'] with sample_sub['text'], and comparing cleaned text
# against raw text (mixed case, punctuation) would distort the scores.
sample_sub['text'] = sample_sub['text'].progress_apply(text_cleaning)

<h3><center>EDA with Visualization</center></h3>

In [None]:
# Collect the distinct cleaned labels and preview the first five.
ul = pd.unique(train['cleaned_label'])
ul[:5]

In [None]:
# Report how many distinct cleaned labels were found.
label_count = len(ul)
print('There are {} unique cleaned labels'.format(label_count))

### Here is a word cloud based on texts from the first unique label

In [None]:
# Cleaned texts of the publications tagged with the first unique label.
label_texts = train.loc[train['cleaned_label'] == ul[0], 'text']
# Sample about 30% of them, but never fewer than one document: an empty sample
# would give WordCloud.generate no words to draw. The fixed seed keeps the
# figure identical across runs.
sample_size = max(1, round(0.3 * len(label_texts)))
text = ' '.join(label_texts.sample(n=sample_size, random_state=42))
# max_words=100 makes the "Top 100" title accurate (the library default is 200).
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=100,
                      width=2000, height=1200).generate(text)
barplot_dim = (15, 15)
# plt.subplots returns (figure, axes); unpack both instead of binding the tuple.
fig, ax = plt.subplots(figsize=barplot_dim, facecolor='w')
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Top 100 Most Common Words in Publications Text', fontsize=50)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

### Here is a word cloud based on texts from the unique label at index 10

In [None]:
# Cleaned texts of the publications tagged with the unique label at index 10.
label_texts = train.loc[train['cleaned_label'] == ul[10], 'text']
# Sample about 30% of them, but never fewer than one document: an empty sample
# would give WordCloud.generate no words to draw. The fixed seed keeps the
# figure identical across runs.
sample_size = max(1, round(0.3 * len(label_texts)))
text = ' '.join(label_texts.sample(n=sample_size, random_state=42))
# max_words=100 makes the "Top 100" title accurate (the library default is 200).
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=100,
                      width=2000, height=1200).generate(text)
barplot_dim = (15, 15)
# plt.subplots returns (figure, axes); unpack both instead of binding the tuple.
fig, ax = plt.subplots(figsize=barplot_dim, facecolor='w')
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Top 100 Most Common Words in Publications Text', fontsize=50)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

### Here is a word cloud based on texts from the unique label at index 100

In [None]:
# Cleaned texts of the publications tagged with the unique label at index 100.
label_texts = train.loc[train['cleaned_label'] == ul[100], 'text']
# Sample about 30% of them, but never fewer than one document: an empty sample
# would give WordCloud.generate no words to draw. The fixed seed keeps the
# figure identical across runs.
sample_size = max(1, round(0.3 * len(label_texts)))
text = ' '.join(label_texts.sample(n=sample_size, random_state=42))
# max_words=100 makes the "Top 100" title accurate (the library default is 200).
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=100,
                      width=2000, height=1200).generate(text)
barplot_dim = (15, 15)
# plt.subplots returns (figure, axes); unpack both instead of binding the tuple.
fig, ax = plt.subplots(figsize=barplot_dim, facecolor='w')
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Top 100 Most Common Words in Publications Text', fontsize=50)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

<h3><center>Some Similarity Measures</center></h3>

### Cosine similarity function

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def cosine(train, test):
    """
    Cosine similarity between the binary bag-of-words vectors of two texts.

    Each text is tokenized with NLTK's word_tokenize, English stop words are
    dropped, and the remaining distinct tokens form a 0/1 vector over the
    combined vocabulary. For such vectors the cosine reduces to
    ``|X & Y| / sqrt(|X| * |Y|)``, which is computed directly from the sets.

    Parameters
    ----------
    train : str
        Text taken from the train set.
    test : str
        Text taken from the test set.

    Returns
    -------
    float
        Similarity in [0, 1]. Returns 0.0 when either text has no tokens left
        after stop-word removal (the ratio would otherwise divide by zero).
    """
    # A set gives constant-time membership checks for the stop-word filter.
    sw = set(stopwords.words('english'))
    X_set = {w for w in word_tokenize(train) if w not in sw}
    Y_set = {w for w in word_tokenize(test) if w not in sw}
    if not X_set or not Y_set:
        return 0.0
    shared = len(X_set & Y_set)
    return shared / float((len(X_set) * len(Y_set)) ** 0.5)

In [None]:
# Cosine similarity of the first training text against every test text.
first_train_text = train['text'][0]
for j, test_text in sample_sub['text'].items():
    print('Similarity of first train text with text {} from test data'.format(j))
    print(cosine(first_train_text, test_text))

In [None]:
# Cosine similarity of the second training text against every test text.
second_train_text = train['text'][1]
for j, test_text in sample_sub['text'].items():
    print('Similarity of second train text with text {} from test data'.format(j))
    print(cosine(second_train_text, test_text))

### Jaccard similarity

In [None]:
def jaccard_similarity(text_a, text_b):
    """Jaccard similarity between the whitespace-separated word sets of two texts.

    Parameters
    ----------
    text_a, text_b : str
        Texts to compare; each is split on whitespace into a set of words.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in [0, 1]. Returns 0.0 when neither text
        contains a word, since the ratio would otherwise be 0 / 0.
    """
    word_set_a = set(text_a.split())
    word_set_b = set(text_b.split())
    num_total = len(word_set_a | word_set_b)
    if num_total == 0:
        return 0.0
    num_shared = len(word_set_a & word_set_b)
    return num_shared / num_total

In [None]:
# Jaccard similarity of every test text against the first training text.
first_train_text = train['text'][0]
for j, test_text in sample_sub['text'].items():
    print('Similarity of first train text with text {} from test data'.format(j))
    similarity = jaccard_similarity(test_text, first_train_text)
    print(similarity)

In [None]:
# Jaccard similarity of every test text against the second training text.
second_train_text = train['text'][1]
for j, test_text in sample_sub['text'].items():
    print('Similarity of second train text with text {} from test data'.format(j))
    similarity = jaccard_similarity(test_text, second_train_text)
    print(similarity)

### SequenceMatcher from difflib

In [None]:
import difflib

# difflib ratio (as a percentage) of every test text against the first training text.
first_train_text = train['text'][0]
for j, test_text in sample_sub['text'].items():
    print('Similarity of first train text with text {} from test data'.format(j))
    d = difflib.SequenceMatcher(None, test_text, first_train_text)
    similarity = d.ratio()*100
    print(similarity)

In [None]:
# difflib ratio (as a percentage) of every test text against the second training text.
second_train_text = train['text'][1]
for j, test_text in sample_sub['text'].items():
    print('Similarity of second train text with text {} from test data'.format(j))
    d = difflib.SequenceMatcher(None, test_text, second_train_text)
    similarity = d.ratio()*100
    print(similarity)

There are several other distance/similarity measures. Many of them are available in packages such as sklearn, spaCy and NLTK.

# UPVOTE if you like this notebook.
## Thanks