**All necessary libraries**

In [None]:
import json
import os
import warnings
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import string
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
warnings.filterwarnings("ignore")

# Lets understand the data #

1. The objective of the competition is to identify the mention of datasets within scientific publications.
2. The predictions will be short excerpts from the publications that appear to note a dataset.

**Files present in our dataset**

* train - the full text of the training set's publications in JSON format, broken into sections with section titles
* test - the full text of the test set's publications in JSON format, broken into sections with section titles
* train.csv - labels and metadata for the training set
* sample_submission.csv - a sample submission file in the correct format


In [None]:
# Root folder of the competition data; every other path is derived from it.
DATA_DIR = "../input/coleridgeinitiative-show-us-the-data"
all_dir = os.listdir(DATA_DIR)
print(all_dir)
# Folders holding one JSON file per publication.
train_dir = os.path.join(DATA_DIR, "train")
# Previously pointed at the train folder by mistake.
test_path = os.path.join(DATA_DIR, "test")
sub_path = os.path.join(DATA_DIR, "sample_submission.csv")
# `train_path` is the labels CSV (this is the value the read_csv cell expects).
train_path = os.path.join(DATA_DIR, "train.csv")

**columns in csv files**

* id - publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets
* pub_title - title of the publication (a small number of publications have the same title)
* dataset_title - the title of the dataset that is mentioned within the publication
* dataset_label - a portion of the text that indicates the dataset
* cleaned_label - the dataset_label, as passed through the clean_text function from the Evaluation page


In [None]:
# Load the labels/metadata table located at `train_path`.
train_df = pd.read_csv(filepath_or_buffer=train_path)
# Preview the first rows (head() defaults to 5).
train_df.head()

In [None]:
# Parse one training publication and show its first two entries.
sample_json_path = "../input/coleridgeinitiative-show-us-the-data/train/0007f880-0a9b-492d-9a58-76eb0b0e0bd7.json"
with open(sample_json_path) as sample_file:
    sample = json.loads(sample_file.read())
sample[:2]

**The main aim of the competition is to find the dataset labels mentioned in each publication (provided in JSON format) and to use those same dataset labels to generate predictions for the test dataset.**

In [None]:
# Rows of one example publication, keeping a single row per dataset title.
is_example_pub = train_df['Id'] == "170113f9-399c-489e-ab53-2faf5c64c5bc"
duplicate_df = train_df.loc[is_example_pub].drop_duplicates(subset="dataset_title")
duplicate_df

**From the above it is clear that a single Id can have more than one dataset label, and that the same label may also belong to a different Id.**

In [None]:
# Summary statistics for train_df as computed by DataFrame.describe().
train_df.describe()

 **let us check the Id having different dataset labels**

In [None]:
# Number of rows per publication Id, largest first.
rows_per_id = train_df.groupby('Id').count()
dataset_title = rows_per_id[['dataset_title']].sort_values(by="dataset_title", ascending=False)
# Keep only the Ids that appear with more than one row.
has_multiple = dataset_title['dataset_title'] > 1
id_pub_title = dataset_title.loc[has_multiple, ['dataset_title']].reset_index()

In [None]:
# Top 20 publication Ids ranked by their number of dataset rows.
top_ids = id_pub_title.iloc[:20]
plt.figure(figsize=(13, 13))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally (only `data` may be positional).
sns.barplot(x=top_ids['dataset_title'], y=top_ids['Id'])
plt.title("dataset titles vs publication")
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("")
plt.xlabel("Count", fontsize=14)

**Similarly we can get for the publication title vs dataset title**

In [None]:
# Number of rows per publication title, largest first.
rows_per_pub_title = train_df.groupby('pub_title').count()
pub_title = rows_per_pub_title[['dataset_title']].sort_values(by=['dataset_title'], ascending=False)
# Turn the pub_title index back into a regular column and display the result.
id_title = pub_title.reset_index()
id_title

In [None]:

# Top 20 publication titles ranked by their number of dataset rows.
top_pub_titles = id_title.iloc[:20]
plt.figure(figsize=(13, 13))
# The y axis must come from the same frame as the counts: the publication
# titles in `id_title`, not the Ids of the unrelated `id_pub_title` frame.
# x/y are passed by keyword because recent seaborn releases require it.
sns.barplot(x=top_pub_titles['dataset_title'], y=top_pub_titles['pub_title'])
plt.title("dataset titles vs publication titles")
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("")
plt.xlabel("Count", fontsize=14)

**Now check how many dataset labels are present for each dataset title**

In [None]:
# Number of label rows per dataset title, largest first.
# Sort by the count column ('dataset_label'); sorting by 'dataset_title' would
# order the rows by the title text (the group key) rather than by the count.
data_title = (
    train_df.groupby('dataset_title')
    .count()[['dataset_label']]
    .sort_values(by='dataset_label', ascending=False)
)
# Show the counts computed in this cell. `id_title` is deliberately left
# untouched: it still holds the per-publication-title counts built earlier.
data_title.reset_index()

In [None]:

# Top 20 dataset titles ranked by their number of label rows.
# The frame is built and re-sorted here from `data_title` so that the chart
# really shows label counts per dataset title, instead of repeating the
# publication-title counts against unrelated publication Ids.
top_dataset_titles = (
    data_title.reset_index()
    .sort_values(by='dataset_label', ascending=False)
    .iloc[:20]
)
plt.figure(figsize=(13, 13))
# x/y are passed by keyword because recent seaborn releases require it.
sns.barplot(x=top_dataset_titles['dataset_label'], y=top_dataset_titles['dataset_title'])
plt.title("dataset titles vs dataset label")
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("")
plt.xlabel("Count", fontsize=14)

In [None]:
# Show 5 randomly drawn rows; no random_state is passed, so the selection can differ between runs.
train_df.sample(5)

# Word cloud representation

In [None]:
# Word cloud of the unique dataset titles.
# The stop-word set gets its own name: rebinding `stopwords` here would shadow
# the `nltk.corpus.stopwords` object imported in the first cell.
wordcloud_stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color='white',
                      stopwords=wordcloud_stopwords,
                      max_words=100,
                      max_font_size=30,
                      scale=3,
                      random_state=1)

# Join the titles explicitly. str() of a NumPy array yields its printed repr
# (brackets, quotes, and "..." once the array is long enough to be summarised),
# which is not the text we want to count words in.
titles_text = " ".join(train_df['dataset_title'].dropna().astype(str).unique())
wordcloud = wordcloud.generate(titles_text)
plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

**Words shown in a larger size are the ones repeated most often in the dataset titles.
Similarly, you can plot the same word cloud for the text data.**

**Now lets concat the text in json file with our train csv file**

In [None]:
def get_text(filename, test=False,
             data_dir='../input/coleridgeinitiative-show-us-the-data'):
    """Return the full text of one publication as a single string.

    Parameters
    ----------
    filename : str
        Publication id, i.e. the JSON file name without the ``.json`` suffix.
    test : bool, default False
        Read from the ``test`` sub-folder when True, otherwise from ``train``.
    data_dir : str, optional
        Folder containing the ``train`` and ``test`` sub-folders. Defaults to
        the competition input directory used throughout this notebook.

    Returns
    -------
    str
        The ``text`` field of every entry in the file, joined with one space.

    Raises
    ------
    FileNotFoundError
        If the publication file does not exist.
    KeyError
        If an entry has no ``text`` field.
    """
    split = 'test' if test else 'train'
    path = os.path.join(data_dir, split, '{}.json'.format(filename))
    # The file is read with the json module rather than through a DataFrame:
    # only the 'text' fields are needed, and this keeps them as the exact
    # strings stored in the file (no dtype inference).
    with open(path, encoding='utf-8') as f:
        sections = json.load(f)
    return " ".join(section['text'] for section in sections)

In [None]:
# Attach the text returned by get_text for each row's Id, then preview 5 random rows.
train_df['text'] = train_df['Id'].map(get_text)
train_df.sample(n=5)

# Text Preprocessing(optional)

***lower casing the text***

In [None]:
# Store a lower-cased copy of the 'text' column in a new 'lower' column.
train_df['lower'] = train_df['text'].str.lower()

In [None]:
# Display the frame to inspect its current columns.
train_df

***removing punctuation***


we also need to remove the punctuation symbols from the text.

`string.punctuation` in Python contains the following punctuation symbols:

`` !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ ``

We can add or remove more punctuations as per our need.

In [None]:
PUNCT_TO_REMOVE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'
# Translation table built once instead of on every call.
# Punctuation characters are deleted; the newline is mapped to a space instead
# of being deleted, because deleting it would glue the last word of one line
# to the first word of the next ("end\nstart" -> "endstart").
_PUNCT_TABLE = str.maketrans('\n', ' ', PUNCT_TO_REMOVE.replace('\n', ''))


def remove_punctuation(text):
    """Return ``text`` with punctuation removed and newlines turned into spaces.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without any punctuation character listed in
        ``PUNCT_TO_REMOVE``; each newline is replaced by a single space.
    """
    return text.translate(_PUNCT_TABLE)

# Run remove_punctuation over the lower-cased text and keep the result in its own column.
train_df["text_wo_punct"] = train_df["lower"].apply(remove_punctuation)
train_df

In [None]:
# Write the current train_df to 'train_df.csv', leaving out the index.
train_df.to_csv('train_df.csv', index = False)

***removing stopwords***

In [None]:
# Imported in this cell so that `stopwords` is bound to the NLTK corpus object
# when the English stop-word list is built on the next line.
from nltk.corpus import stopwords
Stopwords = list(stopwords.words('english'))


In [None]:
Stopwords = list(stopwords.words('english'))
# Hash-based copy for membership tests: looking a word up in the list scans
# the whole list for every word of every document.
_STOPWORD_SET = frozenset(Stopwords)


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    ``text`` is converted with ``str()``, split on whitespace, and the words
    that are not in the NLTK English stop-word list are joined back with
    single spaces. Matching is exact and case-sensitive.
    """
    return " ".join(word for word in str(text).split() if word not in _STOPWORD_SET)

# Run remove_stopwords over the punctuation-free text and keep the result in its own column.
train_df["text_wo_stop"] = train_df["text_wo_punct"].apply(remove_stopwords)
train_df.head(5)

In [None]:
# Print the first 250 characters of the processed and the raw text of the row labelled 0.
str1 = train_df.loc[0, 'text_wo_stop']
str2 = train_df.loc[0, 'text']
for preview in (str1, str2):
    print(preview[:250])

# String Matching

Now we have to get all the dataset titles from the csv file and use the same dataset titles for getting predictions over test files

In [None]:
# Build the test frame: one row per publication JSON file in the test folder.
test_dir = '../input/coleridgeinitiative-show-us-the-data/test'
# Keep only *.json entries (any other directory entry would make get_text
# fail) and sort them so the row order does not depend on the listing order.
test_files = sorted(name for name in os.listdir(test_dir) if name.endswith('.json'))
# splitext removes only the final extension, so an id containing a dot stays intact.
test = pd.DataFrame({'Id': [os.path.splitext(name)[0] for name in test_files]})
test['text'] = test['Id'].apply(get_text, test=True)

In [None]:
# Display the test frame.
test

In [None]:
# All known dataset titles and labels, lower-cased.
# Lower-casing happens *before* de-duplication so that strings differing only
# in case collapse into one entry; missing values are dropped so `.lower()`
# cannot fail on NaN; sorting gives the same matching order on every run
# (iteration order of a set of strings is not stable between sessions).
known_names = pd.concat([train_df['dataset_title'], train_df['dataset_label']]).dropna()
titles = sorted({str(name).lower() for name in known_names})


In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and replace each run of non-alphanumeric characters with one space.

    The input is converted with ``str()`` first; leading and trailing
    whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load the sample submission file into submission_df.
submission_df = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

In [None]:
# String matching: for every publication in the submission file, collect the
# known titles/labels that occur verbatim in its lower-cased text.
labels = []
for pub_id in submission_df['Id']:
    pub_text = test.loc[test['Id'] == pub_id, 'text'].str.cat(sep='\n').lower()
    # Different raw titles can clean to the same string; a dict keeps one copy
    # of each cleaned label in first-seen order, so the prediction string
    # never repeats a label.
    matched = {}
    for title in titles:
        if title in pub_text:
            cleaned = clean_text(title)
            if cleaned:  # skip titles that clean to an empty string
                matched[cleaned] = None
    labels.append("|".join(matched))

In [None]:
# Inspect the prediction strings collected in `labels`.
labels

In [None]:
# Store the collected prediction strings in the 'PredictionString' column.
submission_df['PredictionString'] = labels

In [None]:
# Display the submission frame.
submission_df

In [None]:
# Write the submission frame to 'submission.csv', leaving out the index.
submission_df.to_csv('submission.csv', index = False)

**Thank you for your patience in reading my notebook.
Please upvote if you understood it.**

**Credits:**
https://www.kaggle.com/anthokalel/coleridge-complete-eda