## Data Analysis  
Analysis of the given dataset, including a bag of words for the sentences in which dataset labels are mentioned, and N-grams around those mentions.

# Loading the data

In [None]:
import pandas as pd

import json
import re
import os

# Directory holding the competition input (train.csv and the per-split JSON documents).
INPUT_PATH = '/kaggle/input/coleridgeinitiative-show-us-the-data'
# Directory the result CSV files are written to.
WORKING_PATH = '/kaggle/working'

In [None]:
# The code in this cell is based on https://www.kaggle.com/hsbota/simple-baseline-finding-labels-in-text-hbot
def clean_text(txt):
    """Normalise ``txt`` to lower-case words separated by single spaces.

    The value is converted with ``str()``, lower-cased, every run of
    characters other than ASCII letters and digits is replaced by one space,
    and leading/trailing whitespace is stripped.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()


def read_doc(doc_id, split:str = 'train'):
    """Load and parse the JSON file of a single document.

    Args:
        doc_id: Identifier of the document; the file read is
            ``<INPUT_PATH>/<split>/<doc_id>.json``.
        split: Sub-directory of ``INPUT_PATH`` to read from.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join(INPUT_PATH, split, f'{doc_id}.json')
    # Use a context manager so the handle is closed deterministically, and
    # decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, encoding='utf-8') as doc_file:
        return json.load(doc_file)

    
def row_to_doc_text_uncleaned(row: pd.Series, split:str = 'train') -> str:
    """Return the raw, uncleaned text of the document referenced by ``row``.

    The document is loaded with ``read_doc(row['Id'], split)``. Every section
    is rendered as ``"<section_title>:. <text>"`` and the sections are joined
    with single spaces. No cleaning is applied to the text.

    Args:
        row: A row that provides the document identifier under ``'Id'``.
        split: Data split passed through to ``read_doc``.

    Returns:
        The concatenated section titles and texts of the document.
    """
    doc_content = read_doc(row['Id'], split)
    # A generator is enough for join(); no intermediate list is needed.
    return ' '.join(
        section['section_title'] + ':. ' + section['text']
        for section in doc_content
    )

In [None]:
# Load the training table.
train_csv_path = os.path.join(INPUT_PATH, 'train.csv')
train_df = pd.read_csv(train_csv_path)

# Attach the raw (uncleaned) full text of every referenced document.
train_df['pub_text'] = train_df.apply(
    row_to_doc_text_uncleaned,
    axis='columns',
    split='train',
)

# Number of distinct values per column, followed by a preview of the table.
print(train_df.nunique())
print("Sample of the dataset, including the raw text")
train_df.head()

In [None]:
# Number of distinct (non-missing) dataset labels.
unique_label_count = train_df['dataset_label'].nunique()
print("Amount of unique Datset labels: ", unique_label_count)

# Frequency table of the labels (missing values counted too), most frequent first.
print("Top 30 Mentioned Datasets: ")
dataset_label_counts = train_df['dataset_label'].value_counts(dropna=False)
df_dataset_counts = (
    dataset_label_counts
    .rename_axis('dataset_label')
    .reset_index(name='dataset_frequency')
)
print(df_dataset_counts.head(30))

print("Interessting observation: SARS-CoV-2 genome sequence is treated as a Dataset ")

# Extract sentences that contain the labeled dataset

In [None]:
# The text is not cleaned here, i.e. punctuation, capital letters and the
# like are preserved.

# Sentence boundary: ".", "!" or "?" followed by whitespace and a capital
# letter. At least one whitespace character is required so that abbreviations
# written without spaces (e.g. "U.S") are not cut apart.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
# Punctuation removed from words before they are used in n-grams.
_WORD_PUNCTUATION = re.compile(r'[,;:.!?]')
# A raw word is a maximal run of non-whitespace characters.
_WORD_TOKEN = re.compile(r'\S+')


def _locate_mention(sentence, label):
    """Map the first occurrence of ``label`` in ``sentence`` to word positions.

    Returns ``(words, first, last)`` where ``words`` is the sentence split on
    whitespace with the characters ``, ; : . ! ?`` removed, and ``first`` /
    ``last`` are the positions in ``words`` of the first and last word that
    overlap the label. Returns ``None`` if the label is empty, does not occur
    in the sentence, or does not overlap any word.
    """
    if not label:
        return None
    start = sentence.find(label)
    if start < 0:
        return None
    end = start + len(label)

    words = []
    first = last = None
    for token in _WORD_TOKEN.finditer(sentence):
        word = _WORD_PUNCTUATION.sub('', token.group())
        if not word:
            # Tokens consisting only of punctuation never become words.
            continue
        if token.start() < end and token.end() > start:
            if first is None:
                first = len(words)
            last = len(words)
        words.append(word)

    if first is None:
        return None
    return words, first, last


def _words_before(words, position, size):
    """Return the ``size`` words directly before ``position``, or None if there are fewer."""
    if position < size:
        return None
    return ' '.join(words[position - size:position])


def _words_after(words, position, size):
    """Return the ``size`` words directly after ``position``, or None if there are fewer."""
    if position + size >= len(words):
        return None
    return ' '.join(words[position + 1:position + 1 + size])


mentions_sentences = []
mentions_ds = []
mentions_id = []
mentions_index = []
mentions_three_gram = []
mentions_two_gram = []
mentions_one_gram = []
mentions_nthree_gram = []
mentions_ntwo_gram = []
mentions_none_gram = []
# These two lists are defined but not filled anywhere in this cell.
not_found_label = []
not_found_id = []


for doc_id, label, pub_text in zip(train_df['Id'],
                                   train_df['dataset_label'],
                                   train_df['pub_text']):
    for sentence in _SENTENCE_BOUNDARY.split(pub_text):
        # Only sentences that contain the label as a substring are kept.
        mention = _locate_mention(sentence, label)
        if mention is None:
            continue
        # The word positions come from the actual location of the label in
        # the sentence, not from independent look-ups of each label word,
        # which could hit unrelated earlier occurrences of common words.
        word_list, first_word, last_word = mention

        # add sentence, label and Id to the collected mentions
        mentions_sentences.append(sentence)
        mentions_ds.append(label)
        mentions_id.append(doc_id)
        # index of the first word of the dataset mention
        mentions_index.append(first_word)

        # 3/2/1 words directly before the mention
        mentions_three_gram.append(_words_before(word_list, first_word, 3))
        mentions_two_gram.append(_words_before(word_list, first_word, 2))
        mentions_one_gram.append(_words_before(word_list, first_word, 1))

        # 3/2/1 words directly after the mention
        mentions_nthree_gram.append(_words_after(word_list, last_word, 3))
        mentions_ntwo_gram.append(_words_after(word_list, last_word, 2))
        mentions_none_gram.append(_words_after(word_list, last_word, 1))


mentions_dict = {
    'sentences': mentions_sentences,
    'dataset_labels': mentions_ds,
    'id': mentions_id,
    'index': mentions_index,
    '3-Gram': mentions_three_gram,
    '2-Gram': mentions_two_gram,
    '1-Gram': mentions_one_gram,
    '-3-Gram': mentions_nthree_gram,
    '-2-Gram': mentions_ntwo_gram,
    '-1-Gram': mentions_none_gram,
}
mentions_df = pd.DataFrame(
    mentions_dict,
    columns=['sentences', 'dataset_labels', 'id', 'index',
             '3-Gram', '2-Gram', '1-Gram',
             '-3-Gram', '-2-Gram', '-1-Gram'],
)
mentions_df.head()

# Example sentences that mention dataset label 

In [None]:
# Widen the column display so that long sentences are shown up to 500 characters.
pd.set_option('display.max_colwidth', 500)

# Show 15 randomly drawn mention sentences together with their labels.
preview_columns = ['sentences', 'dataset_labels']
mentions_df.sample(n=15)[preview_columns]

# Bag of Words for all words in extracted sentences

In [None]:
# Clean every mention sentence and concatenate them. The sentences must be
# joined with a space: clean_text() strips surrounding whitespace, so joining
# with '' would fuse the last word of one sentence with the first word of the
# next and distort the word counts.
cleaned = ' '.join(clean_text(sentence) for sentence in mentions_sentences)
word_list = cleaned.split()
# One row per word occurrence.
word_dict = {'word': word_list}
word_df = pd.DataFrame(word_dict, columns=['word'])

In [None]:
# Total number of word occurrences in the mention sentences.
total_words = len(word_df.index)
print("Words in mention sentences: ", total_words)

# Frequency table of the words, most frequent first.
word_counts = word_df['word'].value_counts(dropna=False)
df_word_counts = (
    word_counts
    .rename_axis('unique_words')
    .reset_index(name='word_frequency')
)
print("Unique words: ", len(df_word_counts.index))
df_word_counts.head(30)

# Dataset labels and N-Grams 

In [None]:
# Print a summary of the mentions table (columns, non-null counts, dtypes).
mentions_df.info()

In [None]:
# Overall sizes of the input table and of the extracted mentions.
size_reports = (
    ("Amount of papers searched: ", len(train_df.index)),
    ("Amount of mention sentences: ", len(mentions_df.index)),
)
for caption, amount in size_reports:
    print(caption, amount, " \n")

# (heading, column of mentions_df, number of most frequent values to print)
top_value_reports = (
    ("Top 10 Dataset_labels in mention sentences: ", 'dataset_labels', 10),
    ("Top 10 Index of first word of Dataset: ", 'index', 10),
    ("Top 20 3_Grams:", '3-Gram', 20),
    ("Top 20 2_Grams:", '2-Gram', 20),
    ("Top 20 1_Grams:", '1-Gram', 20),
    ("Top 20 -3_Grams:", '-3-Gram', 20),
    ("Top 20 -2_Grams:", '-2-Gram', 20),
    ("Top 20 -1_Grams:", '-1-Gram', 20),
)
for heading, column, limit in top_value_reports:
    print(heading)
    print(mentions_df[column].value_counts().nlargest(limit), " \n")





In [None]:
# Save the mentions table as CSV in the working directory.
mentions_path = os.path.join(WORKING_PATH, 'mentions.csv')
mentions_df.to_csv(mentions_path)

# Save the bag-of-words table as CSV in the working directory.
words_path = os.path.join(WORKING_PATH, 'bag_of_words.csv')
word_df.to_csv(words_path)