## Sentence-level dataframe for the Coleridge NER competition  
This notebook builds a sentence-level dataframe from the section-level extended dataset created [here](https://www.kaggle.com/davidemariani/coleridge-ner-extended-df).  
Please upvote if you find this useful!

In [None]:
import numpy as np 
import pandas as pd 
import json
import seaborn as sns
import re
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# MAIN SETTINGS
# Both values are passed to break_sentence() when the sentence dataframe is built.
max_sentence_length = 60  # max number of words in each slice of text
overlap = 20  # number of words shared by consecutive slices when a sentence is split into several slices

In [None]:
# Load the section-level extended dataframe and keep only the columns
# that the rest of the notebook reads.
train_df = pd.read_pickle(
    "../input/coleridge-ner-full-info-df/coleridge_train_extended.pkl"
)[[
    'Id',
    'pub_title',
    'dataset_title',
    'dataset_label',
    'cleaned_label',
    'section_title',
    'section_number',
    'text',
    'cleaned_text',
    'label_match',
]]

In [None]:
# Remove rows with null text. The shape is printed before and after the
# drop so the number of discarded rows is visible in the cell output.
print(train_df.shape)
train_df = train_df.dropna(subset = ['text'])
print(train_df.shape)

In [None]:
# Unique raw and cleaned dataset labels for every (Id, text) pair.
# Each result is a Series indexed by (Id, text) whose values are arrays of labels.
labels_by_id = train_df.groupby(['Id', 'text'])['dataset_label'].unique()
labels_by_id_cleaned = train_df.groupby(['Id', 'text'])['cleaned_label'].unique()

In [None]:
# For every row of train_df (in row order), look up the unique labels that
# belong to its (Id, text) pair. Iterating the two key columns directly avoids
# materialising a full row Series with .iloc several times per row.
labels_by_id_list = [
    labels_by_id[key] for key in zip(train_df['Id'], train_df['text'])
]
labels_by_id_list_cleaned = [
    labels_by_id_cleaned[key] for key in zip(train_df['Id'], train_df['text'])
]

In [None]:
# Columns holding, for every row, the list of unique dataset labels
# (raw and cleaned) looked up for that row.
# NOTE(review): despite the "_in_id" suffix, labels_by_id is grouped by both
# Id and text, so each list is per (Id, text) pair rather than per Id alone —
# confirm this is intended.
train_df['dataset_label_in_id'] = labels_by_id_list
train_df['dataset_label_in_id_cleaned'] = labels_by_id_list_cleaned

In [None]:
# Keep a single row per distinct text, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous index is
# preserved as an 'index' column.
train_df = train_df.drop_duplicates('text').reset_index()

In [None]:
# Columns with the list of datasets mentioned in the specific text: from the
# row's *_in_id label lists, keep only the labels that occur as substrings of
# the row's own text (raw labels are checked against `text`, cleaned labels
# against `cleaned_text`).
train_df['dataset_label_in_text'] = train_df.apply(lambda x:[j for j in x.dataset_label_in_id if j in x.text], axis=1)
train_df['dataset_label_in_text_cleaned'] = train_df.apply(lambda x:[j for j in x.dataset_label_in_id_cleaned if j in x.cleaned_text], axis=1)

In [None]:
# Report how many texts have label_match set, out of the total number of
# texts, together with the rounded percentage.
print(
    "There are {} texts with entities over {} - about {}%".format(
        train_df['label_match'].sum(),
        len(train_df),
        round(train_df['label_match'].sum() * 100 / len(train_df)),
    )
)

In [None]:
def clean_text(txt):
    """Return *txt* lower-cased with non-alphanumeric runs collapsed to a space.

    The input is first converted with ``str()``, so non-string values
    (e.g. ``None`` or numbers) are accepted. Every maximal run of characters
    outside ``A-Z``, ``a-z`` and ``0-9`` becomes a single space; leading and
    trailing spaces produced this way are not stripped.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
def break_sentence(sentence, max_sentence_length, overlap):
    """Split *sentence* into overlapping slices of at most *max_sentence_length* words.

    Args:
        sentence: Text to split; words are delimited by whitespace.
        max_sentence_length: Maximum number of words in each returned slice.
        overlap: Number of words shared by two consecutive slices.

    Returns:
        A list of strings. If the sentence has no more than
        ``max_sentence_length`` words it is returned unchanged as the only
        element. Otherwise each element is a space-joined window of words;
        consecutive windows start ``max_sentence_length - overlap`` words
        apart, and the last window is the first one that reaches the end of
        the sentence.

    Raises:
        ValueError: If the sentence must be split and ``overlap`` is not
            smaller than ``max_sentence_length`` (the window could not
            advance).
    """
    words = sentence.split()
    sentence_length = len(words)

    if sentence_length <= max_sentence_length:
        return [sentence]

    step = max_sentence_length - overlap
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step yields no slices at all, silently dropping the text.
        raise ValueError(
            "overlap must be smaller than max_sentence_length "
            "(got overlap={}, max_sentence_length={})".format(
                overlap, max_sentence_length
            )
        )

    broken_sentences = []
    for start in range(0, sentence_length, step):
        broken_sentences.append(" ".join(words[start:start + max_sentence_length]))
        # Stop once a window reaches the last word: any later window would
        # contain only words already fully covered by this one.
        if start + max_sentence_length >= sentence_length:
            break

    return broken_sentences

In [None]:
# Creating the sentences dataframe.
# Every text in train_df is split into sentences with nltk's sent_tokenize,
# and each sentence is further sliced by break_sentence() so that no slice
# exceeds max_sentence_length words. One output row is produced per slice.

# One list per output column; the key order defines the column order of the
# dataframe built from this dict.
s_dict = {
    column: []
    for column in (
        'Id',
        'sentence_id',
        'pub_title',
        'text',
        'cleaned_text',
        'section_title',
        'section_number',
        'dataset_label_in_id',
        'dataset_label_in_id_cleaned',
        'dataset_label_in_text',
        'dataset_label_in_text_cleaned',
        'has_reference',
        'has_reference_cleaned',
        'n_words',
    )
}

# Running sentence counter per publication Id. Keeping it keyed by Id (rather
# than resetting whenever the Id changes between consecutive rows) guarantees
# unique sentence ids even if rows of the same publication are not contiguous.
sentence_counts = {}

for t in range(train_df.shape[0]):
    slice_df = train_df.iloc[t]
    pub_id = slice_df.Id

    for sup_s in sent_tokenize(slice_df.text):

        for s in break_sentence(sup_s, max_sentence_length, overlap):
            count = sentence_counts.get(pub_id, 0) + 1
            sentence_counts[pub_id] = count

            c_text = clean_text(s)

            # Labels known to occur in the parent text that also occur in
            # this specific slice (substring match, raw and cleaned variants).
            ds_matches = [
                ds for ds in slice_df.dataset_label_in_text if ds in s
            ]
            ds_matches_cleaned = [
                ds_c
                for ds_c in slice_df.dataset_label_in_text_cleaned
                if ds_c in c_text
            ]

            s_dict['Id'].append(pub_id)
            # Sentence id is "<Id>_<counter>", counter zero-padded to 4 digits.
            s_dict['sentence_id'].append(pub_id + '_' + str(count).zfill(4))
            s_dict['pub_title'].append(slice_df.pub_title)
            s_dict['text'].append(s)
            s_dict['cleaned_text'].append(c_text)
            s_dict['section_title'].append(slice_df.section_title)
            s_dict['section_number'].append(slice_df.section_number)
            s_dict['dataset_label_in_id'].append(slice_df.dataset_label_in_id)
            s_dict['dataset_label_in_id_cleaned'].append(slice_df.dataset_label_in_id_cleaned)
            s_dict['dataset_label_in_text'].append(ds_matches)
            s_dict['dataset_label_in_text_cleaned'].append(ds_matches_cleaned)
            s_dict['has_reference'].append(len(ds_matches) > 0)
            s_dict['has_reference_cleaned'].append(len(ds_matches_cleaned) > 0)
            s_dict['n_words'].append(len(s.split()))

        
        
        

In [None]:
# Build the sentence-level dataframe: one column per s_dict key, one row per text slice.
sentence_df = pd.DataFrame(s_dict)

In [None]:
# Preview the first rows of the sentence dataframe.
sentence_df.head()

In [None]:
# Save the sentence dataframe as a pickle in the working directory.
# CSV alternative: sentence_df.to_csv("coleridge_sentence_df.csv") — note that
# the list-valued columns would be written as their string representation.
sentence_df.to_pickle("coleridge_sentence_df.pkl")