<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Create, clean and label training dataset
Using 5978 featured articles scraped from Wikipedia, this notebook builds a sentence-level labeled training dataset.
- First, read the scraped text from the CSV files.
- After cleaning the text, separate cited statements (positive label) from non-cited statements (negative label).
- Save the negative-label dataframe.
- Positive statements need to be split further, because a cited statement can still contain non-cited sentences. To do this, a sentence
   boundary is forced after each closing citation bracket `]`, using a custom sentence-segmentation component in spaCy.
- Save the positive-label dataframe.
- Finally, combine the negative and positive dataframes to obtain the training dataset.

In [1]:
# import packages
import pandas as pd
import numpy as np
import unicodedata
import sklearn.utils
import random
import os
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline
import re
import glob
from pathlib import Path
import matplotlib.pyplot as plt
from util import df_tolist, create_label, string_limit, unicode_text, text_cleaning


In [2]:
# The training data comes from 5978 Wikipedia featured articles, stored as
# several CSV files inside the directory below.
full_path = 'E:/Sharpest_Mind/WikipediaCitation/data/processed/csv_files/Context_sentences_traindata'

# File names of every CSV found in that directory.
csvs = [entry for entry in os.listdir(full_path) if entry.endswith('.csv')]

# Dictionary keys are the file names without their extension (stats.csv -> stats).
fns = [os.path.splitext(os.path.basename(entry))[0] for entry in csvs]

# Map each key to the DataFrame read from the matching file.
d = {
    key: pd.read_csv(full_path + '/' + csv_name, encoding='iso-8859-1')
    for key, csv_name in zip(fns, csvs)
}

In [3]:
# List the loaded datasets: one key per CSV file.
for dataset_name in d:
    print(dataset_name)

Citation_needed_wiki
data0_1000links
data2001_4000links
data4001_5000links
datat5001_5978links


In [4]:
# Preview the DataFrame loaded under the key 'data0_1000links'.
d['data0_1000links']

Unnamed: 0,Statements
0,7 World Trade Center (7 WTC or WTC-7) refers t...
1,The original 7 World Trade Center was 47 stori...
2,"On September 11, 2001, the structure was subst..."
3,Construction of the new 7 World Trade Center b...
4,The original 7 World Trade Center was a 47-sto...
...,...
37224,"The following year, the Mint hired Saint-Gaude..."
37225,Large quantities of double eagles were melted ...
37226,"In the 19th century, double eagles were little..."
37227,As it became clear in the 1940s that the withd...


In [5]:
# Links 0-1000: pass the raw frame through the util helpers in order
# (string_limit, then create_label, then unicode_text).
raw_0_1000 = d['data0_1000links']
df1 = unicode_text(create_label(string_limit(raw_0_1000)))
# Label distribution, followed by a preview of the last rows.
print(df1['label'].value_counts())
df1.tail()

1    29145
0     2714
Name: label, dtype: int64


Unnamed: 0,Statements,label
287,There is only limited evidence for use after t...,1
12301,The four-spored basidia typically measure 243...,1
6063,In addition to exhibitions of the Four Freedom...,1
4434,Altar 28 is located near Structure 10 in the C...,1
26918,"Larger than most Australian frogs, the Austral...",0


In [6]:
# Links 2001-4000: pass the raw frame through the util helpers in order
# (string_limit, then create_label, then unicode_text).
# NOTE(review): the loaded keys printed earlier contain no dataset for links
# 1001-2000 - confirm whether that file is missing from the data directory.
raw_2001_4000 = d['data2001_4000links']
df2 = unicode_text(create_label(string_limit(raw_2001_4000)))
# Label distribution, followed by a preview of the last rows.
print(df2['label'].value_counts())
df2.tail()

1    68701
0     7798
Name: label, dtype: int64


Unnamed: 0,Statements,label
33484,"In Virginia, Allison produced light rainfall, ...",1
25514,"Isaacs found ""Say Hello to My Little Friend"" t...",1
261,The Constitution introduces separation of powe...,1
51152,The significantly higher density of Enceladus ...,1
39773,"""Wanna Be Startin' Somethin'"" climaxes in an A...",1


In [7]:
# Links 4001-5000: pass the raw frame through the util helpers in order
# (string_limit, then create_label, then unicode_text).
raw_4001_5000 = d['data4001_5000links']
df3 = unicode_text(create_label(string_limit(raw_4001_5000)))
# Label distribution, followed by a preview of the last rows.
print(df3['label'].value_counts())
df3.tail()

1    31974
0     3033
Name: label, dtype: int64


Unnamed: 0,Statements,label
1599,"In appearance, Helena was described by John Va...",1
34654,"Meanwhile, Sega released the 32X on November 2...",1
34845,Sonic Team felt challenged by the new hardware...,1
31948,"Kratos surprises Zeus, seizes the Blade of Oly...",1
1606,Considered by contemporaries to be a harsh but...,0


In [9]:
# Links 4001-5000 (second pass over the same key).
# NOTE(review): this reads d['data4001_5000links'], the same dataset used for df3,
# and the recorded label counts are identical to df3's. Any later step that
# combines df3 and df4 can therefore contain repeated statements - confirm which
# dataset df4 was meant to use.
raw_for_df4 = d['data4001_5000links']
df4 = unicode_text(create_label(string_limit(raw_for_df4)))
# Label distribution, followed by a preview of the last rows.
print(df4['label'].value_counts())
df4.tail()

1    31974
0     3033
Name: label, dtype: int64


Unnamed: 0,Statements,label
34941,"After the uneventful Saturn era, the series fo...",1
32278,While no new locales were introduced in Episod...,1
18181,"Hewett finished the season with 1,047 runs for...",1
23247,The Royal Automobile Club campaigned vigorousl...,1
21437,"That year, a song about Robinson by Buddy John...",1


In [11]:
# Links 5001-5978: pass the raw frame through the util helpers in order
# (string_limit, then create_label, then unicode_text).
# The key really is spelled 'datat5001_5978links' (it mirrors the CSV file name).
raw_5001_5978 = d['datat5001_5978links']
df5 = unicode_text(create_label(string_limit(raw_5001_5978)))
# Label distribution, followed by a preview of the last rows.
print(df5['label'].value_counts())
df5.tail()

1    26177
0     2768
Name: label, dtype: int64


Unnamed: 0,Statements,label
10672,"As a result of the outbreak of World War I, Ka...",1
4915,Canada was renamed Almirante Latorre once agai...,1
5436,A cannon is a large-caliber gun classified as ...,0
10786,The wreck at some point came into the ownershi...,1
10418,Her main armored belt was 350 mm (13.8 in) thi...,1


In [19]:
# dataframe with all positive examples
# Take at most the first 4000 rows labelled '1' from each labelled frame and
# stack them into a single frame with a fresh 0..n-1 index. Labels are compared
# against the string '1', exactly as the original filter did.
# NOTE(review): df3 and df4 are both built from d['data4001_5000links'], so the
# same statements can appear twice here - confirm which dataset df4 should use.
positive_parts = [frame[frame['label'] == '1'][:4000] for frame in (df1, df2, df3, df4, df5)]
df_positive = pd.concat(positive_parts, ignore_index=True, sort=False)
print(df_positive.shape)
# Cleaning and preprocessing the text
df_positive['Statements'] = text_cleaning(df_positive['Statements'])
# Overwrite the output file (mode='w') instead of appending (was mode='a'):
# appending wrote every row again each time this cell was re-run, silently
# duplicating the saved data. header=False is the documented spelling of
# "no header row" (was header=None).
df_positive.to_csv('df_positive1.csv', index=False, header=False, sep=' ', mode='w')
df_positive.head()

(20000, 2)


Unnamed: 0,Statements,label
0,The modern search for the Mary Rose was initia...,1
1,"In the early 17th century, Richard Sackville, ...",1
2,"Plateosaurus gracilis, the older species, is f...",1
3,"Although the pharaoh delegated his authority, ...",1
4,"Mammoths continued growing during adulthood, a...",1


In [20]:
# dataframe with all negative examples
# Collect every row labelled '0' from each labelled frame and stack them into a
# single frame with a fresh 0..n-1 index. Labels are compared against the
# string '0', exactly as the original filter did.
# NOTE(review): df3 and df4 are both built from d['data4001_5000links'], so the
# same statements can appear twice here - confirm which dataset df4 should use.
negative_parts = [frame[frame['label'] == '0'] for frame in (df1, df2, df3, df4, df5)]
df_negative = pd.concat(negative_parts, ignore_index=True, sort=False)
print(df_negative.shape)
# Cleaning and preprocessing the text
df_negative['Statements'] = text_cleaning(df_negative['Statements'])
# Overwrite the output file (mode='w') instead of appending (was mode='a'):
# appending wrote every row again each time this cell was re-run, silently
# duplicating the saved data. header=False is the documented spelling of
# "no header row" (was header=None).
df_negative.to_csv('df_negative1.csv', index=False, header=False, sep=' ', mode='w')
df_negative.head()

(19346, 2)


Unnamed: 0,Statements,label
0,It seems that the kakapo  like many of New Ze...,0
1,"Small for a sauropod, Nigersaurus was about 9 ...",0
2,"When the watchman on the wall, the Shieldings'...",0
3,Velázquez uses this light not only to add vol...,0
4,The grey-cowled wood rail or grey-necked wood ...,0


# Tokenize positive-label text into sentences using the spaCy tokenizer

In [None]:
import re
import unicodedata

# Read the positive-statement text file and cut it into fixed-size batches.
# Each batch holds at most BATCH_SIZE characters (the spaCy doc character limit).
BATCH_SIZE = 1000000

# `with` closes the file even if reading fails part-way.
with open('E:/Sharpest_Mind/WikipediaCitation/data/processed/df_positive.prn', 'r',
          encoding='UTF-8', errors='ignore') as file_open:
    scraped_text = file_open.read()

# Strings are immutable: re.sub and unicodedata.normalize return NEW strings.
# The results were previously discarded, so neither step had any effect.
scraped_text = re.sub(r"\n", " ", scraped_text)   # replace newline characters with a space
scraped_text = unicodedata.normalize("NFKD", scraped_text)   # normalise special characters
# Normalising before slicing means the batch lengths below are measured on the
# text that is actually passed on.

total_count = len(scraped_text)
chunks = (total_count - 1) // BATCH_SIZE + 1   # ceiling division; 0 when the text is empty
text_batches = [scraped_text[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] for i in range(chunks)]
    

In [None]:
# Number of batches the text was split into.
len(text_batches)

In [None]:
# sentence tokenization 
import spacy
import pandas as pd

def custom_sentence_boundary(doc):
    """Mark the token after every ']' token as the start of a new sentence.

    Intended to run as a pipeline step before the parser, so that a sentence
    is always split right after a closing citation bracket.

    Args:
        doc: Sequence of tokens (a spaCy ``Doc`` when run inside the pipeline).
            Each token must expose ``.text`` and a writable ``.is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # Stop one token short of the end: a ']' that is the very last token has no
    # successor, and indexing doc[i + 1] for it raised IndexError.
    for i in range(len(doc) - 1):
        if doc[i].text == ']':
            # is_sent_start is the documented attribute for custom boundaries.
            doc[i + 1].is_sent_start = True

    return doc

def _clean_batch_text(text):
    """Remove newlines and leftover HTML markup/entities from one text batch."""
    # Replace newlines with a space: deleting them outright fused the last word
    # of one line with the first word of the next.
    text = re.sub(r"\n", " ", text)
    text = re.sub("(<br/>)", "", text)
    # Non-greedy match, one anchor element at a time. The previous greedy
    # pattern '(<a).*(>).*(</a>)' removed everything between the FIRST '<a'
    # and the LAST '</a>' of the batch, including the ordinary text in between.
    text = re.sub(r'<a\b[^>]*>.*?</a>', '', text)
    # Also consume the terminating ';' of each entity, which used to be left behind.
    text = re.sub('&amp;?', '', text)
    text = re.sub('&gt;?', '', text)
    text = re.sub('&lt;?', '', text)
    text = re.sub('(\xa0)', ' ', text)
    return unicodedata.normalize("NFKD", text)   # normalise special characters


def sentence_tokenization(text_batches, max_batches=100, output_path='sents_list.csv'):
    """Split text batches into sentences with spaCy and save them to a CSV file.

    The pipeline is ``en_core_web_sm`` with ``custom_sentence_boundary`` inserted
    before the parser, so a sentence boundary is forced after every ']' token.

    Args:
        text_batches: Sequence of text batches; each item is converted with
            ``str()`` and cleaned before being passed to spaCy.
        max_batches: Maximum number of leading batches to process. Defaults to
            100, the limit that was previously hard-coded. Pass ``None`` to
            process every batch.
        output_path: CSV file the sentences are written to (single column
            ``text``, no index). Defaults to ``'sents_list.csv'``.

    Returns:
        list of str: the sentence texts, in document order.
    """
    nlp = spacy.load('en_core_web_sm')
    # NOTE(review): passing the function object to add_pipe is the spaCy v2
    # calling convention; v3 expects a registered component name - confirm the
    # installed spaCy version.
    nlp.add_pipe(custom_sentence_boundary, before='parser')

    selected_batches = text_batches if max_batches is None else text_batches[:max_batches]

    sents_list = []
    for batch in selected_batches:
        doc = nlp(_clean_batch_text(str(batch)))
        sents_list.extend(sent.text for sent in doc.sents)

    pd.DataFrame(sents_list, columns=['text']).to_csv(output_path, index=False)

    return sents_list

In [None]:
# Tokenize the text batches into a list of sentence strings
# (sentence_tokenization also writes them to a CSV file as a side effect).
sent_token = sentence_tokenization(text_batches)

In [None]:
# Number of sentences returned by sentence_tokenization.
len(sent_token)

In [None]:
# First ten tokenized sentences.
sent_token[:10]

In [None]:
# Wrap the tokenized sentences in a one-column DataFrame named 'Statements'.
df_new = pd.DataFrame({'Statements':sent_token})


In [None]:
# Save the tokenized sentences to Positive0_100.csv without the index column.
df_new.to_csv('Positive0_100.csv', index=False)

In [None]:
# Label the tokenized sentences: pass df_new through the util helpers in order
# (string_limit, then create_label, then unicode_text) so positive rows can be
# selected afterwards.
# Disabled alternative input, kept from the original cell:
#df_new= pd.read_csv('final_list.csv', encoding ='utf8')
df_new = unicode_text(create_label(string_limit(df_new)))
# Label distribution, followed by a preview of the last rows.
print(df_new['label'].value_counts())
df_new.tail()

In [None]:
# Keep only the rows whose label equals the string '1' (positive examples).
df_pos = df_new[df_new['label']=='1']

In [None]:
# Preview the rows labelled '1'.
df_pos.head()

In [None]:
# Keep only statements with more than 25 whitespace-separated words, then
# renumber the rows from 0.
words_per_statement = df_pos['Statements'].str.split().str.len()
is_long_enough = words_per_statement > 25
df_pos = df_pos[is_long_enough].reset_index(drop=True)
df_pos.head()

In [None]:
# (rows, columns) of the filtered positive frame.
df_pos.shape

In [None]:
# Preview the filtered positive frame.
df_pos.head()

In [None]:
# (rows, columns) of the negative frame built earlier in the notebook.
df_negative.shape

In [None]:
# Preview the negative frame.
df_negative.head()

In [None]:
# Combine the positive and negative frames into the final training data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_final =pd.concat([df_pos, df_negative],  ignore_index=True, sort=False)

In [None]:
# (rows, columns) of the combined training frame.
df_final.shape

In [None]:
# Preview the last rows of the combined training frame.
df_final.tail()

In [None]:
# Save the positive-label frame (df_pos) to Positive_label.csv without the index column.
# NOTE(review): only df_pos is written here; the combined df_final is not saved
# in the cells shown - confirm whether the full training set should be saved too.
df_pos.to_csv('Positive_label.csv', index=False)