# This notebook is used to download and preprocess the data.

**Import the required Libraries**

In [1]:
import pandas as pd
import numpy as np
import re

**Download the dataset**

In [2]:
!wget https://raw.githubusercontent.com/impresso/CLEF-HIPE-2020/master/data/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv

--2020-04-19 18:32:38--  https://raw.githubusercontent.com/impresso/CLEF-HIPE-2020/master/data/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.152.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.152.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489614 (1.4M) [text/plain]
Saving to: ‘HIPE-data-v1.1-dev-en.tsv’


2020-04-19 18:32:41 (627 KB/s) - ‘HIPE-data-v1.1-dev-en.tsv’ saved [1489614/1489614]



**Opening the dataset as a File and storing it in list format**

In [2]:
# Read the raw TSV into a list of lines; readlines() keeps the trailing "\n"
# of every line, which the slicing/stripping in later cells relies on.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default codec.
with open("Data/HIPE-data-v1.1-dev-en.tsv", 'r', encoding='utf-8') as file:
    data = file.readlines()

In [3]:
# Show only the first lines: the header, the "# key = value" metadata comments
# and a few token rows. Displaying the whole list would flood the output.
data[:15]

['TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC\n',
 '# language = en\n',
 '# newspaper = sn82014385\n',
 '# date = 1810-01-17\n',
 '# document_id = sn82014385-1810-01-17-a-i0004\n',
 '# segment_iiif_link = https://chroniclingamerica.loc.gov/iiif/2/deu_kedavra_ver01%2Fdata%2Fsn82014385%2F00271740232%2F1810011701%2F0198.jp2/1674,567,703,80/full/0/default.jpg\n',
 'T\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'HE\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Trustees\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'of\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Wilmington\tB-org\tO\tB-org\tO\tO\tB-loc\tQ13564053\t_\t_\n',
 'College\tI-org\tO\tI-org\tO\tO\tO\tQ13564053\t_\tEndOfLine|NoSpaceAfter\n',
 '# segment_iiif_link = https://chroniclingamerica.loc.gov/iiif/2/deu_kedavra_ver01%2Fdata%2Fsn82014385%2F00271740232%2F1810011701%2F0198.jp2/1755,610,624,43/full/0/default.jpg\n',
 '.\tO\tO\tO\tO\tO\tO\t_\t_\tNoSpaceAfter\n',
 'ake\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'pleasure\tO

In [4]:
# Number of raw lines read from the file (header, metadata comments and token rows).
len(data)

33175

**Column names**

In [5]:
# The first line of the file is the tab-separated header; trim surrounding
# spaces/newlines and split it into the individual column names.
headerLine = data[0]
columns = headerLine.strip(" \n").split("\t")
columns

['TOKEN',
 'NE-COARSE-LIT',
 'NE-COARSE-METO',
 'NE-FINE-LIT',
 'NE-FINE-METO',
 'NE-FINE-COMP',
 'NE-NESTED',
 'NEL-LIT',
 'NEL-METO',
 'MISC']

**MetaData: Newspaper ID, date and document id.**

In [6]:
# Collect (newspaper, date, document id) for every document in the file.
# A "# newspaper = ..." line is assumed to be followed directly by the
# "# date = ..." and "# document_id = ..." lines of the same document.
newspaperPrefix = '# newspaper = '
datePrefix = '# date = '
documentIdPrefix = '# document_id = '

meta = set()    # unique (newspaper, date, documentId) triples
metaAll = []    # one [newspaper, date, documentId] entry per document, in file order
for position, value in enumerate(data):
    if value.startswith('# newspaper'):
        # The position comes from enumerate rather than data.index(value):
        # index() returns the FIRST identical line, so a newspaper that occurs
        # in several documents would otherwise be paired with the date and id
        # of its first document every time (and each lookup rescans the list).
        newspaper = value[len(newspaperPrefix):].rstrip('\n')
        date = data[position + 1][len(datePrefix):].rstrip('\n')
        documentId = data[position + 2][len(documentIdPrefix):].rstrip('\n')
        meta.add((newspaper, date, documentId))
        metaAll.append([newspaper, date, documentId])

In [7]:
# Unique (newspaper, date, document id) triples collected above.
meta

{('sn82014385', '1810-01-17', 'sn82014385-1810-01-17-a-i0004'),
 ('sn83020874', '1830-03-03', 'sn83020874-1830-03-03-a-i0002'),
 ('sn83025812', '1930-05-16', 'sn83025812-1930-05-16-a-i0001'),
 ('sn83026170', '1820-01-10', 'sn83026170-1820-01-10-a-i0001'),
 ('sn83030483', '1790-03-03', 'sn83030483-1790-03-03-a-i0001'),
 ('sn84020750', '1840-04-18', 'sn84020750-1840-04-18-a-i0002'),
 ('sn84026272', '1800-07-26', 'sn84026272-1800-07-26-a-i0003'),
 ('sn85042404', '1880-01-20', 'sn85042404-1880-01-20-a-i0003'),
 ('sn86063397', '1900-01-16', 'sn86063397-1900-01-16-a-i0003'),
 ('sn88068010', '1890-02-20', 'sn88068010-1890-02-20-a-i0003'),
 ('sn88085488', '1910-02-25', 'sn88085488-1910-02-25-a-i0006'),
 ('sn89058133', '1920-03-25', 'sn89058133-1920-03-25-a-i0001'),
 ('sn91068761', '1960-04-13', 'sn91068761-1960-04-13-a-i0001'),
 ('sn92063852', '1950-04-15', 'sn92063852-1950-04-15-a-i0004')}

**Removing all the records with # symbol**

In [8]:
# Work on a shallow copy so the raw lines in `data` stay untouched.
dtemp = list(data)

In [9]:
# Drop the header line by rebuilding the working list from `data` instead of
# popping in place: pop(0) removes one more row every time the cell is re-run,
# silently discarding real token rows.
dtemp = data[1:]
# Display the header line that was left out.
data[0]

'TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC\n'

In [10]:
# Drop every line that contains '#', i.e. the "# key = value" metadata lines.
# A single filtering pass replaces the repeated remove-while-iterating loop:
# popping from a list while looping over it skips the element that follows each
# removal, which is why several passes were needed before, and the index()/pop()
# pair rescanned the list for every removed line.
dtemp = [item for item in dtemp if '#' not in item]
dtemp[:10]

['T\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'HE\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Trustees\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'of\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Wilmington\tB-org\tO\tB-org\tO\tO\tB-loc\tQ13564053\t_\t_\n',
 'College\tI-org\tO\tI-org\tO\tO\tO\tQ13564053\t_\tEndOfLine|NoSpaceAfter\n',
 '.\tO\tO\tO\tO\tO\tO\t_\t_\tNoSpaceAfter\n',
 'ake\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'pleasure\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'in\tO\tO\tO\tO\tO\tO\t_\t_\t_\n']

**Converting the records into a more readable format and then converting it to a Pandas DataFrame**

In [11]:
# Turn every raw record into its list of tab-separated fields, then load the
# rows into a DataFrame labelled with the header names extracted earlier.
dtemp = list(map(lambda record: record.strip(" \n ").split('\t'), dtemp))
dataSet = pd.DataFrame(dtemp, columns=columns)
dataSet.head()

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,T,O,O,O,O,O,O,_,_,_
1,HE,O,O,O,O,O,O,_,_,_
2,Trustees,O,O,O,O,O,O,_,_,_
3,of,O,O,O,O,O,O,_,_,_
4,Wilmington,B-org,O,B-org,O,O,B-loc,Q13564053,_,_


**Let's look at the individual breakpoints that were used to tokenize the articles.**

In [12]:
# Frequency of each distinct value in the MISC column.
dataSet["MISC"].value_counts()

_                             20543
NoSpaceAfter                   4789
EndOfLine|NoSpaceAfter         3711
Partial-0:5                       2
NoSpaceAfter|Partial-0:2          2
Partial--9:6                      1
NoSpaceAfter|Partial--21:3        1
Partial--7:2                      1
Partial--21:7                     1
NoSpaceAfter|Partial--7:7         1
NoSpaceAfter|Partial--6:5         1
Partial-0:12                      1
Partial--5:3                      1
Partial-0:2                       1
NoSpaceAfter|Partial-0:7          1
Partial--43:6                     1
Partial--16:5                     1
Name: MISC, dtype: int64

For the scope of this project, we will simply ignore all the breakpoints except ***_***, ***EndOfLine|NoSpaceAfter*** and ***NoSpaceAfter***.  
To do that, let's create a list to store the breakpoints.  
Also, while inspecting the data, we found a few Greek symbols that served no purpose. So, we will create another list of the special characters that we want to keep, whether or not they occur in a given document.

In [13]:
# MISC values for which a token is appended without a trailing space.
breakpointsFiltered = [
    'NoSpaceAfter',
    'EndOfLine|NoSpaceAfter',
]

# Non-alphanumeric tokens that are still kept when they carry one of the
# breakpoints above.
# NOTE(review): '""' is a two-character string (two double quotes); confirm
# that this, and not a single '"', is how the token appears in the data.
specialCharacters = [
    '!', ',', '.', '(', ')', '?',
    '""',
    '>', '<', '$', '%', '+',
]

Now that we have our breakpoints and special characters, let's process the data.

In [14]:
def processDoc(tokens, breakPt, noSpaceBreakpoints=None, keptCharacters=None):
    """Rebuild running text from tokens and their MISC breakpoint values.

    Parameters
    ----------
    tokens : iterable of str
        The tokens, in document order.
    breakPt : iterable of str
        The MISC value of each token, aligned with ``tokens``.
    noSpaceBreakpoints : container of str, optional
        MISC values for which the token is appended without a trailing space.
        Defaults to the module-level ``breakpointsFiltered``.
    keptCharacters : container of str, optional
        Non-alphanumeric tokens that are still kept when their MISC value is in
        ``noSpaceBreakpoints``. Defaults to the module-level ``specialCharacters``.

    Returns
    -------
    str
        The concatenated text. A token whose MISC value is ``'_'`` is followed
        by a single space. A token whose MISC value is in ``noSpaceBreakpoints``
        is appended without a space, but only if it is alphanumeric or listed in
        ``keptCharacters``. Tokens with any other MISC value are dropped.
    """
    if noSpaceBreakpoints is None:
        noSpaceBreakpoints = breakpointsFiltered
    if keptCharacters is None:
        keptCharacters = specialCharacters

    # Characters trimmed from both ends of a token: space, newline, backslash.
    stripChars = " \n \\ "

    # Pieces are gathered in a list and joined once at the end, which avoids
    # re-copying the growing document string on every token.
    pieces = []
    for token, prop in zip(tokens, breakPt):
        if prop in noSpaceBreakpoints:
            # No space after this token; anything that is neither alphanumeric
            # nor an explicitly kept character is discarded.
            if token in keptCharacters or token.isalnum():
                pieces.append(token.strip(stripChars))
        elif prop == '_':
            # Ordinary token: keep it and separate it from the next one.
            pieces.append(token.strip(stripChars) + " ")

    return ''.join(pieces)

In [15]:
# Pull out the two columns that drive the reconstruction: the token text and
# the MISC breakpoint value that belongs to it.
tokens, breakpoints = dataSet["TOKEN"], dataSet["MISC"]

# Rebuild the running text of the articles from those two columns.
doc = processDoc(tokens, breakpoints)

In [16]:
# Preview the beginning of the rebuilt text; displaying the whole string would
# flood the output.
doc[:500]



In [17]:
# Length of the rebuilt text in characters.
len(doc)

130738

### Now that our document is ready, let's find all the named entities using spaCy.

In [23]:
import spacy

In [24]:
# Load the small English pipeline by its package name: the 'en' shortcut only
# exists as a link in spaCy v2 and is not available in v3, while the package
# name works in both. Components that are not needed for entity recognition are
# switched off through the documented `disable` argument.
spacyObject = spacy.load('en_core_web_sm', disable=['tagger', 'parser'])

# Run the pipeline over the rebuilt text; the entities are read from docNE.ents.
docNE = spacyObject(doc)

In [25]:
# Print one line per recognised entity: its label followed by its text.
entityLine = 'Label: {}, Text: {}'
for entity in docNE.ents:
    print(entityLine.format(entity.label_, entity.text))

Label: GPE, Text: Wilmington College.ake
Label: NORP, Text: Latin
Label: PERSON, Text: Bigelow
Label: PERSON, Text: Read
Label: CARDINAL, Text: 1
Label: GPE, Text: Wilmington
Label: PERSON, Text: Bigelows
Label: PERSON, Text: Read
Label: NORP, Text: Parents andGuardians
Label: ORG, Text: Board
Label: PERSON, Text: ROBERT HAMILTON
Label: ORG, Text: JOHN RUMSEY.WILLIAM
Label: DATE, Text: 22, 1809 .Bold
Label: PERSON, Text: ben
Label: PERSON, Text: Weightman
Label: ORG, Text: Fells Point
Label: PERSON, Text: Johnson
Label: CARDINAL, Text: Two
Label: NORP, Text: French
Label: ORG, Text: aiGibraltar
Label: DATE, Text: November 23
Label: DATE, Text: 1 80s
Label: GPE, Text: thaton
Label: GPE, Text: Malta
Label: GPE, Text: Biistol
Label: DATE, Text: October
Label: PERSON, Text: Cape Bon
Label: NORP, Text: French
Label: CARDINAL, Text: six
Label: TIME, Text: night
Label: PERSON, Text: Johnson
Label: TIME, Text: the night
Label: PERSON, Text: J.
Label: PERSON, Text: Johnson
Label: CARDINAL, Text