# This notebook downloads and preprocesses the HIPE data, then runs named-entity recognition on it.

**Import the required Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import spacy

**Download the dataset**

In [2]:
!wget https://raw.githubusercontent.com/impresso/CLEF-HIPE-2020/master/data/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv

--2020-04-19 18:32:38--  https://raw.githubusercontent.com/impresso/CLEF-HIPE-2020/master/data/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.152.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.152.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489614 (1.4M) [text/plain]
Saving to: ‘HIPE-data-v1.1-dev-en.tsv’


2020-04-19 18:32:41 (627 KB/s) - ‘HIPE-data-v1.1-dev-en.tsv’ saved [1489614/1489614]



**Opening the dataset as a File and storing it in list format**

In [2]:
# Read the raw TSV into a list of lines (one string per line, trailing newline kept).
# The encoding is pinned so the result does not depend on the platform's default encoding.
with open("Data/HIPE-data-v1.1-dev-en.tsv", 'r', encoding='utf-8') as file:
    data = file.readlines()

In [3]:
# Preview only the first lines; the list holds tens of thousands of entries,
# and displaying all of them floods the notebook output.
data[:15]

['TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC\n',
 '# language = en\n',
 '# newspaper = sn82014385\n',
 '# date = 1810-01-17\n',
 '# document_id = sn82014385-1810-01-17-a-i0004\n',
 '# segment_iiif_link = https://chroniclingamerica.loc.gov/iiif/2/deu_kedavra_ver01%2Fdata%2Fsn82014385%2F00271740232%2F1810011701%2F0198.jp2/1674,567,703,80/full/0/default.jpg\n',
 'T\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'HE\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Trustees\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'of\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Wilmington\tB-org\tO\tB-org\tO\tO\tB-loc\tQ13564053\t_\t_\n',
 'College\tI-org\tO\tI-org\tO\tO\tO\tQ13564053\t_\tEndOfLine|NoSpaceAfter\n',
 '# segment_iiif_link = https://chroniclingamerica.loc.gov/iiif/2/deu_kedavra_ver01%2Fdata%2Fsn82014385%2F00271740232%2F1810011701%2F0198.jp2/1755,610,624,43/full/0/default.jpg\n',
 '.\tO\tO\tO\tO\tO\tO\t_\t_\tNoSpaceAfter\n',
 'ake\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'pleasure\tO

In [4]:
# Number of raw lines read from the file (header, '#' metadata lines and token rows alike).
len(data)

33175

**Column names**

In [5]:
# The first line of the file is the tab-separated header; split it into the column names.
headerLine = data[0]
columns = headerLine.strip(" \n").split("\t")
columns

['TOKEN',
 'NE-COARSE-LIT',
 'NE-COARSE-METO',
 'NE-FINE-LIT',
 'NE-FINE-METO',
 'NE-FINE-COMP',
 'NE-NESTED',
 'NEL-LIT',
 'NEL-METO',
 'MISC']

**MetaData: Newspaper ID, date and document id.**

In [6]:
def _metaValue(line):
    """Return the value part of a '# key = value' metadata line, without surrounding whitespace."""
    return line.partition('=')[2].strip()


meta = set()    # unique (newspaper, date, documentId) triples
metaAll = []    # one [newspaper, date, documentId] entry per '# newspaper' line, in file order

# enumerate() supplies the position of the line being visited. Looking the line
# up again with data.index(value) returns the FIRST equal line, so a newspaper id
# that occurs in several documents would always be paired with the date and
# document id of its first occurrence (and every lookup rescans the list).
for position, value in enumerate(data):
    if not value.startswith('# newspaper'):
        continue
    # The date and the document id are read from the two lines that follow.
    newspaper = _metaValue(value)
    date = _metaValue(data[position + 1])
    documentId = _metaValue(data[position + 2])
    meta.add((newspaper, date, documentId))
    metaAll.append([newspaper, date, documentId])

In [7]:
# Unique (newspaper, date, documentId) triples collected above.
meta

{('sn82014385', '1810-01-17', 'sn82014385-1810-01-17-a-i0004'),
 ('sn83020874', '1830-03-03', 'sn83020874-1830-03-03-a-i0002'),
 ('sn83025812', '1930-05-16', 'sn83025812-1930-05-16-a-i0001'),
 ('sn83026170', '1820-01-10', 'sn83026170-1820-01-10-a-i0001'),
 ('sn83030483', '1790-03-03', 'sn83030483-1790-03-03-a-i0001'),
 ('sn84020750', '1840-04-18', 'sn84020750-1840-04-18-a-i0002'),
 ('sn84026272', '1800-07-26', 'sn84026272-1800-07-26-a-i0003'),
 ('sn85042404', '1880-01-20', 'sn85042404-1880-01-20-a-i0003'),
 ('sn86063397', '1900-01-16', 'sn86063397-1900-01-16-a-i0003'),
 ('sn88068010', '1890-02-20', 'sn88068010-1890-02-20-a-i0003'),
 ('sn88085488', '1910-02-25', 'sn88085488-1910-02-25-a-i0006'),
 ('sn89058133', '1920-03-25', 'sn89058133-1920-03-25-a-i0001'),
 ('sn91068761', '1960-04-13', 'sn91068761-1960-04-13-a-i0001'),
 ('sn92063852', '1950-04-15', 'sn92063852-1950-04-15-a-i0004')}

**Removing all the records with # symbol**

In [8]:
# Work on a shallow copy so the original list of lines in `data` is left untouched.
dtemp = data.copy()

In [9]:
# Drop the header row by slicing instead of pop(0): the slice rebuilds dtemp from
# `data` each time, so re-running this cell cannot remove a second line.
dtemp = data[1:]
# Display the header line that was left out.
data[0]

'TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC\n'

In [10]:
# Keep only token rows. A single filtering pass replaces the repeated
# pop-while-iterating loop, which skips neighbours of every removed item and
# therefore needed several passes to catch runs of metadata lines.
# A line counts as metadata when it starts with '#' and has no tab-separated
# fields (the metadata lines shown above contain no tab); a token row that
# merely contains a '#' character is kept.
dtemp = [item for item in dtemp
         if not (item.startswith('#') and '\t' not in item)]
dtemp[:10]

['T\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'HE\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Trustees\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'of\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'Wilmington\tB-org\tO\tB-org\tO\tO\tB-loc\tQ13564053\t_\t_\n',
 'College\tI-org\tO\tI-org\tO\tO\tO\tQ13564053\t_\tEndOfLine|NoSpaceAfter\n',
 '.\tO\tO\tO\tO\tO\tO\t_\t_\tNoSpaceAfter\n',
 'ake\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'pleasure\tO\tO\tO\tO\tO\tO\t_\t_\t_\n',
 'in\tO\tO\tO\tO\tO\tO\t_\t_\t_\n']

**Converting the records into a more readable format and then converting it to a Pandas DataFrame**

In [11]:
# Turn every raw row into its list of tab-separated fields (surrounding spaces/newlines removed first).
dtemp = list(map(lambda row: row.strip(" \n ").split('\t'), dtemp))
# One DataFrame row per entry, labelled with the header names.
dataSet = pd.DataFrame(dtemp, columns=columns)
dataSet.head()

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,T,O,O,O,O,O,O,_,_,_
1,HE,O,O,O,O,O,O,_,_,_
2,Trustees,O,O,O,O,O,O,_,_,_
3,of,O,O,O,O,O,O,_,_,_
4,Wilmington,B-org,O,B-org,O,O,B-loc,Q13564053,_,_


**Let's look at the individual breakpoints that were used to tokenize the articles.**

In [12]:
# Frequency of each distinct value in the MISC column.
dataSet['MISC'].value_counts()

_                             20543
NoSpaceAfter                   4789
EndOfLine|NoSpaceAfter         3711
NoSpaceAfter|Partial-0:2          2
Partial-0:5                       2
NoSpaceAfter|Partial--6:5         1
Partial--7:2                      1
Partial--43:6                     1
Partial--5:3                      1
Partial-0:12                      1
NoSpaceAfter|Partial--7:7         1
NoSpaceAfter|Partial-0:7          1
Partial--9:6                      1
Partial--21:7                     1
Partial--16:5                     1
NoSpaceAfter|Partial--21:3        1
Partial-0:2                       1
Name: MISC, dtype: int64

For the scope of this project, we will simply ignore all the breakpoints except ***_***, ***EndOfLine|NoSpaceAfter*** and ***NoSpaceAfter***.  
To do that, let's create a list to store the breakpoints.  
Also, while inspecting the data, there were a few Greek symbols present that served no purpose. So, we will create another list of special characters that may or may not occur in a document.

In [17]:
# MISC values (besides '_') whose tokens processDoc may keep; rows with any other MISC value are skipped.
breakpointsFiltered = ['NoSpaceAfter','EndOfLine|NoSpaceAfter']
# Non-alphanumeric tokens that are still kept when their MISC value is one of the above.
# NOTE(review): '""' is a two-character string; if a single double-quote token was meant,
# this entry never matches it — confirm.
specialCharacters = ['!', ',', '.', '(', ')', '?', '""', '>', '<', '$', '%', '+']

Now that we have our breakpoints and special characters, let's process the data.

In [42]:
def processDoc(tokens, breakPt, keepProps=None, allowedSymbols=None):
    """Rebuild a text string from parallel sequences of tokens and MISC flags.

    Each token is paired with its flag and handled as follows:

    * flag == '_'            -> the token is always kept;
    * flag in ``keepProps``  -> the token is kept only if it is alphanumeric
                                or listed in ``allowedSymbols``;
    * any other flag         -> the token is dropped.

    Every kept token has leading/trailing spaces, newlines and backslashes
    removed and is followed by exactly one space. This holds for the
    "NoSpaceAfter" flags too: they only decide whether a token is kept, they do
    not suppress the separating space.

    Parameters
    ----------
    tokens : iterable of str
        Token texts.
    breakPt : iterable
        MISC flag of each token, aligned with ``tokens``.
    keepProps : container of str, optional
        Flags (other than '_') whose tokens may be kept. Defaults to the
        module-level ``breakpointsFiltered``.
    allowedSymbols : container of str, optional
        Non-alphanumeric tokens accepted for ``keepProps`` flags. Defaults to
        the module-level ``specialCharacters``.

    Returns
    -------
    str
        The kept tokens, each followed by a single space ('' for empty input).
    """
    if keepProps is None:
        keepProps = breakpointsFiltered
    if allowedSymbols is None:
        allowedSymbols = specialCharacters

    # Collect the pieces in a list and join once at the end; appending to a
    # string inside the loop copies the growing text on every iteration.
    pieces = []
    for token, prop in zip(tokens, breakPt):
        if prop == '_':
            keep = True
        elif prop in keepProps:
            keep = token in allowedSymbols or token.isalnum()
        else:
            keep = False

        if keep:
            pieces.append(token.strip(" \n \\ ") + " ")

    return ''.join(pieces)

In [43]:
# Pull the token texts and their MISC flags out of the DataFrame as two parallel Series.
tokens = dataSet['TOKEN']
breakpoints = dataSet['MISC']

# Rebuild the text from the two columns.
doc = processDoc(tokens, breakpoints)

In [44]:
# Show only the beginning of the text; the full string is far too long to display usefully.
doc[:1000]

'T HE Trustees of Wilmington College . ake pleasure in being able to announce to the pubiic the revival of the Latin school in this institution , under the immediate care of Mr . Bigelow and general superin tendance of the Rev . Dr . Read . 1 he heal thy situation of Wilmington , its character for morality , the goodness of its market and choice of best boarding houses , joined o the evidence of Mr . Bigelow s capacity as a teacher , and the long and justly esta hlLhed reputation of Dr . Read as a success ful preceptor , all concur to recommend this seminary to the attention of Parents and Guardians , anxious to promote the educa tion and improvement of the youth intrust ed to their care , espsciailv when they art further assured that she other departments of the College are provided with tu . ois ful ly adequate to their appointments , and u unexceptionable moral characters By order of the Board , ROBERT HAMILTON , EBENK 7 AK A . SMITH , JOHN RUMSEY . WILLIAM rUTCE , J ut v 22 , 1809 

In [45]:
# Length of the rebuilt text in characters.
len(doc)

137965

## Now that we have a much more reliable document, let's process it.

### 1. First we will tokenize our text using nltk library

In [75]:
# Split the text into tokens with NLTK and keep only the purely alphabetic ones,
# which discards punctuation and numbers.
tokenizedDoc = list(filter(lambda word: word.isalpha(), nltk.word_tokenize(doc)))

In [76]:
# Preview the first tokens instead of dumping the whole list into the output.
tokenizedDoc[:100]

['T',
 'HE',
 'Trustees',
 'of',
 'Wilmington',
 'College',
 'ake',
 'pleasure',
 'in',
 'being',
 'able',
 'to',
 'announce',
 'to',
 'the',
 'pubiic',
 'the',
 'revival',
 'of',
 'the',
 'Latin',
 'school',
 'in',
 'this',
 'institution',
 'under',
 'the',
 'immediate',
 'care',
 'of',
 'Mr',
 'Bigelow',
 'and',
 'general',
 'superin',
 'tendance',
 'of',
 'the',
 'Rev',
 'Dr',
 'Read',
 'he',
 'heal',
 'thy',
 'situation',
 'of',
 'Wilmington',
 'its',
 'character',
 'for',
 'morality',
 'the',
 'goodness',
 'of',
 'its',
 'market',
 'and',
 'choice',
 'of',
 'best',
 'boarding',
 'houses',
 'joined',
 'o',
 'the',
 'evidence',
 'of',
 'Mr',
 'Bigelow',
 's',
 'capacity',
 'as',
 'a',
 'teacher',
 'and',
 'the',
 'long',
 'and',
 'justly',
 'esta',
 'hlLhed',
 'reputation',
 'of',
 'Dr',
 'Read',
 'as',
 'a',
 'success',
 'ful',
 'preceptor',
 'all',
 'concur',
 'to',
 'recommend',
 'this',
 'seminary',
 'to',
 'the',
 'attention',
 'of',
 'Parents',
 'and',
 'Guardians',
 'anxious'

### 2. Now, we will remove all the stop words, e.g., the, he, where, etc.

In [65]:
from nltk.corpus import stopwords

In [98]:
# Load the English stop-word list once and keep it in a set. Written inside the
# comprehension's condition, stopwords.words('english') is called again for
# every token and each membership test scans a list.
englishStopwords = set(stopwords.words('english'))

# Keep the tokens whose lower-cased form is not a stop word (original casing is preserved).
finalTokens = [token for token in tokenizedDoc
               if token.lower() not in englishStopwords]

In [99]:
# Preview the first tokens instead of dumping the whole list into the output.
finalTokens[:100]

['Trustees',
 'Wilmington',
 'College',
 'ake',
 'pleasure',
 'able',
 'announce',
 'pubiic',
 'revival',
 'Latin',
 'school',
 'institution',
 'immediate',
 'care',
 'Mr',
 'Bigelow',
 'general',
 'superin',
 'tendance',
 'Rev',
 'Dr',
 'Read',
 'heal',
 'thy',
 'situation',
 'Wilmington',
 'character',
 'morality',
 'goodness',
 'market',
 'choice',
 'best',
 'boarding',
 'houses',
 'joined',
 'evidence',
 'Mr',
 'Bigelow',
 'capacity',
 'teacher',
 'long',
 'justly',
 'esta',
 'hlLhed',
 'reputation',
 'Dr',
 'Read',
 'success',
 'ful',
 'preceptor',
 'concur',
 'recommend',
 'seminary',
 'attention',
 'Parents',
 'Guardians',
 'anxious',
 'promote',
 'educa',
 'tion',
 'improvement',
 'youth',
 'intrust',
 'ed',
 'care',
 'espsciailv',
 'art',
 'assured',
 'departments',
 'College',
 'provided',
 'tu',
 'ois',
 'ful',
 'ly',
 'adequate',
 'appointments',
 'u',
 'unexceptionable',
 'moral',
 'characters',
 'order',
 'Board',
 'ROBERT',
 'HAMILTON',
 'EBENK',
 'AK',
 'SMITH',
 'JOHN'

### Now that we have our final tokens, Let's find out all the Named Entities using spaCy.
**To use spaCy's built-in models, we have to reconstruct the document from the tokens generated earlier, because the spaCy pipeline is called on text rather than on a list of tokens.**

In [121]:
import spacy

In [137]:
# Load the English model; the tagger/parser/matcher keyword arguments are handed to spacy.load as given.
# NOTE(review): newer spaCy releases select components with `disable=[...]` and expect a
# full model name such as 'en_core_web_sm' — confirm against the installed version.
spacyObject = spacy.load('en', tagger=False, parser=False, matcher=False)

# Build a Doc directly from the stop-word-filtered tokens, using the model's vocabulary.
docReconstructed = spacy.tokens.doc.Doc(spacyObject.vocab, words=finalTokens)

# Pass the Doc through every component of the loaded pipeline, in order.
for _componentName, component in spacyObject.pipeline:
    docReconstructed = component(docReconstructed)

In [150]:
# Classifying our text: the reconstructed document is converted back to a plain string
# and the full spaCy pipeline is called on that string.
# NOTE(review): this re-tokenizes the text and produces a new Doc, so the annotations already
# attached to docReconstructed are not what is used from here on — confirm this is intended.
namedEntities = spacyObject(docReconstructed.text)

In [153]:
# Will hold the distinct entity labels found in the document.
entities = set()

In [154]:
# Print every recognised entity with its label, and record each label in `entities`.
for entity in namedEntities.ents:
    label, surface = entity.label_, entity.text
    print('Label: {}, Text: {}'.format(label, surface))
    entities.add(label)

Label: ORG, Text: Wilmington College
Label: NORP, Text: Latin
Label: ORG, Text: Bigelow
Label: PRODUCT, Text: Rev Dr Read
Label: PERSON, Text: Wilmington
Label: PERSON, Text: Bigelow
Label: NORP, Text: Parents Guardians
Label: PERSON, Text: Mr Weight
Label: PRODUCT, Text: Fell Point
Label: PERSON, Text: Johnson
Label: CARDINAL, Text: Two
Label: NORP, Text: French
Label: DATE, Text: November
Label: ORG, Text: Malta Biistol October
Label: PERSON, Text: Cape Bon
Label: NORP, Text: French
Label: CARDINAL, Text: Two
Label: CARDINAL, Text: six
Label: PERSON, Text: Johnson
Label: PERSON, Text: Bounds
Label: ORG, Text: loGibraltar first corning board
Label: PERSON, Text: John son
Label: TIME, Text: morning
Label: PERSON, Text: Mr Whipey
Label: NORP, Text: Ids
Label: PERSON, Text: Johnson
Label: ORG, Text: Cagliari
Label: DATE, Text: quarantine days
Label: DATE, Text: three days
Label: NORP, Text: British
Label: LANGUAGE, Text: Fiench
Label: PERSON, Text: Sudght
Label: ORG, Text: Port Mahon Min

In [155]:
# Set of identified entity types (the distinct labels collected while printing the entities).
entities

{'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

**A description of each type can be found <a href="https://spacy.io/api/annotation#named-entities" target="_blank">here</a>.**

### Now that we are done with NER, let's find out the most common entity types and entities.

In [144]:
from collections import Counter

In [158]:
# Tally how many entities were found for each label.
labels = list(map(lambda entity: entity.label_, namedEntities.ents))
Counter(labels)

Counter({'ORG': 300,
         'NORP': 92,
         'PRODUCT': 39,
         'PERSON': 385,
         'CARDINAL': 118,
         'DATE': 155,
         'TIME': 29,
         'LANGUAGE': 8,
         'GPE': 190,
         'QUANTITY': 16,
         'EVENT': 2,
         'LOC': 16,
         'FAC': 23,
         'MONEY': 13,
         'WORK_OF_ART': 9,
         'ORDINAL': 14,
         'LAW': 1})

In [164]:
# Tally how often each entity text occurs.
text = list(map(lambda entity: entity.text, namedEntities.ents))
Counter(text)

Counter({'Wilmington College': 1,
         'Latin': 2,
         'Bigelow': 2,
         'Rev Dr Read': 1,
         'Wilmington': 1,
         'Parents Guardians': 1,
         'Mr Weight': 1,
         'Fell Point': 1,
         'Johnson': 3,
         'Two': 4,
         'French': 7,
         'November': 5,
         'Malta Biistol October': 1,
         'Cape Bon': 1,
         'six': 3,
         'Bounds': 1,
         'loGibraltar first corning board': 1,
         'John son': 1,
         'morning': 3,
         'Mr Whipey': 1,
         'Ids': 1,
         'Cagliari': 1,
         'quarantine days': 1,
         'three days': 1,
         'British': 6,
         'Fiench': 1,
         'Sudght': 1,
         'Port Mahon Minorca': 1,
         'Gibraltar': 1,
         'Sic': 1,
         'Burr Letters Paris': 1,
         'last year': 3,
         'Aurora': 1,
         'U States': 1,
         'Burr': 2,
         'Issued L': 1,
         'United States': 9,
         'Philadelphia': 6,
         'Paris': 3,
    

**Most Common**

In [166]:
# The 20 entity texts that occur most often, as (text, count) pairs in descending order of count.
Counter(text).most_common(20)

[('one', 40),
 ('two', 16),
 ('Tom', 13),
 ('United States', 9),
 ('un', 8),
 ('three', 8),
 ('first', 8),
 ('French', 7),
 ('Baltimore', 7),
 ('British', 6),
 ('Philadelphia', 6),
 ('Republican', 6),
 ('October', 6),
 ('evening', 6),
 ('November', 5),
 ('August', 5),
 ('France', 5),
 ('years', 5),
 ('Missouri', 5),
 ('House', 5)]