# Configuration

In [3]:
# Import spaCy
import spacy

from pathlib import Path
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

In [4]:
# This contains the processing pipeline
# As well as language-specific rules for tokenization etc.
nlp = spacy.load('en_core_web_lg')

# Import text files

In [5]:
moby_dick = Path('../Text Files/moby_dick.txt').read_text(encoding='utf8')
moby_dick = moby_dick.replace('\n', '')
doc_1 = nlp(moby_dick)

ai_forecast_1 = Path('../Text Files/ai_forecast1.txt').read_text(encoding='utf8')
ai_forecast_1 = ai_forecast_1.replace('\n', '')
doc_2 = nlp(ai_forecast_1)

ai_forecast_2 = Path('../Text Files/ai_forecast2.txt').read_text(encoding='utf8')
ai_forecast_2 = ai_forecast_2.replace('\n', '')
doc_3 = nlp(ai_forecast_2)

# Compare the three documents

In [6]:
print('Similarity between {} and {} is: {}\n'.format('moby_dick', 'ai_forecast_1', doc_1.similarity(doc_2)))
print('Similarity between {} and {} is: {}\n'.format('moby_dick', 'ai_forecast_2', doc_1.similarity(doc_3)))
print('Similarity between {} and {} is: {}'.format('ai_forecast_1', 'ai_forecast_2', doc_2.similarity(doc_3)))

Similarity between moby_dick and ai_forecast_1 is: 0.8268866081548163

Similarity between moby_dick and ai_forecast_2 is: 0.8658625494727867

Similarity between ai_forecast_1 and ai_forecast_2 is: 0.9873224403687386


# Compare the first one hundred tokens of each document with each other

In [7]:
doc_1_tokens = doc_1[0:99]
doc_2_tokens = doc_2[0:99]
doc_3_tokens = doc_3[0:99]

print('Similarity between the first one hundred tokens of {} and {} is: {}\n'.format('moby_dick', 'ai_forecast_1', doc_1_tokens.similarity(doc_2_tokens)))
print('Similarity between the first one hundred tokens of {} and {} is: {}\n'.format('moby_dick', 'ai_forecast_2', doc_1_tokens.similarity(doc_3_tokens)))
print('Similarity between the first one hundred tokens of {} and {} is: {}\n'.format('ai_forecast_1', 'ai_forecast_2', doc_2_tokens.similarity(doc_3_tokens)))

Similarity between the first one hundred tokens of moby_dick and ai_forecast_1 is: 0.7518677711486816

Similarity between the first one hundred tokens of moby_dick and ai_forecast_2 is: 0.816913902759552

Similarity between the first one hundred tokens of ai_forecast_1 and ai_forecast_2 is: 0.9578762650489807



# Start from a blank nlp model and add your name to the entities.

In [8]:
nlp_2 = spacy.blank("en")

words = ['My', 'name', 'is', 'Rodžers', 'Ušackis', '.']
spaces = [True, True, True, True, False, False]

doc_4 = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc_4.text)

# 3, 5 is the start and end token of the span, so in this case it takes the 3rd and 4th token - Rodžers Ušackis
span = Span(doc_4, 3, 5, label='PERSON')
print(span.text, span.label_, end='\n\n')

# Add your name to the entities
doc_4.ents = [span]

for ent in doc_4.ents:
    # Print the entity text , it's label and explanation
    print('Named Entity - {}\nEntity Label - {}\nEntity Label Description - {}'.format(ent.text, ent.label_, spacy.explain(ent.label_)))

My name is Rodžers Ušackis.
Rodžers Ušackis PERSON

Named Entity - Rodžers Ušackis
Entity Label - PERSON
Entity Label Description - People, including fictional
