In [1]:
#NLP is a subfield of artificial intelligence, and it’s all about allowing computers to comprehend human language. NLP involves analyzing, quantifying, understanding, and deriving meaning from natural languages.

In [2]:
## Load a book
enc='utf-8'
with open("Lincoln_Second_Inaugural.txt","r",encoding=enc) as f:
    book=f.read()
    #print(book)

In [3]:
## Import spacy
import spacy

## Pipelines are trained on large datasets of labeled example texts.
## Create (nlp) pipeline (tok2vec, tagger, parser, attribute_ruler, lemmatizer, ner)
## by loading small / medium/ large/ transformer pipeline package
## The package provides the binary weights that enable spaCy to make predictions. 
## includes the vocabulary, meta information about the pipeline and the configuration file used to train it.
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

## returns a language object containing all components and data needed to process text
nlp

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [75]:
## check pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x236f41ed9d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x236f41eec30>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x236ed34cd60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x236ed61df90>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x236ed5b0790>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x236ed34cf90>)]

In [78]:
## process text with nlp object to create a doc object
doc=nlp(book)

## sentence detection
for sentence in doc.sents:
    print(sentence)

Fellow countrymen: at this second appearing to take the oath of the presidential office there is less occasion for an extended address than there was at the first.
Then a statement somewhat in detail of a course to be pursued seemed fitting and proper.


With malice toward none with charity for all with firmness in the right as God gives us to see the right let us strive on to finish the work we are in to bind up the nation's wounds, to care for him who shall have borne the battle and for his widow and his orphan ~ to do all which may achieve and cherish a just and lasting peace among ourselves and with all nations.


In [2]:
## tokenization breaks a text down into its basic units—or tokens—which are represented in spaCy as Token objects.
doc=nlp(book)
type(doc)

for token in doc:
    print(token.text, token.idx)                       #.idx is starting posiiton in doc

NameError: name 'book' is not defined

In [220]:
## Span object is a slice of the document consisting of one or more tokens. 
span=doc[0:7]
span

Fellow countrymen: at this second appearing

In [222]:
## token attributes
print(f"{"Token":25}",f"{"Is Alphanumeric?":25}",f"{"Is Punctuation?":25}",f"{"Like Number?":25}")

for token in doc:
    print(f"{str(token):25}",f"{str(token.i):25}",f"{str(token.is_alpha):25}",f"{str(token.is_punct):25}",f"{str(token.like_num):25}")

Token                     Is Alphanumeric?          Is Punctuation?           Like Number?             
Fellow                    0                         True                      False                     False                    
countrymen                1                         True                      False                     False                    
:                         2                         False                     True                      False                    
at                        3                         True                      False                     False                    
this                      4                         True                      False                     False                    
second                    5                         True                      False                     True                     
appearing                 6                         True                      False                     False       

In [224]:
##Stop words
from spacy.lang.en.stop_words import STOP_WORDS

print([token for token in doc if not (token.is_stop or token.is_punct)])

[Fellow, countrymen, second, appearing, oath, presidential, office, occasion, extended, address, statement, somewhat, detail, course, pursued, fitting, proper, 

, malice, charity, firmness, right, God, gives, right, let, strive, finish, work, bind, nation, wounds, care, shall, borne, battle, widow, orphan, ~, achieve, cherish, lasting, peace, nations]


In [226]:
##Lemmatisation
for token in doc:
    if str(token)!=str(token.lemma_):
        print(token,"|",token.lemma_)

Fellow | fellow
countrymen | countryman
appearing | appear
is | be
was | be
Then | then
pursued | pursue
seemed | seem
With | with
gives | give
us | we
us | we
are | be
wounds | wound
him | he
borne | bear
lasting | last
nations | nation


In [228]:
## predicting parts of speech (POS) tags from trained pipeline package
for token in doc:
    print(token,"|",token.tag_,"|",spacy.explain(token.tag_),"|",token.pos_,"|",spacy.explain(token.pos_))

Fellow | JJ | adjective (English), other noun-modifier (Chinese) | ADJ | adjective
countrymen | NNS | noun, plural | NOUN | noun
: | : | punctuation mark, colon or ellipsis | PUNCT | punctuation
at | IN | conjunction, subordinating or preposition | ADP | adposition
this | DT | determiner | DET | determiner
second | JJ | adjective (English), other noun-modifier (Chinese) | ADJ | adjective
appearing | VBG | verb, gerund or present participle | VERB | verb
to | TO | infinitival "to" | PART | particle
take | VB | verb, base form | VERB | verb
the | DT | determiner | DET | determiner
oath | NN | noun, singular or mass | NOUN | noun
of | IN | conjunction, subordinating or preposition | ADP | adposition
the | DT | determiner | DET | determiner
presidential | JJ | adjective (English), other noun-modifier (Chinese) | ADJ | adjective
office | NN | noun, singular or mass | NOUN | noun
there | EX | existential there | PRON | pronoun
is | VBZ | verb, 3rd person singular present | VERB | verb
less |

In [230]:
## predicting SYNTECTIC DEPENDENCIES from trained pipeline package
##The .dep_ attribute returns the predicted dependency label.
##The .head attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.

for token in doc:
    print(token.text,"|",token.pos_,"|",token.dep_,"|",spacy.explain(token.dep_),"|",token.head.text)

Fellow | ADJ | amod | adjectival modifier | countrymen
countrymen | NOUN | dep | unclassified dependent | is
: | PUNCT | punct | punctuation | is
at | ADP | prep | prepositional modifier | is
this | DET | det | determiner | second
second | ADJ | pobj | object of preposition | at
appearing | VERB | pcomp | complement of preposition | at
to | PART | aux | auxiliary | take
take | VERB | xcomp | open clausal complement | appearing
the | DET | det | determiner | oath
oath | NOUN | dobj | direct object | take
of | ADP | prep | prepositional modifier | oath
the | DET | det | determiner | office
presidential | ADJ | amod | adjectival modifier | office
office | NOUN | pobj | object of preposition | of
there | PRON | expl | expletive | is
is | VERB | ROOT | root | is
less | ADJ | amod | adjectival modifier | occasion
occasion | NOUN | attr | attribute | is
for | ADP | prep | prepositional modifier | occasion
an | DET | det | determiner | address
extended | ADJ | amod | adjectival modifier | addr

In [232]:
#from spacy import displacy

#doc1=nlp("Gus is learning piano")

#displacy.serve(doc1,style='dep')

In [234]:
## Predicting NAMED ENTITIES
for ent in doc.ents:
    print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

second | ORDINAL | "first", "second", etc.
first | ORDINAL | "first", "second", etc.


In [236]:
## Displacy NAMES ENTITIES
displacy.render(doc, style="ent")

In [80]:
## RULE BASED MATCHING
## Match on Doc objects, Match on tokens and token attributes
## Use a model's predictions (Noun vs Verb)

import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Process some text
#doc1 = nlp(book)

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
# Match patterns are lists of dictionaries
# Match using lexical attributes (IS_DIGIT, IS_PUNCT,LOWER)
# Match using token attributes (LEMMA,POS)
# Match using operators ("OP") or quantifiers (? (0,1time), + (1 or more),* (0 or more), ! (negation 0 times))
pattern = [{"LOWER": "god"}, {"LEMMA": "give"},{"POS":"PRON"}]
matcher.add("T", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

God gives us


In [100]:
## Shared Vocab
## spaCy stores all shared data in a vocabulary, the Vocab. This includes words, but also the labels schemes for tags and entities.
## To save memory, all strings are encoded to hash IDs. 

nlp.vocab.strings.add("coffee")

#look up hash value
coffee_hash = nlp.vocab.strings["coffee"]
# The doc also exposes the vocab and strings
doc_hash = doc.vocab.strings["coffee"]                 

#look up string value
coffee_string = nlp.vocab.strings[coffee_hash]
string = nlp.vocab.strings[3197928453018144401]           # If a word is not in the vocabulary, there's no way to get its string.

print(coffee_hash)
print(doc_hash)
print(coffee_string)
print(string)

3197928453018144401
3197928453018144401
coffee
coffee


In [102]:
## Lexeme
## Lexemes are context-independent entries in the vocabulary.
## Lexemes don't have context-dependant entries like part-of-speech tags, dependencies or entity labels.

doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [116]:
## Create Doc, Span, entities

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print("doc>> ",doc)

# Create a span manually
span = Span(doc, 0, 2)

# Create a span with a label
span = Span(doc, 0, 2, label="GREETING")
print("span>> ",span)

# Add span to the doc.ents
doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])

doc>>  Hello world!
span>>  Hello world
[('Hello world', 'GREETING')]


In [134]:
## Best Practice on Dat Structures

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city than Sydney")

# Get all tokens and part-of-speech tags   >> use native token attributes to keep things consistent, convert the results to strings as late as possible
"""
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]
"""

# Instead of below
"""
for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)
"""

# try this
# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        if token.i + 1 < len(doc):
            # Check if the next token is a verb
            if doc[token.i + 1].pos_ == "VERB":
                print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


In [142]:
## Word vectors and semantic similarity
## use spaCy to predict how similar documents, spans or tokens are to each other

## The Doc, Token and Span objects have a .similarity method that takes another object
## returns a floating point number between 0 and 1, indicating how similar they are.
## use medium or large spacy pipeline (not small)

# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print("doc similarity:",doc1.similarity(doc2))

# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print("token similarity:",token1.similarity(token2))

doc similarity: 0.8698332283318978
token similarity: 0.685019850730896
0.1821369691957915
0.47190033157126826


In [144]:
## You can also use the similarity methods to compare different types of objects. For example, a document and a token.

# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]
print(doc.similarity(token))

# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")
print(span.similarity(doc))

0.1821369691957915
0.47190033157126826


In [148]:
## Similarity is determined using word vectors, multi-dimensional representations of meanings of words.
## Algorithms like Word2Vec (that are used to convert raw text to vectors) can be added to Spacy's pipeline
## By default, the similarity returned by spaCy is the cosine similarity between two vectors – but this can be adjusted if necessary.
## Doc and Span vectors default to average of token vectors

# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")
doc = nlp("I have a banana")

# Access the vector via the token.vector attribute
print(doc[3].vector)

[ 0.20778  -2.4151    0.36605   2.0139   -0.23752  -3.1952   -0.2952
  1.2272   -3.4129   -0.54969   0.32634  -1.0813    0.55626   1.5195
  0.97797  -3.1816   -0.37207  -0.86093   2.1509   -4.0845    0.035405
  3.5702   -0.79413  -1.7025   -1.6371   -3.198    -1.9387    0.91166
  0.85409   1.8039   -1.103    -2.5274    1.6365   -0.82082   1.0278
 -1.705     1.5511   -0.95633  -1.4702   -1.865    -0.19324  -0.49123
  2.2361    2.2119    3.6654    1.7943   -0.20601   1.5483   -1.3964
 -0.50819   2.1288   -2.332     1.3539   -2.1917    1.8923    0.28472
  0.54285   1.2309    0.26027   1.9542    1.1739   -0.40348   3.2028
  0.75381  -2.7179   -1.3587   -1.1965   -2.0923    2.2855   -0.3058
 -0.63174   0.70083   0.16899   1.2325    0.97006  -0.23356  -2.094
 -1.737     3.6075   -1.511    -0.9135    0.53878   0.49268   0.44751
  0.6315    1.4963    4.1725    2.1961   -1.2409    0.4214    2.9678
  1.841     3.0133   -4.4652    0.96521  -0.29787   4.3386   -1.2527
 -1.7734   -3.5637   -0.20035

In [None]:
## similarity depends on the context and what application needs to do
## Useful for many applications: recommendation systems, flagging duplicates etc.

doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

## high similarity score because both texts express sentiment about cats
## in a different application context, you might want to consider the phrases as very dissimilar, because they talk about opposite sentiments.