In [4]:
## Load text
enc='utf-8'
with open("African_helmeted_turtle.txt","r",encoding=enc) as f:
    book=f.read()
    print(book)

The African helmeted turtle #$%!!! (Pelomedusa subrufa) is a species of side-necked terrapin in the family Pelomedusidae. The species naturally occurs in fresh and stagnant water bodies throughout much of sub-Saharan Africa, and in southern Yemen. It is omnivorous, with its diet consisting mainly of aquatic invertebrates, small fish, and vegetation. It is typically a small turtle, with most individuals being less than 20 centimetres (7.9 inches) in straight carapace length. The female lays two to ten eggs on average, normally during late spring and early summer. The eggs are placed in a flask-shaped nest about 4 to 7 inches (10 to 18 centimetres) deep and hatch in 75 to 90 days. This African helmeted turtle was photographed in Phinda Private Game Reserve, South Africa.


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

In [6]:
## check pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x20d5b11d970>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x20d5b11dc10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x20d5b158ba0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x20d5b388d90>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x20d5b393950>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x20d5b158cf0>)]

In [24]:
## process text with nlp object to create a doc object
##doc=nlp(book)
#type(doc)

In [25]:
## Span object is a slice of the document consisting of one or more tokens. 
doc[0:7]

The African helmeted turtle #$%

## Text Preprocessing

### Clean text, punctuation, lower case

### Tokenize Text

In [27]:
## sentence / word tokenization
def spacy_tokenizer(text):
    #token=[sentence for sentence in text.sents]
    tokens=nlp(text)
    return tokens

In [28]:
tokens=spacy_tokenizer(book)
print("TOKEN:",tokens)

TOKEN: The African helmeted turtle #$%!!! (Pelomedusa subrufa) is a species of side-necked terrapin in the family Pelomedusidae. The species naturally occurs in fresh and stagnant water bodies throughout much of sub-Saharan Africa, and in southern Yemen. It is omnivorous, with its diet consisting mainly of aquatic invertebrates, small fish, and vegetation. It is typically a small turtle, with most individuals being less than 20 centimetres (7.9 inches) in straight carapace length. The female lays two to ten eggs on average, normally during late spring and early summer. The eggs are placed in a flask-shaped nest about 4 to 7 inches (10 to 18 centimetres) deep and hatch in 75 to 90 days. This African helmeted turtle was photographed in Phinda Private Game Reserve, South Africa.


### Lemmatization, stop words, punctuation

In [208]:
def spacy_text_lemmatize(text):
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words=spacy.lang.en.stop_words.STOP_WORDS
    p=[(token.lemma_,token.pos_) for token in text]
    t1=[t for t in p if t[1] not in ["SPACE", "PUNCT", "SYM","X"] and t[0] not in stop_words]
    return t1

In [209]:
lemmatized_pos_list=spacy_text_lemmatize(tokens)
print("LEMMATISED LIST:",lemmatized_pos_list)

LEMMATISED LIST: [('african', 'ADJ'), ('helmeted', 'ADJ'), ('turtle', 'NOUN'), ('%', 'NOUN'), ('Pelomedusa', 'PROPN'), ('subrufa', 'NOUN'), ('species', 'NOUN'), ('neck', 'VERB'), ('terrapin', 'NOUN'), ('family', 'NOUN'), ('Pelomedusidae', 'PROPN'), ('specie', 'NOUN'), ('naturally', 'ADV'), ('occur', 'VERB'), ('fresh', 'ADJ'), ('stagnant', 'ADJ'), ('water', 'NOUN'), ('body', 'NOUN'), ('sub', 'ADJ'), ('-', 'ADJ'), ('saharan', 'ADJ'), ('Africa', 'PROPN'), ('southern', 'ADJ'), ('Yemen', 'PROPN'), ('omnivorous', 'ADJ'), ('diet', 'NOUN'), ('consist', 'VERB'), ('mainly', 'ADV'), ('aquatic', 'ADJ'), ('invertebrate', 'NOUN'), ('small', 'ADJ'), ('fish', 'NOUN'), ('vegetation', 'NOUN'), ('typically', 'ADV'), ('small', 'ADJ'), ('turtle', 'NOUN'), ('individual', 'NOUN'), ('20', 'NUM'), ('centimetre', 'NOUN'), ('7.9', 'NUM'), ('inch', 'NOUN'), ('straight', 'ADJ'), ('carapace', 'NOUN'), ('length', 'NOUN'), ('female', 'ADJ'), ('lay', 'NOUN'), ('egg', 'NOUN'), ('average', 'ADJ'), ('normally', 'ADV'), (

In [106]:
token=spacy_tokenizer(doc)
print("TOKEN:",token)
print("----------")
stopword_filtered_list=spacy_remove_stopwords(token)
print("STOPWORD FILTERED LIST:",stopword_filtered_list)
print("----------")
lemmatized_list=spacy_text_lemmatize(stopword_filtered_list)
print("LEMMATISED LIST:",lemmatized_list)
print("----------")
pos_tagged=spacy_pos_tag(lemmatized_list)
print("POS TAG:",pos_tagged)
print("----------")
ner_tagged=extract_ne(pos_tagged)
print("NER TAGGED:",ner_tagged)

TOKEN: [('The', 0), ('African', 4), ('helmeted', 12), ('turtle', 21), ('#', 28), ('$', 29), ('%', 30), ('!', 31), ('!', 32), ('!', 33), ('(', 35), ('Pelomedusa', 36), ('subrufa', 47), (')', 54), ('is', 56), ('a', 59), ('species', 61), ('of', 69), ('side', 72), ('-', 76), ('necked', 77), ('terrapin', 84), ('in', 93), ('the', 96), ('family', 100), ('Pelomedusidae', 107), ('.', 120), ('The', 122), ('species', 126), ('naturally', 134), ('occurs', 144), ('in', 151), ('fresh', 154), ('and', 160), ('stagnant', 164), ('water', 173), ('bodies', 179), ('throughout', 186), ('much', 197), ('of', 202), ('sub', 205), ('-', 208), ('Saharan', 209), ('Africa', 217), (',', 223), ('and', 225), ('in', 229), ('southern', 232), ('Yemen', 241), ('.', 246), ('It', 248), ('is', 251), ('omnivorous', 254), (',', 264), ('with', 266), ('its', 271), ('diet', 275), ('consisting', 280), ('mainly', 291), ('of', 298), ('aquatic', 301), ('invertebrates', 309), (',', 322), ('small', 324), ('fish', 330), (',', 334), ('and

NameError: name 'extract_ne' is not defined

In [234]:
## Predicting NAMED ENTITIES
for ent in doc.ents:
    print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

second | ORDINAL | "first", "second", etc.
first | ORDINAL | "first", "second", etc.


In [236]:
## Displacy NAMES ENTITIES
displacy.render(doc, style="ent")

In [80]:
## RULE BASED MATCHING
## Match on Doc objects, Match on tokens and token attributes
## Use a model's predictions (Noun vs Verb)

import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Process some text
#doc1 = nlp(book)

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
# Match patterns are lists of dictionaries
# Match using lexical attributes (IS_DIGIT, IS_PUNCT,LOWER)
# Match using token attributes (LEMMA,POS)
# Match using operators ("OP") or quantifiers (? (0,1time), + (1 or more),* (0 or more), ! (negation 0 times))
pattern = [{"LOWER": "god"}, {"LEMMA": "give"},{"POS":"PRON"}]
matcher.add("T", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

God gives us


In [100]:
## Shared Vocab
## spaCy stores all shared data in a vocabulary, the Vocab. This includes words, but also the labels schemes for tags and entities.
## To save memory, all strings are encoded to hash IDs. 

nlp.vocab.strings.add("coffee")

#look up hash value
coffee_hash = nlp.vocab.strings["coffee"]
# The doc also exposes the vocab and strings
doc_hash = doc.vocab.strings["coffee"]                 

#look up string value
coffee_string = nlp.vocab.strings[coffee_hash]
string = nlp.vocab.strings[3197928453018144401]           # If a word is not in the vocabulary, there's no way to get its string.

print(coffee_hash)
print(doc_hash)
print(coffee_string)
print(string)

3197928453018144401
3197928453018144401
coffee
coffee


In [102]:
## Lexeme
## Lexemes are context-independent entries in the vocabulary.
## Lexemes don't have context-dependant entries like part-of-speech tags, dependencies or entity labels.

doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [116]:
## Create Doc, Span, entities

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print("doc>> ",doc)

# Create a span manually
span = Span(doc, 0, 2)

# Create a span with a label
span = Span(doc, 0, 2, label="GREETING")
print("span>> ",span)

# Add span to the doc.ents
doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])

doc>>  Hello world!
span>>  Hello world
[('Hello world', 'GREETING')]


In [134]:
## Best Practice on Dat Structures

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city than Sydney")

# Get all tokens and part-of-speech tags   >> use native token attributes to keep things consistent, convert the results to strings as late as possible
"""
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]
"""

# Instead of below
"""
for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)
"""

# try this
# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        if token.i + 1 < len(doc):
            # Check if the next token is a verb
            if doc[token.i + 1].pos_ == "VERB":
                print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


In [142]:
## Word vectors and semantic similarity
## use spaCy to predict how similar documents, spans or tokens are to each other

## The Doc, Token and Span objects have a .similarity method that takes another object
## returns a floating point number between 0 and 1, indicating how similar they are.
## use medium or large spacy pipeline (not small)

# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print("doc similarity:",doc1.similarity(doc2))

# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print("token similarity:",token1.similarity(token2))

doc similarity: 0.8698332283318978
token similarity: 0.685019850730896
0.1821369691957915
0.47190033157126826


In [144]:
## You can also use the similarity methods to compare different types of objects. For example, a document and a token.

# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]
print(doc.similarity(token))

# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")
print(span.similarity(doc))

0.1821369691957915
0.47190033157126826


In [148]:
## Similarity is determined using word vectors, multi-dimensional representations of meanings of words.
## Algorithms like Word2Vec (that are used to convert raw text to vectors) can be added to Spacy's pipeline
## By default, the similarity returned by spaCy is the cosine similarity between two vectors – but this can be adjusted if necessary.
## Doc and Span vectors default to average of token vectors

# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")
doc = nlp("I have a banana")

# Access the vector via the token.vector attribute
print(doc[3].vector)

[ 0.20778  -2.4151    0.36605   2.0139   -0.23752  -3.1952   -0.2952
  1.2272   -3.4129   -0.54969   0.32634  -1.0813    0.55626   1.5195
  0.97797  -3.1816   -0.37207  -0.86093   2.1509   -4.0845    0.035405
  3.5702   -0.79413  -1.7025   -1.6371   -3.198    -1.9387    0.91166
  0.85409   1.8039   -1.103    -2.5274    1.6365   -0.82082   1.0278
 -1.705     1.5511   -0.95633  -1.4702   -1.865    -0.19324  -0.49123
  2.2361    2.2119    3.6654    1.7943   -0.20601   1.5483   -1.3964
 -0.50819   2.1288   -2.332     1.3539   -2.1917    1.8923    0.28472
  0.54285   1.2309    0.26027   1.9542    1.1739   -0.40348   3.2028
  0.75381  -2.7179   -1.3587   -1.1965   -2.0923    2.2855   -0.3058
 -0.63174   0.70083   0.16899   1.2325    0.97006  -0.23356  -2.094
 -1.737     3.6075   -1.511    -0.9135    0.53878   0.49268   0.44751
  0.6315    1.4963    4.1725    2.1961   -1.2409    0.4214    2.9678
  1.841     3.0133   -4.4652    0.96521  -0.29787   4.3386   -1.2527
 -1.7734   -3.5637   -0.20035

In [None]:
## similarity depends on the context and what application needs to do
## Useful for many applications: recommendation systems, flagging duplicates etc.

doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

## high similarity score because both texts express sentiment about cats
## in a different application context, you might want to consider the phrases as very dissimilar, because they talk about opposite sentiments.