In [17]:
import spacy

In [18]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffe")

In [27]:
print(f"Hash Value - {nlp.vocab.strings["coffee"]}")

Hash Value - 3197928453018144401


In [31]:
print(f"Hash Value - {nlp.vocab.strings[3197928453018144401]}")

Hash Value - coffee


In [7]:
# Doc -> Linguistic attributes (dependency , pos)
# Vocab -> Lexical attribute (IS_DIGIT , IS_ALPHA , TEXT)
# Vocab (knows Hash ID of every string) -> To get string -> Look-up table [string:hash-id] 

In [13]:
doc = ("I have a cat")

cat_hash = nlp.vocab.strings["cat"]
cat_hash

5439657043933447811

In [14]:
nlp.vocab.strings[5439657043933447811]

'cat'

In [35]:
from spacy.tokens import Doc , Span

nlp = spacy.blank("en") # initiating nlp object 

words = ["hello" , "world" , "!"]
spaces = [True , False , False]

doc = Doc(nlp.vocab , words = words , spaces = spaces)

In [36]:
doc

hello world!

In [47]:
type(Span(doc , 1 , 3))

spacy.tokens.span.Span

In [43]:
span_with_label = Span(doc , 0 ,2 ,  label = "GREETING")
doc.ents = [span_with_label] # doc.ents is writable , so as per our task we can add entities manually if not in trained data/model

In [45]:
for ent in doc.ents:
    print(ent.label_ , ent.text)

GREETING hello world


In [51]:
from spacy.tokens import Doc , Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

doc = Doc(nlp.vocab , words = words , spaces = spaces)

span_name_label = Span(doc , 2 , 4 , label = "PERSON")

print(span_name_label.label_ , span_name_label.text)

PERSON David Bowie


In [52]:
doc.ents = [span_name_label]

In [53]:
[print(doc.ents) for ent in doc.ents]

(David Bowie,)


[None]

In [54]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

print(doc[0].pos_)

PROPN


In [58]:
# The code in this example is trying to analyze a text and collect all proper nouns that are followed by a verb. (in the most optimized way)

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

for token in doc:
    if token.pos_ == "PROPN":
        index = token.i + 1
        next_token = doc[index]
        if next_token.pos_ == "VERB":
            print(token.text , next_token.text)

Berlin looks


In [62]:
# To compare similarity , use -> medium and large models , they only have vectors for comparing similarity
import spacy

nlp = spacy.load("en_core_web_md")

doc1 = nlp ("I like fast food")
doc2 = nlp("I like pizza")

print(doc1.similarity(doc2))

# Comparing two sentences

0.8382381200790405


In [63]:
print(doc.similarity(doc1))

0.8382381200790405


In [71]:
doc = nlp("I like pizza and pasta")
token_1 = doc[2]
token_2 = doc[4]
print(token_1.similarity(token_2))

1.0000001192092896


In [74]:
doc = nlp("Indian Institute of Technology")
token = nlp("collge")[0]

print(doc.similarity(token))

# comparing a token with a sentence

0.1310572326183319


In [75]:
span = nlp("Developed an app which works end to end")[0:5]
doc = nlp("Work experience is essential")

print(span.similarity(doc))

0.7478198409080505


In [76]:
# Doc and Span vectors default to average of token vectors

doc = nlp("India is a country")

In [80]:
print(len(doc[-1].vector)) # 300 size vector

300


In [81]:
print(doc.vector)

[-6.75987482e-01  2.95220017e-01  5.53175062e-02  7.55575001e-02
  2.32170239e-01  2.18925253e-03  3.40092510e-01 -1.78802505e-01
  5.36397509e-02  2.01376987e+00 -4.94089961e-01 -3.38320017e-01
 -2.07768500e-01 -8.56000185e-03 -3.16640027e-02 -1.35674998e-02
 -2.19487503e-01  1.29222238e+00  7.07562789e-02  1.13817498e-01
 -2.82294989e-01 -2.52393246e-01  1.69265002e-01 -2.64062762e-01
 -7.14329928e-02 -7.96277449e-02  2.72385001e-01  8.87579992e-02
 -2.39240006e-01 -8.01199824e-02  8.09949934e-02 -4.08015013e-01
 -7.13434964e-02 -1.95002496e-01 -3.27162504e-01  8.67996365e-04
 -1.15750000e-01 -2.64487505e-01 -2.05805004e-01 -6.94500059e-02
  2.80475020e-01 -2.13334009e-01  1.52083606e-01 -1.83649004e-01
  2.16212999e-02 -3.12228501e-01  2.55757514e-02  1.40500069e-02
  1.02404952e-02  7.96445012e-02 -2.58964747e-01  4.23509002e-01
 -9.63289961e-02 -2.66611516e-01  2.10869998e-01  1.01421498e-01
  1.41084492e-01 -1.27255023e-02  1.47844747e-01 -1.27790004e-01
 -1.93052500e-01 -2.81198

In [82]:
doc_vector = []

for token in doc:
    doc_vector.append(token.vector)

In [84]:
import torch

tensor = torch.tensor(doc_vector)

  tensor = torch.tensor(doc_vector)


In [85]:
tensor

tensor([[-0.8955,  0.3877,  0.6498,  ...,  0.1375, -0.2671,  0.5757],
        [-0.6005,  0.1884, -0.4099,  ..., -0.2780,  0.3123, -0.2833],
        [-0.6005,  0.1884, -0.4099,  ..., -0.2780,  0.3123, -0.2833],
        [-0.6074,  0.4164,  0.3913,  ...,  0.0558, -0.5056, -0.1939]])

In [86]:
column_average = tensor.float().mean(dim=0)

In [87]:
column_average

tensor([-6.7599e-01,  2.9522e-01,  5.5318e-02,  7.5558e-02,  2.3217e-01,
         2.1893e-03,  3.4009e-01, -1.7880e-01,  5.3640e-02,  2.0138e+00,
        -4.9409e-01, -3.3832e-01, -2.0777e-01, -8.5600e-03, -3.1664e-02,
        -1.3567e-02, -2.1949e-01,  1.2922e+00,  7.0756e-02,  1.1382e-01,
        -2.8229e-01, -2.5239e-01,  1.6927e-01, -2.6406e-01, -7.1433e-02,
        -7.9628e-02,  2.7239e-01,  8.8758e-02, -2.3924e-01, -8.0120e-02,
         8.0995e-02, -4.0802e-01, -7.1343e-02, -1.9500e-01, -3.2716e-01,
         8.6800e-04, -1.1575e-01, -2.6449e-01, -2.0581e-01, -6.9450e-02,
         2.8048e-01, -2.1333e-01,  1.5208e-01, -1.8365e-01,  2.1621e-02,
        -3.1223e-01,  2.5576e-02,  1.4050e-02,  1.0240e-02,  7.9645e-02,
        -2.5896e-01,  4.2351e-01, -9.6329e-02, -2.6661e-01,  2.1087e-01,
         1.0142e-01,  1.4108e-01, -1.2726e-02,  1.4784e-01, -1.2779e-01,
        -1.9305e-01, -2.8120e-01, -4.5664e-02, -2.7264e-01, -1.6292e-02,
        -3.1253e-01, -3.6898e-02,  1.3226e-01,  2.6

In [89]:
tensor_conversion = torch.tensor(doc.vector)

In [92]:
bool = True
for i,j in zip(tensor_conversion,column_average):
    if i != j :
        bool = False

bool        

True

In [93]:
# Combining the power of Machine learning Models along with rule based methods is very powerful thing
# span is segment of doc , is same data structure as doc , so span have all lingustic attributes
# If we iterate over the matches returned by the matcher, we can get the match ID and the start and end index of the matched span. We can then find out more about it. Span objects give us access to the original document and all other token attributes and linguistic features predicted by a model.

In [95]:
# Phrase matcher -> Regular expression + token access
# Use of token access ? -> can get lingustic attributes of token and also get left and right span of the extracted word match

In [97]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriever") # instead of list of dictionaries , in this we give doc object
matcher.add("DOG" , [pattern])

doc = nlp("I have a dog of breed Golden Retriever")

for match_id , start , end in matcher(doc):
    span = doc[start:end] # -> access of the token
    print("Matched span" , span.text)


Matched span Golden Retriever


In [105]:
# Both patterns in this exercise contain mistakes and won’t match as expected. Can you fix them? If you get stuck, try printing the tokens in the doc to see how the text will be split and adjust the pattern so that each dictionary represents one token.

# Edit pattern1 so that it correctly matches all case-insensitive mentions of "Amazon" plus a title-cased proper noun.
# Edit pattern2 so that it correctly matches all case-insensitive mentions of "ad-free", plus the following noun.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

temp = []

for token in doc:
    temp.append(token)

# Create the match patterns
pattern1 = [{"LOWER": "Amazon"}]
pattern2 = [{"LOWER": "ad-free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2", [pattern2])

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

In [100]:
temp

[Twitch,
 Prime,
 ,,
 the,
 perks,
 program,
 for,
 Amazon,
 Prime,
 members,
 offering,
 free,
 loot,
 ,,
 games,
 and,
 other,
 benefits,
 ,,
 is,
 ditching,
 one,
 of,
 its,
 best,
 features,
 :,
 ad,
 -,
 free,
 viewing,
 .,
 According,
 to,
 an,
 email,
 sent,
 out,
 to,
 Amazon,
 Prime,
 members,
 today,
 ,,
 ad,
 -,
 free,
 viewing,
 will,
 no,
 longer,
 be,
 included,
 as,
 a,
 part,
 of,
 Twitch,
 Prime,
 for,
 new,
 members,
 ,,
 beginning,
 on,
 September,
 14,
 .,
 However,
 ,,
 members,
 with,
 existing,
 annual,
 subscriptions,
 will,
 be,
 able,
 to,
 continue,
 to,
 enjoy,
 ad,
 -,
 free,
 viewing,
 until,
 their,
 subscription,
 comes,
 up,
 for,
 renewal,
 .,
 Those,
 with,
 monthly,
 subscriptions,
 will,
 have,
 access,
 to,
 ad,
 -,
 free,
 viewing,
 until,
 October,
 15,
 .]