### Reference: https://spacy.io/

In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# !pip install spacy-langdetect



In [1]:
import spacy
print(spacy.__version__)

3.3.1


In [2]:
nlp = spacy.load("en_core_web_lg")

### Tokenization

In [3]:
doc = nlp("I am flying to Manila")
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila']


In [4]:
for w in doc:
    print(w.text)

I
am
flying
to
Manila


### Lemmatization

In [5]:
doc = nlp("this product integrates both libraries for downloading and applying patches")
for token in doc:
    print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


### POS tagging

In [6]:
doc=nlp('I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
    print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))

I PRON PRP pronoun, personal
have AUX VBP verb, non-3rd person singular present
flown VERB VBN verb, past participle
to ADP IN conjunction, subordinating or preposition
Cebu PROPN NNP noun, proper singular
. PUNCT . punctuation mark, sentence closer
Now ADV RB adverb
I PRON PRP pronoun, personal
am AUX VBP verb, non-3rd person singular present
flying VERB VBG verb, gerund or present participle
to ADP IN conjunction, subordinating or preposition
Manila PROPN NNP noun, proper singular
. PUNCT . punctuation mark, sentence closer


In [7]:
spacy.explain("NNP")

'noun, proper singular'

### Segmentation

In [8]:
doc=nlp('I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


### Retokenization

In [9]:
doc=nlp('The Golden State Bridge is an iconic landmark in San Francisco')
[doc[i] for i in range(len(doc))]

[The, Golden, State, Bridge, is, an, iconic, landmark, in, San, Francisco]

In [10]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[-2:]) # or [7:9]

In [11]:
[doc[i] for i in range(len(doc))]

[The, Golden State Bridge, is, an, iconic, landmark, in, San Francisco]

### Syntactic Parsing

In [12]:
doc = nlp('I want a green apple,')
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT root
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
, PUNCT punct punctuation


### Visualizations

In [13]:
from spacy import displacy
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [14]:
# from IPython.core.display import display, HTML

# doc = nlp('I want to fly to Manila.')

# from spacy import displacy
# html = displacy.render(doc, style="ent", page=True)

# display(HTML(html))

In [15]:
spacy.explain("GPE")

'Countries, cities, states'

### Similarity

In [None]:
doc = nlp('I want a green apple.')

In [None]:
doc[----------:--------]

In [None]:
doc.similarity(doc[----------:--------])

In [None]:
nlp('apple').similarity(nlp('banana'))

In [None]:
nlp('lovelife').similarity(nlp('forever'))

In [None]:
nlp('apple').---------------------

### Language Detection: https://spacy.io/universe/project/spacy-langdetect

In [None]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

In [None]:
text = 'This is an english text.'
doc = nlp(text)

print(doc._.language)

In [None]:
text = 'This is an english text mabuhay'
doc = nlp(text)

print(doc._.language)

In [None]:
text = 'magandang gabi!'
doc = nlp(text)

print(doc._.language)