# BASICS OF spaCy

In [1]:
# loading spacy library
import spacy
nlp=spacy.load('en') 

## Reading a Text Document

In [2]:
doc1=nlp("We are learning spaCy")
doc1

We are learning spaCy

## Reading a File

In [3]:
myfile=open("learnspaCy.txt").read()
myfile

'Hello Guys,\n\nWe are learning spaCy , a cool nlp library.\n\nSome of its Features are:-\n\nEasy deep learning integration.\nNon-destructive tokenization.\nExport to numpy data arrays.\nNamed entity recognition.\nSupport for 51+ languages.\nPre-trained word vectors.\nState-of-the-art speed.\nPart-of-speech tagging.\nRobust, rigorously evaluated accuracy and many more\n'

In [4]:
file1=nlp(myfile)
file1

Hello Guys,

We are learning spaCy , a cool nlp library.

Some of its Features are:-

Easy deep learning integration.
Non-destructive tokenization.
Export to numpy data arrays.
Named entity recognition.
Support for 51+ languages.
Pre-trained word vectors.
State-of-the-art speed.
Part-of-speech tagging.
Robust, rigorously evaluated accuracy and many more

## Sentence Tokenization

In [5]:
for num,sentence in enumerate(file1.sents):
    print(f'{num}:{sentence}')

0:Hello Guys,

We are learning spaCy , a cool nlp library.


1:Some of its Features are:-

Easy deep learning integration.

2:Non-destructive tokenization.

3:Export to numpy data arrays.

4:Named entity recognition.

5:Support for 51+ languages.

6:Pre-trained word vectors.

7:State-of-the-art speed.

8:Part-of-speech tagging.

9:Robust, rigorously evaluated accuracy and many more



## Word Tokenization

In [6]:
doc1

We are learning spaCy

In [7]:
for token in doc1:
    print(token.text)

We
are
learning
spaCy


In [8]:
# For getting list of words , use split() method

doc1.text.split(" ")

['We', 'are', 'learning', 'spaCy']

## Word Properties

In [9]:
doc2=nlp("I have 3 coins and a 10 rupee note")
doc2

I have 3 coins and a 10 rupee note

In [10]:
## is_alpha property
for word in doc2:
    print(word.text,word.is_alpha)

I True
have True
3 False
coins True
and True
a True
10 False
rupee True
note True


In [11]:
## is_stop property
for word in doc2:
    print(word.text,word.is_stop)

I True
have True
3 False
coins False
and True
a True
10 False
rupee False
note False


In [12]:
## shape property
for word in doc1:
    print(word.text,word.shape_)

We Xx
are xxx
learning xxxx
spaCy xxxXx


## Part of speech Tagging

In [13]:
## .pos_ property
for word in doc1:
    print(word.text,word.pos_)

We PRON
are VERB
learning VERB
spaCy NOUN


In [14]:
## .tag_ property
for word in doc1:
    print(word.text,word.pos_,word.tag_)

We PRON PRP
are VERB VBP
learning VERB VBG
spaCy NOUN NN


In [15]:
## meaning of pos abbrev.
spacy.explain('NN')

'noun, singular or mass'

In [16]:
spacy.explain('VBP')

'verb, non-3rd person singular present'

## Visual dependency using displacy

In [17]:
from spacy import displacy

In [18]:
displacy.render(doc1,style='dep',jupyter=True)

## Lemmatization

In [19]:
doc3=nlp("playing played player")

In [20]:
for word in doc3:
    print(word.text,word.lemma_)

playing play
played play
player player


In [21]:
doc4=nlp("walks walk walked")

In [22]:
for word in doc4:
    print(word.text,word.lemma_,word.pos_)

walks walk NOUN
walk walk VERB
walked walk VERB


## Named Entity Recognition or Detection

In [23]:
doc5=nlp("By 2025 , India will grow so much in Technical field and earn more than 5 million dollars")

In [24]:
for word in doc5.ents:
    print(word.text,word.label_)

2025 DATE
India GPE
Technical GPE
more than 5 million dollars MONEY


In [25]:
spacy.explain('GPE')

'Countries, cities, states'

In [26]:
displacy.render(doc5,style='ent',jupyter=True)

## Semantic Similarity


In [27]:
word1=nlp("dog")
word2=nlp("cat")

In [28]:
## similarity between words
word1.similarity(word2)

  "__main__", mod_spec)


0.7952184229586672

In [29]:
doc5=nlp("cat dog bird fish")

In [30]:
## similarity between words in a sentence

for w1 in doc5:
    for w2 in doc5:
        print((w1.text,w2.text),"Similarly :-",w1.similarity(w2))

('cat', 'cat') Similarly :- 1.0
('cat', 'dog') Similarly :- 0.6008464
('cat', 'bird') Similarly :- 0.62692624
('cat', 'fish') Similarly :- 0.33569032
('dog', 'cat') Similarly :- 0.6008464
('dog', 'dog') Similarly :- 1.0
('dog', 'bird') Similarly :- 0.7429262
('dog', 'fish') Similarly :- 0.2662138
('bird', 'cat') Similarly :- 0.62692624
('bird', 'dog') Similarly :- 0.7429262
('bird', 'bird') Similarly :- 1.0
('bird', 'fish') Similarly :- 0.39694816
('fish', 'cat') Similarly :- 0.33569032
('fish', 'dog') Similarly :- 0.2662138
('fish', 'bird') Similarly :- 0.39694816
('fish', 'fish') Similarly :- 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


## Stopwords

In [31]:
from spacy.lang.en.stop_words import STOP_WORDS

In [32]:
print(STOP_WORDS)

{'through', 'thence', 'out', 'on', 'should', 'an', 'herself', 'front', 'beyond', 'without', '’ll', 'any', 'down', 'cannot', 'five', 'nowhere', 'indeed', 'much', 'too', 'how', 'sometime', 'everything', 'otherwise', 'along', 'upon', 'him', 'make', 'her', 'various', 'well', 'ourselves', 'in', 'toward', 'whereupon', 'you', 'some', 'has', '‘ll', 'latter', 'throughout', 'only', 'least', 'myself', 'amount', 'anyone', 'though', 'hereupon', 'their', 'which', 'made', 'hereby', 'therefore', 'empty', 'formerly', 'is', "'ll", 'and', 'still', 'these', '’ve', 'n’t', 'four', '‘ve', 'was', 'put', 'fifteen', 'its', 'being', 'although', 'full', 'none', 'my', 'am', 'anyway', 'never', 'enough', 'someone', 'around', 'than', 'get', 'would', 'thus', 'under', 'do', 'seemed', 'one', 'due', 'here', "n't", 'had', 'very', 'or', 'n‘t', 'thru', 'between', 'of', 'two', 'nine', 'almost', 'bottom', 'amongst', 'during', 'yourself', 'but', 'afterwards', 'seems', 'from', 'really', 'latterly', 'against', '‘m', 'name', 'see

In [33]:
STOP_WORDS.add('ohh')
nlp.vocab["ohh"].is_stop

True