In [1]:
# @author: Praveen Dominic
import spacy
from spacy import displacy
# Name of the spaCy pipeline package used throughout the notebook; kept in one
# place so it is easy to swap.
MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sample text run through the pipeline below; the resulting `doc` feeds the
# sentence-splitting, named-entity and part-of-speech cells.
sentence = """The names of companies and organizations should also be capitalized, such as Nike and Stanford University. 
There are some exceptions: sometimes a company may choose not to use a capital letter at the beginning of its name or product as a stylistic choice. 
Examples include eBay and the iPhone."""

In [8]:
# Run the loaded pipeline over the sample text.
doc = nlp(sentence)

# Collect the text of every sentence span, then print them one per call.
sentence_texts = [span.text for span in doc.sents]
for text in sentence_texts:
    print(text)


The names of companies and organizations should also be capitalized, such as Nike and Stanford University. 

There are some exceptions: sometimes a company may choose not to use a capital letter at the beginning of its name or product as a stylistic choice. 

Examples include eBay and the iPhone.


In [19]:
# Print the tokens of each sentence.
# The sample text contains misspellings and stray symbols.
sentence2 = """ nationnal ecoonomy gained looked in recent weeks as con@gmer spending
Strengtened, manufacturing actiity cont@™ed to rse, and producers
scheduled more invesfgftment in plant and equipment. subatrction"""

doc2 = nlp(sentence2)

# enumerate(..., start=1) supplies the 1-based sentence number, so no
# hand-maintained counter is needed.
for i, sent in enumerate(doc2.sents, start=1):
    print(f"Words in sentence {i}:")
    for word in sent:
        # Printing a token writes its text; whitespace-only tokens (e.g. the
        # newlines in the sample) therefore appear as blank lines.
        print(word)


Words in sentence 1:
 
nationnal
ecoonomy
gained
looked
in
recent
weeks
as
con@gmer
spending


Strengtened
,
manufacturing
actiity
cont@
™
ed
to
rse
,
and
producers


scheduled
more
invesfgftment
in
plant
and
equipment
.
Words in sentence 2:
subatrction


In [47]:
# get stop words
sentence2 = """nationnal ecoonomy "gained" looked in recent weeks as con@gmer spending
Strengtened, manufacturing actiity cont@™ed to rse, and producers
scheduled more invesfgftment in plant and equipment. subatrction addition $"""

doc2 = nlp(sentence2)

# Lemmatized list of words with stop words, punctuation and currency symbols
# excluded. Built in a single pass over the tokens, so the list also exists
# (empty) when the document has no tokens.
# Other token flags worth inspecting while exploring:
# is_quote, is_sent_start, is_oov, is_right_punct.
processed_lst = [
    word.lemma_
    for word in doc2
    if not (word.is_stop or word.is_punct or word.is_currency)
]

print(processed_lst)


# List stop words (last expression, so the notebook displays the set)
en = spacy.util.get_lang_class('en')
en.Defaults.stop_words

['nationnal', 'ecoonomy', 'gain', 'look', 'recent', 'week', 'con@gmer', 'spending', '\n', 'Strengtened', 'manufacturing', 'actiity', 'cont@', '™', 'ed', 'rse', 'producer', '\n', 'schedule', 'invesfgftment', 'plant', 'equipment', 'subatrction', 'addition']


{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [53]:
# Modify stop words
# discard() rather than remove(): re-running this cell must not raise KeyError
# once 'am' has already been taken out of the set.
en.Defaults.stop_words.discard('am')  # remove
# NOTE(review): spaCy's stop-word check appears to lowercase the token text
# before the lookup, so a capitalized entry such as 'Dominic' may never match.
# Confirm, and consider adding 'dominic' instead.
en.Defaults.stop_words.add('Dominic')  # add
en.Defaults.stop_words
# NOTE THAT STOP WORDS REMOVAL HAS TO BE DONE BEFORE LOADING THE MODEL.

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'Dominic',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'among',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'f

In [65]:
# Named entity recognition
# Each entity span prints as three lines: its text, its label, and a separator.
for ent in doc.ents:
    print(ent.text, ent.label_, "****", sep="\n")

# Render the document with its entities highlighted.
displacy.render(doc, style="ent")


Nike
ORG
****
Stanford University
ORG
****
eBay
ORG
****


In [73]:
# Parts of speech
# Per token: its text, its part-of-speech tag (`pos_`), then a separator line.
for token in doc:
    print(token.text, token.pos_, "***", sep="\n")

# dependency parsing tree
displacy.render(doc, style="dep")

The
DET
***
names
NOUN
***
of
ADP
***
companies
NOUN
***
and
CCONJ
***
organizations
NOUN
***
should
AUX
***
also
ADV
***
be
AUX
***
capitalized
VERB
***
,
PUNCT
***
such
ADJ
***
as
ADP
***
Nike
PROPN
***
and
CCONJ
***
Stanford
PROPN
***
University
PROPN
***
.
PUNCT
***


SPACE
***
There
PRON
***
are
VERB
***
some
DET
***
exceptions
NOUN
***
:
PUNCT
***
sometimes
ADV
***
a
DET
***
company
NOUN
***
may
AUX
***
choose
VERB
***
not
PART
***
to
PART
***
use
VERB
***
a
DET
***
capital
NOUN
***
letter
NOUN
***
at
ADP
***
the
DET
***
beginning
NOUN
***
of
ADP
***
its
PRON
***
name
NOUN
***
or
CCONJ
***
product
NOUN
***
as
ADP
***
a
DET
***
stylistic
ADJ
***
choice
NOUN
***
.
PUNCT
***


SPACE
***
Examples
NOUN
***
include
VERB
***
eBay
PROPN
***
and
CCONJ
***
the
DET
***
iPhone
PROPN
***
.
PUNCT
***
