# SpaCy RoBERTa

In [1]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [2]:
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['afterwards', 'her', 'becomes', 'that', 'eleven', 'meanwhile', 'per', 'former', 'thereby', 'thereafter', 'toward', 'on', 'hence', 'something', 'became', 'formerly', 'else', 'one', 'elsewhere', 'a']


In [3]:
#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


In [4]:
# Implementing lemmatization
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs runs
running running
runner runner


## SpaCy Transformer | RoBERTa

Provides weights and configuration for the pretrained transformer model roberta-base, published by Facebook. The package uses HuggingFace's transformers implementation of the model. Pretrained transformer models assign detailed contextual word representations, using knowledge drawn from a large corpus of unlabelled text. You can use the contextual word representations as features in a variety of pipeline components that can be trained on your own data.


In [5]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
#import en_core_web_sm
import en_trf_robertabase_lg

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_trf_robertabase_lg.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

for word in nytimes:
    print(word.text,word.pos_)

New 
York 
City 
on 
Tuesday 
declared 
a 
public 
health 
emergency 
and 
ordered 
mandatory 
measles 
vaccinations 
amid 
an 
outbreak 
, 
becoming 
the 
latest 
national 
flash 
point 
over 
refusals 
to 
inoculate 
against 
dangerous 
diseases 
. 


 
At 
least 
285 
people 
have 
contracted 
measles 
in 
the 
city 
since 
September 
, 
mostly 
in 
Brooklyn 
’s 
Williamsburg 
neighborhood 
. 
The 
order 
covers 
four 
Zip 
codes 
there 
, 
Mayor 
Bill 
de 
Blasio 
( 
D 
) 
said 
Tuesday 
. 


 
The 
mandate 
orders 
all 
unvaccinated 
people 
in 
the 
area 
, 
including 
a 
concentration 
of 
Orthodox 
Jews 
, 
to 
receive 
inoculations 
, 
including 
for 
children 
as 
young 
as 
6 
months 
old 
. 
Anyone 
who 
resists 
could 
be 
fined 
up 
to 
$ 
1,000 
. 


In [6]:
displacy.render(nytimes, style="ent", jupyter=True)


NameError: name 'displacy' is not defined

In [None]:
dangerous = nlp(u'dangerous')
print(dangerous.vector.shape)
print(dangerous.vector)
