In [17]:
import spacy
nlp = spacy.load('en_core_web_sm')


In [18]:
intro_text = ('This tutorial is about Natural'
    ' Language Processing in Spacy.')
introduction_doc = nlp(intro_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


In [None]:
file_name = 'SiemensP23.txt'
text_file = open(file_name).read()
doc = nlp(text_file)
print('Number of tokens:', len(doc))

# Extract tokens for the given doc
print ([token.text for token in doc])

In [20]:
# the sents property is used to extract sentences
sentences = list(doc.sents)
print('Anzahl sentences:', len(sentences))
print('Satz 1:', sentences[27])
print('Sentence is of type: ', type(sentences[27]))
print('Size of "sentences":', len(sentences))

Anzahl sentences: 28
Satz 1: Our goals
Our status
Climate neutral in
own operations until
2030
292,000 t
CO2 emissions
(Scope 1 and 2) 78%
green electricity
100%
green electricity
consumption
by 2023
Three questions toâ€¦
23
Siemens Energy â€• Sustainability Report 2020
Decarbonizing our business
Sentence is of type:  <class 'spacy.tokens.span.Span'>
Size of "sentences": 28


In [21]:
token = doc[2]
print('The token:',token)
print('Token (word) starts at character position:',token.idx)
print('token text with trailing space (if present):',token.text_with_ws)
print('if the token consists of alphabetic characters or not:',token.is_alpha)
print('if the token is a punctuation symbol or not:',token.is_punct)
print('if the token is a space or not:',token.is_space)
print('prints out the shape of the word',token.shape_)
print('if the token is a stop word or not:',token.is_stop)

The token: is
Token (word) starts at character position: 15
token text with trailing space (if present): is 
if the token consists of alphabetic characters or not: True
if the token is a punctuation symbol or not: False
if the token is a space or not: False
prints out the shape of the word xx
if the token is a stop word or not: True


In [22]:
# Get rid of stop words
no_stopword_doc = [token for token in doc if not token.is_stop]
print (no_stopword_doc)

[Siemens, Energy, partner, choice, decarbonization, journey, 
, committed, accompanying, customers, way, sustainable, energy, future, ., 
, driving, decarbonization, entire, value, chain, :, supply, chain, 
, operations, ,, especially, portfolio, ., ,, aim, partner, choice, 
, customers, support, transition, sustainable, energy, world, ., 
, Siemens, Energy, support, EUâ€, ™, s, 2050, climate, neutrality, goal, business, strategy, ?, 
, walk, talk, :, Siemens, Energy, committed, climate, neutral, 2030, ., ,, ,, 
, support, customers, individual, journeys, decarbonization, ., products, ,, solutions, 
, services, ,, push, transition, sustainable, energy, world, ., covering, 
, entire, value, chain, innovative, technologies, ,, able, turn, ideas, reality, ., 
, priority, solutions, comes, decarbonization, ?, 
, critical, drive, future, technologies, ,, 
, time, ,, courage, choose, interim, 
, solutions, ., shift, coal, natural, gas, 
, interim, solution, way, climate, -, neutral, 
, econo

In [23]:
# Lemmatization: organizes, organized and organizing are all forms of organize. Here, organize is the lemma.
for token in no_stopword_doc[:30]:
    print (token, token.lemma_)

Siemens Siemens
Energy Energy
partner partner
choice choice
decarbonization decarbonization
journey journey

 

committed committed
accompanying accompany
customers customer
way way
sustainable sustainable
energy energy
future future
. .

 

driving drive
decarbonization decarbonization
entire entire
value value
chain chain
: :
supply supply
chain chain

 

operations operation
, ,
especially especially
portfolio portfolio
. .


In [24]:
from collections import Counter
word_freq = Counter(doc)
print(word_freq)

Counter({Siemens: 1, Energy: 1, is: 1, the: 1, partner: 1, of: 1, choice: 1, for: 1, the: 1, decarbonization: 1, journey: 1, 
: 1, We: 1, are: 1, committed: 1, to: 1, accompanying: 1, our: 1, customers: 1, on: 1, their: 1, way: 1, to: 1, a: 1, more: 1, sustainable: 1, energy: 1, future: 1, .: 1, 
: 1, We: 1, are: 1, driving: 1, decarbonization: 1, along: 1, the: 1, entire: 1, value: 1, chain: 1, :: 1, from: 1, the: 1, supply: 1, chain: 1, to: 1, our: 1, own: 1, 
: 1, operations: 1, ,: 1, and: 1, especially: 1, in: 1, our: 1, portfolio: 1, .: 1, In: 1, doing: 1, so: 1, ,: 1, we: 1, aim: 1, to: 1, be: 1, the: 1, partner: 1, of: 1, choice: 1, for: 1, our: 1, 
: 1, customers: 1, and: 1, to: 1, support: 1, them: 1, in: 1, their: 1, transition: 1, to: 1, a: 1, sustainable: 1, energy: 1, world: 1, .: 1, 
: 1, How: 1, does: 1, Siemens: 1, Energy: 1, support: 1, the: 1, EUâ€: 1, ™: 1, s: 1, 2050: 1, climate: 1, neutrality: 1, goal: 1, through: 1, its: 1, business: 1, strategy: 1, ?: 1, 
: 1, We

In [25]:
# commonly occurring words with their frequencies
num = 10
common_words = word_freq.most_common(num)
print (num, ' most common words:', common_words)

# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print ('Unique words:', unique_words)

10  most common words: [(Siemens, 1), (Energy, 1), (is, 1), (the, 1), (partner, 1), (of, 1), (choice, 1), (for, 1), (the, 1), (decarbonization, 1)]
Unique words: [Siemens, Energy, is, the, partner, of, choice, for, the, decarbonization, journey, 
, We, are, committed, to, accompanying, our, customers, on, their, way, to, a, more, sustainable, energy, future, ., 
, We, are, driving, decarbonization, along, the, entire, value, chain, :, from, the, supply, chain, to, our, own, 
, operations, ,, and, especially, in, our, portfolio, ., In, doing, so, ,, we, aim, to, be, the, partner, of, choice, for, our, 
, customers, and, to, support, them, in, their, transition, to, a, sustainable, energy, world, ., 
, How, does, Siemens, Energy, support, the, EUâ€, ™, s, 2050, climate, neutrality, goal, through, its, business, strategy, ?, 
, We, walk, the, talk, :, Siemens, Energy, has, committed, to, becoming, climate, neutral, by, 2030, ., And, ,, moreover, ,, we, 
, support, our, customers, on, thei

In [26]:
# Statistics
from collections import Counter

# Remove stop words and punctuation symbols
words = [token.text for token in doc
         if not token.is_stop and not token.is_punct]
word_freq = Counter(words)

# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)

# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)


[('\n', 51), ('Siemens', 8), ('Energy', 7), ('energy', 7), ('decarbonization', 6)]
['journey', 'accompanying', 'driving', 'supply', 'especially', 'portfolio', 'aim', 'EUâ€', '2050', 'neutrality', 'goal', 'strategy', 'walk', 'talk', 'individual', 'journeys', 'products', 'services', 'push', 'covering', 'innovative', 'able', 'turn', 'ideas', 'reality', 'priority', 'critical', 'drive', 'time', 'courage', 'choose', 'shift', 'coal', 'natural', 'gas', 'solution', 'economy', 'Combining', 'conventional', 'renewable', 'systems', 'key', 'meeting', 'worldâ€', 'reliable', 'affordable', 'Green', 'play', 'major', 'role', 'mix', 'scale', 'volumes', 'bring', 'costs', 'policy', 'task', 'define', 'implement', 'boundary', 'conditions', 'envision', 'Energyâ€', 'positioning', 'regard', 'convinced', 'track', 'reason', 'confident', 'company', 'immediately', 'thinks', 'energize', 'society', 'â€', 'thatâ€', 'purpose', 'partners', 'real', 'difference', 'shaping', 'tomorrow', 'Christian', 'Bruch', 'CEO', 'CSO', '

In [27]:
# Part of speech tagging is the process of assigning a POS tag to each token depending on its usage in the sentence.
# POS tags are useful for assigning a syntactic category like noun or verb to each word.
for token in doc[:20]:
    print (token, token.tag_, token.pos_, spacy.explain(token.tag_))

Siemens NNP PROPN noun, proper singular
Energy NNP PROPN noun, proper singular
is VBZ AUX verb, 3rd person singular present
the DT DET determiner
partner NN NOUN noun, singular or mass
of IN ADP conjunction, subordinating or preposition
choice NN NOUN noun, singular or mass
for IN ADP conjunction, subordinating or preposition
the DT DET determiner
decarbonization NN NOUN noun, singular or mass
journey NNP PROPN noun, proper singular

 _SP SPACE whitespace
We PRP PRON pronoun, personal
are VBP AUX verb, non-3rd person singular present
committed JJ ADJ adjective (English), other noun-modifier (Chinese)
to IN ADP conjunction, subordinating or preposition
accompanying VBG VERB verb, gerund or present participle
our PRP$ PRON pronoun, possessive
customers NNS NOUN noun, plural
on IN ADP conjunction, subordinating or preposition


In [28]:
# # Visualization: Using displaCy
# from spacy import displacy
# displacy.serve(sentences[27], style='dep')

In [29]:
# # Matching
# from spacy.matcher import Matcher
# matcher = Matcher(nlp.vocab)
# conference_org_text = str('There is a developer conference'
#     'happening on 21 July 2019 in London. It is titled'
#     ' "Applications of Natural Language Processing".'
#     ' There is a helpline number available'
#     ' at (123) 456-789')
#
# def extract_phone_number(nlp_doc):
#     pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
#                {'ORTH': ')'}, {'SHAPE': 'ddd'},
#                {'ORTH': '-', 'OP': '?'},
#                {'SHAPE': 'ddd'}]
#     matcher.add(key='PHONE_NUMBER', patterns=pattern)
#     matches = matcher(nlp_doc)
#     for match_id, start, end in matches:
#         span = nlp_doc[start:end]
#         return span.text
#
# conference_org_doc = nlp(conference_org_text)
# extract_phone_number(conference_org_doc)

In [30]:
# Named Entity Recognition (NER) is the process of locating named entities in unstructured text and
# then classifying them into pre-defined categories
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char,
          ent.label_, spacy.explain(ent.label_))

Siemens Energy 0 14 ORG Companies, agencies, institutions, etc.
Siemens Energy 454 468 ORG Companies, agencies, institutions, etc.
2050 488 492 DATE Absolute or relative dates or periods
Siemens Energy 566 580 ORG Companies, agencies, institutions, etc.
2030 626 630 DATE Absolute or relative dates or periods
one 1180 1183 CARDINAL Numerals that do not fall under another type
Siemens 1624 1631 ORG Companies, agencies, institutions, etc.
Siemens Energy 1714 1728 ORG Companies, agencies, institutions, etc.
Siemens Energy 1794 1808 ORG Companies, agencies, institutions, etc.
thatâ€ 1925 1931 PERSON People, including fictional
tomorrow 2050 2058 DATE Absolute or relative dates or periods
Christian Bruch 2060 2075 PERSON People, including fictional
CSO 2084 2087 ORG Companies, agencies, institutions, etc.
Siemens Energy 2091 2105 ORG Companies, agencies, institutions, etc.
2030 2167 2171 CARDINAL Numerals that do not fall under another type
292,000 2172 2179 CARDINAL Numerals that do not fal

In [15]:
from spacy import displacy
displacy.serve(doc[:1000], style='ent')



Shutting down server on port 5000.



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

