In [None]:
import spacy

# Load spaCy's 'en_core_web_sm' pipeline once; the resulting `nlp` callable is
# reused by the cells below to turn raw text into processed documents.
nlp = spacy.load('en_core_web_sm')


In [None]:
# Run a short sample sentence through the pipeline and show how it is tokenized.
intro_text = 'This tutorial is about Natural Language Processing in Spacy.'
introduction_doc = nlp(intro_text)

# Collect the text of every token, then display the whole list at once
intro_tokens = [tok.text for tok in introduction_doc]
print(intro_tokens)

In [None]:
# Read the document from disk and process it with the spaCy pipeline.
file_name = 'SiemensP23.txt'

# A context manager guarantees the file handle is closed as soon as the text
# has been read (a bare open(...).read() leaves closing to the garbage collector).
# Note: despite its name, `text_file` holds the file's contents as a string.
with open(file_name) as source_file:
    text_file = source_file.read()

doc = nlp(text_file)
print('Number of tokens:', len(doc))

# Extract tokens for the given doc
print([token.text for token in doc])

In [None]:
# The sents property is used to extract sentences; materialize them in a list
# so they can be counted and indexed.
sentences = list(doc.sents)
print('Anzahl sentences:', len(sentences))

# 0-based index of the sample sentence to inspect. Kept in one place so the
# printed label always matches the sentence that is actually shown.
sentence_index = 27

# Guard the lookup: a short document may contain fewer sentences than the
# index requires, which would otherwise abort the cell with an IndexError.
if sentence_index < len(sentences):
    sample_sentence = sentences[sentence_index]
    print(f'Satz {sentence_index + 1}:', sample_sentence)
    print('Sentence is of type: ', type(sample_sentence))
else:
    print(f'Satz {sentence_index + 1}: not available, document has only',
          len(sentences), 'sentences')
print('Size of "sentences":', len(sentences))

In [None]:
# Inspect a handful of attributes of a single token (the third one in the doc).
token = doc[2]

# (label, value) pairs, printed in order below
token_report = [
    ('The token:', token),
    ('Token (word) starts at character position:', token.idx),
    ('token text with trailing space (if present):', token.text_with_ws),
    ('if the token consists of alphabetic characters or not:', token.is_alpha),
    ('if the token is a punctuation symbol or not:', token.is_punct),
    ('if the token is a space or not:', token.is_space),
    ('prints out the shape of the word', token.shape_),
    ('if the token is a stop word or not:', token.is_stop),
]
for description, value in token_report:
    print(description, value)

In [None]:
# Drop stop words, keeping the remaining tokens in document order
no_stopword_doc = list(filter(lambda tok: not tok.is_stop, doc))
print(no_stopword_doc)

In [None]:
# Lemmatization reduces inflected forms to a common base form (the lemma),
# e.g. "organizes", "organized" and "organizing" all map to "organize".
# Show each of the first 30 non-stop-word tokens next to its lemma.
lemma_preview = no_stopword_doc[:30]
for token in lemma_preview:
    print(token, token.lemma_)

In [None]:
from collections import Counter

# Count occurrences by token *text*. Counting the Token objects themselves
# (Counter(doc)) does not work: spaCy tokens hash and compare by their position
# in the document, so every token would be a distinct key with a count of 1.
word_freq = Counter(token.text for token in doc)
print(word_freq)

In [None]:
# Most frequent entries of the frequency table, together with their counts
num = 10
common_words = word_freq.most_common(num)
print(num, ' most common words:', common_words)

# Entries that occur exactly once
unique_words = [word for word in word_freq if word_freq[word] == 1]
print('Unique words:', unique_words)

In [None]:
# Statistics
from collections import Counter

# Keep only real words: drop stop words, punctuation symbols and whitespace
# tokens. The whitespace check matters because the document is read from a
# file: line breaks are tokenized as whitespace tokens, which are neither stop
# words nor punctuation and would otherwise show up in the frequency table.
words = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]
word_freq = Counter(words)

# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print(common_words)

# Unique words (those that occur exactly once)
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print(unique_words)


In [None]:
# Part-of-speech tagging assigns each token a tag based on how it is used in
# the sentence, giving every word a syntactic category such as noun or verb.
# For the first 20 tokens, show the fine-grained tag, the coarse POS and a
# human-readable explanation of the tag.
pos_preview = doc[:20]
for token in pos_preview:
    fine_tag = token.tag_
    print(token, fine_tag, token.pos_, spacy.explain(fine_tag))

In [None]:
# # Visualization: Using displaCy
# from spacy import displacy
# displacy.serve(sentences[27], style='dep')

In [None]:
# # Matching
# from spacy.matcher import Matcher
# matcher = Matcher(nlp.vocab)
# conference_org_text = str('There is a developer conference'
#     ' happening on 21 July 2019 in London. It is titled'
#     ' "Applications of Natural Language Processing".'
#     ' There is a helpline number available'
#     ' at (123) 456-789')
#
# def extract_phone_number(nlp_doc):
#     pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
#                {'ORTH': ')'}, {'SHAPE': 'ddd'},
#                {'ORTH': '-', 'OP': '?'},
#                {'SHAPE': 'ddd'}]
#     matcher.add(key='PHONE_NUMBER', patterns=[pattern])
#     matches = matcher(nlp_doc)
#     for match_id, start, end in matches:
#         span = nlp_doc[start:end]
#         return span.text
#
# conference_org_doc = nlp(conference_org_text)
# extract_phone_number(conference_org_doc)

In [None]:
# Named Entity Recognition (NER) locates named entities in unstructured text
# and classifies them into pre-defined categories.
# For every entity: its text, character span, label and the label's explanation.
for ent in doc.ents:
    entity_label = ent.label_
    print(ent.text, ent.start_char, ent.end_char,
          entity_label, spacy.explain(entity_label))

In [None]:
from spacy import displacy

# Visualize the named entities of the first 1000 tokens inline in the notebook.
# displacy.render is used instead of displacy.serve: serve starts a web server
# and blocks the kernel until it is interrupted, whereas render with
# jupyter=True displays the markup directly in the cell output.
displacy.render(doc[:1000], style='ent', jupyter=True)

