**spaCy** is a library for advanced Natural Language Processing, first released in 2015 and written in Python and Cython. This notebook demonstrates basic spaCy operations on German text.

https://spacy.io/

In [1]:
# import libraries
import re

import bs4
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
# !python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!python -m spacy download de_core_web_sm
# spacy_en = spacy.load('en_core_web_sm')
# spacy_de = spacy.load('de_core_news_sm')

Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

[38;5;1m✘ No compatible package found for 'de_core_web_sm' (spaCy v3.8.7)[0m



**1. Use sample sentences in spaCy**

In [3]:
# Label every word of one German example sentence with its part of speech
# (verb, noun, adjective, ...) and its dependency relation.
# The sentence is the entry at index 3 (i.e. the fourth one) of the example
# list that ships with spaCy's German language data.

from spacy.lang.de.examples import sentences

nlp = spacy.load("de_core_news_sm")
doc = nlp(sentences[3])

print(doc.text)

# One line per token: "<text> <coarse POS tag> <dependency label>"
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion
Bundesanwaltschaft NOUN sb
erhebt VERB ROOT
Anklage NOUN oa
gegen ADP mnr
mutmaßlichen ADJ nk
Schweizer ADJ nk
Spion NOUN nk


In [4]:
# Display the full list of German example sentences imported from
# spacy.lang.de.examples.
sentences

['Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen',
 'Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz',
 'Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz',
 'Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion',
 'San Francisco erwägt Verbot von Lieferrobotern',
 'Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller',
 'Wo bist du?',
 'Was ist die Hauptstadt von Deutschland?']

**2. Split the text into sentences and save them, one per line, to a CSV file using NumPy**

In [5]:
# Split a block of German text into individual sentences and write them,
# one sentence per line, to text2.csv.

text = "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen. Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz! Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz. Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion. San Francisco erwägt Verbot von Lieferrobotern. Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller. Wo bist du? Was ist die Hauptstadt von Deutschland?. Komm hier! San Francisco erwägt Verbot von Lieferrobotern. Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller. Wo bist du?"

# Splitting on [.!?] alone leaves a leading blank on every fragment after the
# first one and produces empty fragments (after the final "?" and between the
# "?" and "." of "?."). Strip each fragment and keep only the non-empty ones so
# that the file contains nothing but sentences.
text2 = [
    fragment.strip()
    for fragment in re.split(r"[.!?]", text)
    if fragment.strip()
]

# A 1-D input is written one element per line. The encoding is pinned to UTF-8
# so that umlauts and "ß" are read back correctly by pandas.read_csv, whose
# default encoding is UTF-8, regardless of the platform locale.
np.savetxt("text2.csv", text2, fmt="%s", encoding="utf-8")

**3. Load the sentences from the CSV file**

In [22]:
# Load the sentences stored in text2.csv. The file has no header row, so
# header=None is passed and pandas labels the single column 0.
# The last line shows the (rows, columns) shape of the loaded frame.
text_sentences = pd.read_csv("text2.csv", header=None)
text_sentences.shape

(12, 1)

In [23]:
# Display the loaded frame (one sentence per row, single column 0).
text_sentences

Unnamed: 0,0
0,Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen
1,Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz
2,Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz
3,Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion
4,San Francisco erwägt Verbot von Lieferrobotern
5,Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller
6,Wo bist du
7,Was ist die Hauptstadt von Deutschland
8,Komm hier
9,San Francisco erwägt Verbot von Lieferrobotern


In [None]:
# see a random sample of 7 sentences

In [24]:
# Show 7 randomly chosen rows. random_state pins the draw so that re-running
# the notebook displays the same rows every time.
text_sentences.sample(7, random_state=42)

Unnamed: 0,0
7,Was ist die Hauptstadt von Deutschland
8,Komm hier
6,Wo bist du
4,San Francisco erwägt Verbot von Lieferrobotern
9,San Francisco erwägt Verbot von Lieferrobotern
3,Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion
2,Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz


**4. Tokenize sentences into words**

In [25]:

# Run the German pipeline over several sentences at once and list every
# token next to the dependency label the parser assigned to it.
doc = nlp("Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion. Komm hier! San Francisco erwägt Verbot von Lieferrobotern. Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller. Wo bist du?")

for tok in doc:
  # Output format: "<token text> ... <dependency label>"
  print(" ".join((tok.text, "...", tok.dep_)))

Bundesanwaltschaft ... sb
erhebt ... ROOT
Anklage ... oa
gegen ... mnr
mutmaßlichen ... nk
Schweizer ... nk
Spion ... nk
. ... punct
Komm ... ROOT
hier ... mo
! ... punct
San ... pnc
Francisco ... sb
erwägt ... ROOT
Verbot ... oa
von ... pg
Lieferrobotern ... nk
. ... punct
Autonome ... nk
Fahrzeuge ... sb
verlagern ... ROOT
Haftpflicht ... oa
auf ... mo
Hersteller ... nk
. ... punct
Wo ... mo
bist ... ROOT
du ... sb
? ... punct


In [35]:
# Lookup table from a tag / label string to a short English description.
# It covers, in this order: coarse part-of-speech tags, fine-grained English
# tags, German (TIGER) tags, Chinese tags, chunk labels, English and German
# dependency labels, and named-entity labels.
# Example: GLOSSARY["sb"] == "subject", GLOSSARY["oa"] == "accusative object".
# NOTE(review): this looks like a copy of spaCy's own glossary (the data behind
# spacy.explain) — TODO confirm, and consider using spacy.explain(label) instead
# of maintaining a copy here.
GLOSSARY = {
    # Coarse-grained part-of-speech tags
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "EOL": "end of line",
    "SPACE": "space",
    # Fine-grained tags (the descriptions mention English and, for a few, Chinese)
    ".": "punctuation mark, sentence closer",
    ",": "punctuation mark, comma",
    "-LRB-": "left round bracket",
    "-RRB-": "right round bracket",
    "``": "opening quotation mark",
    '""': "closing quotation mark",
    "''": "closing quotation mark",
    ":": "punctuation mark, colon or ellipsis",
    "$": "symbol, currency",
    "#": "symbol, number sign",
    "AFX": "affix",
    "CC": "conjunction, coordinating",
    "CD": "cardinal number",
    "DT": "determiner",
    "EX": "existential there",
    "FW": "foreign word",
    "HYPH": "punctuation mark, hyphen",
    "IN": "conjunction, subordinating or preposition",
    "JJ": "adjective (English), other noun-modifier (Chinese)",
    "JJR": "adjective, comparative",
    "JJS": "adjective, superlative",
    "LS": "list item marker",
    "MD": "verb, modal auxiliary",
    "NIL": "missing tag",
    "NN": "noun, singular or mass",
    "NNP": "noun, proper singular",
    "NNPS": "noun, proper plural",
    "NNS": "noun, plural",
    "PDT": "predeterminer",
    "POS": "possessive ending",
    "PRP": "pronoun, personal",
    "PRP$": "pronoun, possessive",
    "RB": "adverb",
    "RBR": "adverb, comparative",
    "RBS": "adverb, superlative",
    "RP": "adverb, particle",
    "TO": 'infinitival "to"',
    "UH": "interjection",
    "VB": "verb, base form",
    "VBD": "verb, past tense",
    "VBG": "verb, gerund or present participle",
    "VBN": "verb, past participle",
    "VBP": "verb, non-3rd person singular present",
    "VBZ": "verb, 3rd person singular present",
    "WDT": "wh-determiner",
    "WP": "wh-pronoun, personal",
    "WP$": "wh-pronoun, possessive",
    "WRB": "wh-adverb",
    "SP": "space (English), sentence-final particle (Chinese)",
    "ADD": "email",
    "NFP": "superfluous punctuation",
    "GW": "additional word in multi-word expression",
    "XX": "unknown",
    "BES": 'auxiliary "be"',
    "HVS": 'forms of "have"',
    "_SP": "whitespace",
    # POS Tags (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    "$(": "other sentence-internal punctuation mark",
    "$,": "comma",
    "$.": "sentence-final punctuation mark",
    "ADJA": "adjective, attributive",
    "ADJD": "adjective, adverbial or predicative",
    "APPO": "postposition",
    "APPR": "preposition; circumposition left",
    "APPRART": "preposition with article",
    "APZR": "circumposition right",
    "ART": "definite or indefinite article",
    "CARD": "cardinal number",
    "FM": "foreign language material",
    "ITJ": "interjection",
    "KOKOM": "comparative conjunction",
    "KON": "coordinate conjunction",
    "KOUI": 'subordinate conjunction with "zu" and infinitive',
    "KOUS": "subordinate conjunction with sentence",
    "NE": "proper noun",
    "NNE": "proper noun",
    "PAV": "pronominal adverb",
    "PROAV": "pronominal adverb",
    "PDAT": "attributive demonstrative pronoun",
    "PDS": "substituting demonstrative pronoun",
    "PIAT": "attributive indefinite pronoun without determiner",
    "PIDAT": "attributive indefinite pronoun with determiner",
    "PIS": "substituting indefinite pronoun",
    "PPER": "non-reflexive personal pronoun",
    "PPOSAT": "attributive possessive pronoun",
    "PPOSS": "substituting possessive pronoun",
    "PRELAT": "attributive relative pronoun",
    "PRELS": "substituting relative pronoun",
    "PRF": "reflexive personal pronoun",
    "PTKA": "particle with adjective or adverb",
    "PTKANT": "answer particle",
    "PTKNEG": "negative particle",
    "PTKVZ": "separable verbal particle",
    "PTKZU": '"zu" before infinitive',
    "PWAT": "attributive interrogative pronoun",
    "PWAV": "adverbial interrogative or relative pronoun",
    "PWS": "substituting interrogative pronoun",
    "TRUNC": "word remnant",
    "VAFIN": "finite verb, auxiliary",
    "VAIMP": "imperative, auxiliary",
    "VAINF": "infinitive, auxiliary",
    "VAPP": "perfect participle, auxiliary",
    "VMFIN": "finite verb, modal",
    "VMINF": "infinitive, modal",
    "VMPP": "perfect participle, modal",
    "VVFIN": "finite verb, full",
    "VVIMP": "imperative, full",
    "VVINF": "infinitive, full",
    "VVIZU": 'infinitive with "zu", full',
    "VVPP": "perfect participle, full",
    "XY": "non-word containing non-letter",
    # POS Tags (Chinese)
    # OntoNotes / Chinese Penn Treebank
    # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
    # (keys commented out below already appear earlier in this dict)
    "AD": "adverb",
    "AS": "aspect marker",
    "BA": "把 in ba-construction",
    # "CD": "cardinal number",
    "CS": "subordinating conjunction",
    "DEC": "的 in a relative clause",
    "DEG": "associative 的",
    "DER": "得 in V-de const. and V-de-R",
    "DEV": "地 before VP",
    "ETC": "for words 等, 等等",
    # "FW": "foreign words"
    "IJ": "interjection",
    # "JJ": "other noun-modifier",
    "LB": "被 in long bei-const",
    "LC": "localizer",
    "M": "measure word",
    "MSP": "other particle",
    # "NN": "common noun",
    "NR": "proper noun",
    "NT": "temporal noun",
    "OD": "ordinal number",
    "ON": "onomatopoeia",
    "P": "preposition excluding 把 and 被",
    "PN": "pronoun",
    "PU": "punctuation",
    "SB": "被 in short bei-const",
    # "SP": "sentence-final particle",
    "VA": "predicative adjective",
    "VC": "是 (copula)",
    "VE": "有 as the main verb",
    "VV": "other verb",
    # Noun chunks
    "NP": "noun phrase",
    "PP": "prepositional phrase",
    "VP": "verb phrase",
    "ADVP": "adverb phrase",
    "ADJP": "adjective phrase",
    "SBAR": "subordinating conjunction",
    "PRT": "particle",
    "PNP": "prepositional noun phrase",
    # Dependency Labels (English)
    # ClearNLP / Universal Dependencies
    # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
    "acl": "clausal modifier of noun (adjectival clause)",
    "acomp": "adjectival complement",
    "advcl": "adverbial clause modifier",
    "advmod": "adverbial modifier",
    "agent": "agent",
    "amod": "adjectival modifier",
    "appos": "appositional modifier",
    "attr": "attribute",
    "aux": "auxiliary",
    "auxpass": "auxiliary (passive)",
    "case": "case marking",
    "cc": "coordinating conjunction",
    "ccomp": "clausal complement",
    "clf": "classifier",
    "complm": "complementizer",
    "compound": "compound",
    "conj": "conjunct",
    "cop": "copula",
    "csubj": "clausal subject",
    "csubjpass": "clausal subject (passive)",
    "dative": "dative",
    "dep": "unclassified dependent",
    "det": "determiner",
    "discourse": "discourse element",
    "dislocated": "dislocated elements",
    "dobj": "direct object",
    "expl": "expletive",
    "fixed": "fixed multiword expression",
    "flat": "flat multiword expression",
    "goeswith": "goes with",
    "hmod": "modifier in hyphenation",
    "hyph": "hyphen",
    "infmod": "infinitival modifier",
    "intj": "interjection",
    "iobj": "indirect object",
    "list": "list",
    "mark": "marker",
    "meta": "meta modifier",
    "neg": "negation modifier",
    "nmod": "modifier of nominal",
    "nn": "noun compound modifier",
    "npadvmod": "noun phrase as adverbial modifier",
    "nsubj": "nominal subject",
    "nsubjpass": "nominal subject (passive)",
    "nounmod": "modifier of nominal",
    "npmod": "noun phrase as adverbial modifier",
    "num": "number modifier",
    "number": "number compound modifier",
    "nummod": "numeric modifier",
    "oprd": "object predicate",
    "obj": "object",
    "obl": "oblique nominal",
    "orphan": "orphan",
    "parataxis": "parataxis",
    "partmod": "participal modifier",
    "pcomp": "complement of preposition",
    "pobj": "object of preposition",
    "poss": "possession modifier",
    "possessive": "possessive modifier",
    "preconj": "pre-correlative conjunction",
    "prep": "prepositional modifier",
    "prt": "particle",
    "punct": "punctuation",
    "quantmod": "modifier of quantifier",
    "rcmod": "relative clause modifier",
    "relcl": "relative clause modifier",
    "reparandum": "overridden disfluency",
    "root": "root",
    "ROOT": "root",
    "vocative": "vocative",
    "xcomp": "open clausal complement",
    # Dependency labels (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    # currently missing: 'cc' (comparative complement) because of conflict
    # with English labels
    "ac": "adpositional case marker",
    "adc": "adjective component",
    "ag": "genitive attribute",
    "ams": "measure argument of adjective",
    "app": "apposition",
    "avc": "adverbial phrase component",
    "cd": "coordinating conjunction",
    "cj": "conjunct",
    "cm": "comparative conjunction",
    "cp": "complementizer",
    "cvc": "collocational verb construction",
    "da": "dative",
    "dh": "discourse-level head",
    "dm": "discourse marker",
    "ep": "expletive es",
    "hd": "head",
    "ju": "junctor",
    "mnr": "postnominal modifier",
    "mo": "modifier",
    "ng": "negation",
    "nk": "noun kernel element",
    "nmc": "numerical component",
    "oa": "accusative object",
    "oc": "clausal object",
    "og": "genitive object",
    "op": "prepositional object",
    "par": "parenthetical element",
    "pd": "predicate",
    "pg": "phrasal genitive",
    "ph": "placeholder",
    "pm": "morphological particle",
    "pnc": "proper noun component",
    "rc": "relative clause",
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",
    "uc": "unit component",
    "vo": "vocative",
    # Named-entity labels
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws.",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": 'Percentage, including "%"',
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": '"first", "second", etc.',
    "CARDINAL": "Numerals that do not fall under another type",
    "PER": "Named person or family.",
    "MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
    "EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
    "PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
    "DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
    "GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
    "GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}

**5. Extract entity pairs**

In [26]:
# Dependency labels that get_entities recognises in addition to the English
# "...subj..." / "...obj..." / "compound" / "...mod" conventions. They are the
# German TIGER labels produced by the de_core_news_sm pipeline this notebook
# loads (e.g. "San ... pnc", "Francisco ... sb", "Verbot ... oa").
_SUBJECT_DEPS = frozenset({"sb"})              # subject
_OBJECT_DEPS = frozenset({"oa", "da", "og"})   # accusative / dative / genitive object
_COMPOUND_DEPS = frozenset({"compound", "pnc"})  # compound / proper noun component


def _join_parts(*parts):
  """Join the non-empty strings in ``parts`` with single spaces."""
  return " ".join(part for part in parts if part)


def get_entities(text):
  """Extract a (subject, object) entity pair from ``text``.

  The text is parsed with the module-level ``nlp`` pipeline and its tokens are
  scanned left to right (punctuation tokens are skipped):

  * a compound-like token (``compound`` or German ``pnc``) is remembered as a
    prefix; two such tokens in a row are combined;
  * a token whose label ends in ``mod`` is remembered as a modifier;
  * a subject token (label containing ``subj``, or German ``sb``) becomes the
    first entity, built from modifier + prefix + token text; the remembered
    prefix and modifier are then cleared;
  * an object token (label containing ``obj``, or German ``oa``/``da``/``og``)
    becomes the second entity, built the same way.

  When several subjects or objects occur, the last one wins.

  Args:
    text: the sentence to analyse.

  Returns:
    ``[subject_entity, object_entity]``; an entity that was not found is
    returned as the empty string.
  """
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency label of the previous non-punctuation token
  prv_tok_text = ""   # text of the previous non-punctuation token

  prefix = ""
  modifier = ""

  for tok in nlp(text):
    dep = tok.dep_

    # punctuation never contributes to an entity
    if dep == "punct":
      continue

    # compound word: remember it, extended by a directly preceding compound
    if dep in _COMPOUND_DEPS:
      prefix = tok.text
      if prv_tok_dep in _COMPOUND_DEPS:
        prefix = prv_tok_text + " " + tok.text

    # modifier: remember it, extended by a directly preceding compound
    if dep.endswith("mod"):
      modifier = tok.text
      if prv_tok_dep in _COMPOUND_DEPS:
        modifier = prv_tok_text + " " + tok.text

    # Membership tests instead of `str.find(...) == True`: find() returns an
    # index, so the old comparison only held when the substring started at
    # position 1 ("nsubj", "dobj") and missed e.g. the plain "obj" label.
    if "subj" in dep or dep in _SUBJECT_DEPS:
      ent1 = _join_parts(modifier, prefix, tok.text)
      prefix = ""
      modifier = ""

    if "obj" in dep or dep in _OBJECT_DEPS:
      ent2 = _join_parts(modifier, prefix, tok.text)

    prv_tok_dep = dep
    prv_tok_text = tok.text

  return [ent1, ent2]

**6. Predicate extraction**

In [27]:
def get_relation(text):
  """Return the predicate (relation) phrase found in ``text``.

  The text is parsed with the module-level ``nlp`` pipeline and searched with
  a spaCy ``Matcher`` for the token pattern

      ROOT  [prep]  [agent]  [ADJ]

  i.e. the root of a sentence optionally followed by a preposition, an agent
  and an adjective. ``prep`` and ``agent`` are English dependency labels, so
  with a German pipeline presumably only the ROOT (+ adjective) part matches.

  When the matcher reports several matches (for instance one per sentence),
  the last match in its result list is used.

  Args:
    text: the sentence(s) to analyse.

  Returns:
    The text of the matched span, or the empty string if nothing matched
    (e.g. for empty input) — previously that case raised IndexError.
  """
  doc = nlp(text)

  # Matcher class object
  matcher = Matcher(nlp.vocab)

  # define the pattern
  pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  matcher.add("matching_1", [pattern], on_match=None)

  matches = matcher(doc)
  if not matches:
    return ""

  # each match is (match_id, start, end); keep the last one
  _, start, end = matches[-1]
  return doc[start:end].text

In [28]:
# Extract the verb (predicate) from a text that contains many sentences.
# get_relation keeps only the last match, so a single relation is returned
# for the whole string rather than one per sentence.
get_relation("Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen. Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz! Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz. Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion. San Francisco erwägt Verbot von Lieferrobotern. Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller. Wo bist du? Was ist die Hauptstadt von Deutschland?. Komm hier! San Francisco erwägt Verbot von Lieferrobotern. Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller. Wo bist du?")

'bist'