In [1]:
import spacy
from spacy.lang.en import English

# Sentencizer
https://spacy.io/usage/linguistic-features#sbd

In [2]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another sentence.")
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [3]:
name = "../data/Original/iued_test_original.txt"
#name = "../data/Original/iued_test_original.vrt"
with open (name, "r") as myfile:
    data=myfile.read().replace('\n', '')

In [4]:
print(data)



In [5]:
doc = nlp(data)
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text)
    print('***')

As part of the product upgrade at the end of twenty thirteen , the Audi A eight gained a number of new features relating to driver assist systems .
***
These new features are the topic of this service T-V programme .
***
We will be looking at the following driver assist systems : Audi Audi active Lane assist , Audi park assist , Audi night vision assist , and the head up display .
***
The changeover from hydraulical steering to electromechanical steering now means that it is possible to offer Audi active lane assist in the Audi A
***
eight as well .
***
Due to the planned steering interventions , this system requires electromechanical steering .
***
Until the product upgrade took place on the Audi A eight , only Audi lane assist was offered on this model , but not the version that actively intervenes in the steering .
***
This version has been available for some time now on the Audi A six and Audi A seven model .
***
Audi active lane assist , which is now available in the Audi A eight 

This gives somewhat accurate results, with some errors after numbers. You can also use a trained model, however this will not work on uncommon texts.

In [6]:
doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print('***')    

As part of the product upgrade at the end of twenty thirteen , the Audi A eight gained a number of new features relating to driver assist systems .
***
These new features are the topic of this service T-V programme .
***
We will be looking at the following driver assist systems : Audi Audi active Lane assist , Audi park assist , Audi night vision assist , and the head up display .
***
The changeover from hydraulical steering to electromechanical steering now means that it is possible to offer Audi active lane assist in the Audi A
***
eight as well .
***
Due to the planned steering interventions , this system requires electromechanical steering .
***
Until the product upgrade took place on the Audi A eight , only Audi lane assist was offered on this model , but not the version that actively intervenes in the steering .
***
This version has been available for some time now on the Audi A six and Audi A seven model .
***
Audi active lane assist , which is now available in the Audi A eight 

Also fails for the example here. Then there is the one based on a statistical model.

In [7]:
nlp.enable_pipe("senter")
doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print('***')        

As part of the product upgrade at the end of twenty thirteen , the Audi A eight gained a number of new features relating to driver assist systems .
***
These new features are the topic of this service T-V programme .
***
We will be looking at the following driver assist systems : Audi Audi active Lane assist , Audi park assist , Audi night vision assist , and the head up display .
***
The changeover from hydraulical steering to electromechanical steering now means that it is possible to offer Audi active lane assist in the Audi A
***
eight as well .
***
Due to the planned steering interventions , this system requires electromechanical steering .
***
Until the product upgrade took place on the Audi A eight , only Audi lane assist was offered on this model , but not the version that actively intervenes in the steering .
***
This version has been available for some time now on the Audi A six and Audi A seven model .
***
Audi active lane assist , which is now available in the Audi A eight 

Directly use the sentencizer without the pipeline - this one looks at punctuation.

In [8]:
nlp = English()  # just the language with no pipeline
nlp.add_pipe("sentencizer")
doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print('***')            

As part of the product upgrade at the end of twenty thirteen , the Audi A eight gained a number of new features relating to driver assist systems .
***
These new features are the topic of this service T-V programme .
***
We will be looking at the following driver assist systems : Audi Audi active Lane assist , Audi park assist , Audi night vision assist , and the head up display .
***
The changeover from hydraulical steering to electromechanical steering now means that it is possible to offer Audi active lane assist in the Audi A eight as well .
***
Due to the planned steering interventions , this system requires electromechanical steering .
***
Until the product upgrade took place on the Audi A eight , only Audi lane assist was offered on this model , but not the version that actively intervenes in the steering .
***
This version has been available for some time now on the Audi A six and Audi A seven model .
***
Audi active lane assist , which is now available in the Audi A eight , is

Seems to work correctly. What is the difference to the pipeline? In the DW scripts, the other components are disabled via the "exclude" command - should be faster as pipeline is not loaded at all.

In [9]:
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
    # Do something with the doc here
    print([(ent.text, ent.label_) for ent in doc.ents])

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]
[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]


In [10]:
nlp = spacy.load("en_core_web_sm", exclude=["tagger", "ner", "attribute_ruler", "lemmatizer"])
for doc in nlp.pipe(texts):
    for sent in doc.sents:
        print(sent.text)
        print('***') 

Net income was $9.4 million compared to the prior year of $2.7 million.
***
Revenue exceeded twelve billion dollars, with a loss of $1b.
***


# Tokenizer
https://spacy.io/usage/linguistic-features#tokenization  
We need to allow for special case rules. 
```
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)
```

Also, there are custom tokenizer libraries that one may want to load. Probably we would want to keep it so that users can specify their custom tokenizers in addition to the standard one from spaCy.

In [11]:
doc = nlp(data)
for token in doc:
    print(token.text)

As
part
of
the
product
upgrade
at
the
end
of
twenty
thirteen
,
the
Audi
A
eight
gained
a
number
of
new
features
relating
to
driver
assist
systems
.
These
new
features
are
the
topic
of
this
service
T
-
V
programme
.
We
will
be
looking
at
the
following
driver
assist
systems
:
Audi
Audi
active
Lane
assist
,
Audi
park
assist
,
Audi
night
vision
assist
,
and
the
head
up
display
.
The
changeover
from
hydraulical
steering
to
electromechanical
steering
now
means
that
it
is
possible
to
offer
Audi
active
lane
assist
in
the
Audi
A
eight
as
well
.
Due
to
the
planned
steering
interventions
,
this
system
requires
electromechanical
steering
.
Until
the
product
upgrade
took
place
on
the
Audi
A
eight
,
only
Audi
lane
assist
was
offered
on
this
model
,
but
not
the
version
that
actively
intervenes
in
the
steering
.
This
version
has
been
available
for
some
time
now
on
the
Audi
A
six
and
Audi
A
seven
model
.
Audi
active
lane
assist
,
which
is
now
available
in
the
Audi
A
eight
,
is
very
similar
to
the
syste

# Lemmatizer
https://spacy.io/usage/linguistic-features#lemmatization

needs package spacy_lookups_data to run

In [12]:
lemmatizer = nlp.add_pipe("lemmatizer")  # need to be carefull which components are already in the pipeline or not. get_pipe() throws me an error when running this from the top
print(lemmatizer.mode)  # 'rule'
lemmatizer.initialize(lookups=None)
doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])

rule
['i', 'was', 'reading', 'the', 'paper', '.']




In [13]:
doc = nlp(data)
print([token.lemma_ for token in doc])



Should punctuation be excluded?

# POS tagger
https://spacy.io/usage/linguistic-features#pos-tagging

In [14]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple apple   nsubj Xxxxx True False
is is   aux xx True True
looking looking   ROOT xxxx True False
at at   prep xx True True
buying buying   pcomp xxxx True False
U.K. u.k.   compound X.X. False False
startup startup   dobj xxxx True False
for for   prep xxx True True
$ $   quantmod $ False False
1 1   compound d False False
billion billion   pobj xxxx True False


In [15]:
doc = nlp(data)
for token in doc:
    print(token.text, token.pos_, token.tag_)

As  
part  
of  
the  
product  
upgrade  
at  
the  
end  
of  
twenty  
thirteen  
,  
the  
Audi  
A  
eight  
gained  
a  
number  
of  
new  
features  
relating  
to  
driver  
assist  
systems  
.  
These  
new  
features  
are  
the  
topic  
of  
this  
service  
T  
-  
V  
programme  
.  
We  
will  
be  
looking  
at  
the  
following  
driver  
assist  
systems  
:  
Audi  
Audi  
active  
Lane  
assist  
,  
Audi  
park  
assist  
,  
Audi  
night  
vision  
assist  
,  
and  
the  
head  
up  
display  
.  
The  
changeover  
from  
hydraulical  
steering  
to  
electromechanical  
steering  
now  
means  
that  
it  
is  
possible  
to  
offer  
Audi  
active  
lane  
assist  
in  
the  
Audi  
A  
eight  
as  
well  
.  
Due  
to  
the  
planned  
steering  
interventions  
,  
this  
system  
requires  
electromechanical  
steering  
.  
Until  
the  
product  
upgrade  
took  
place  
on  
the  
Audi  
A  
eight  
,  
only  
Audi  
lane  
assist  
was  
offered  
on 

# Morphology
https://spacy.io/usage/linguistic-features#morphology

In [16]:
print("Pipeline:", nlp.pipe_names)
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(token.morph.get("PronType"))  # ['Prs']

Pipeline: ['tok2vec', 'parser', 'lemmatizer']

[]


# Constituency

# Collocation

# Word vectors

# Dependency

# Named entities

Für einzelne Tokens: [Matcher](https://spacy.io/usage/rule-based-matching) \
Für ganze Sätze: [Phrasematcher](https://spacy.io/usage/rule-based-matching#phrasematcher) \
Ich schätze für den Moment sind wir nur an einzelnen Tokens interessiert? Oder an allen einzigartigen Token im ganzen Text?

In [17]:
from spacy.matcher import Matcher
from collections import defaultdict # 

# this works on the same example as above

nlp = spacy.load("en_core_web_sm")
# initialize the matcher, the vocab has to be the same as for the text
matcher = Matcher(nlp.vocab)

# idealy the user specifies what he wants to search and what attribute to assign?
terms = [[{"LOWER":"audi"}], [{"LOWER":"improvements"}], [{"LOWER":"parking"}]]

# also supports regular expressions:
terms += [[{"TEXT":{"REGEX":"^[Ii](\.?|f)$"}}]] # search for I, i, If, if

print("Query: {}".format(terms))

# add the terms to look for to the mathcer
matcher.add("Query", terms)

# load the data into doc
doc = nlp(data)
# run the matcher on the text in doc
matches = matcher(doc)

# get the indices (would correspond to corpus position from cwb?)
indices = [[start,end] for _,start,end in matches]

print(indices)

dict_out = defaultdict(list) # default dict initializes the value of a new key that is added with an empty list
                             # which we can then append to

# put the found indices to access the searched terms in a dictionary where they are available via said terms
# We have to go through all the found entities to confirm what term they correspond to...
# For large texts where we have many hits faster to split the query beforehand? Maybe parallel searching?
for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0]:index_pair[1]])].append(index_pair)

# display the output
for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

#for index in indices:
#    print('{} found at index location {}'.format(doc[index], index))

#print(matches)

#for match_id, start, end in matches:
#    string_id = nlp.vocab.strings[match_id] #Get string representation
#    span = doc[start:end] # matched span
#    print(match_id, string_id, start, end, span.text)

Query: [[{'LOWER': 'audi'}], [{'LOWER': 'improvements'}], [{'LOWER': 'parking'}], [{'TEXT': {'REGEX': '^[Ii](\\.?|f)$'}}]]
[[14, 15], [54, 55], [55, 56], [60, 61], [64, 65], [91, 92], [97, 98], [124, 125], [129, 130], [160, 161], [164, 165], [169, 170], [180, 181], [192, 193], [209, 210], [211, 212], [223, 224], [232, 233], [255, 256], [266, 267], [311, 312], [312, 313], [349, 350], [359, 360], [391, 392], [399, 400], [454, 455], [460, 461], [467, 468], [472, 473], [478, 479], [497, 498], [515, 516], [525, 526], [530, 531], [544, 545], [564, 565], [576, 577], [595, 596], [617, 618], [629, 630], [656, 657], [690, 691], [713, 714], [754, 755], [771, 772], [788, 789], [803, 804], [834, 835], [879, 880], [886, 887], [911, 912], [952, 953], [960, 961], [968, 969]]
Audi found at location [[14, 15], [54, 55], [55, 56], [60, 61], [64, 65], [91, 92], [97, 98], [124, 125], [129, 130], [160, 161], [164, 165], [169, 170], [180, 181], [192, 193], [211, 212], [223, 224], [232, 233], [266, 267], [312

In [18]:
# can also search for words of certain length or above/below certain lengths
matcher = Matcher(nlp.vocab)

pattern = [[{"LENGTH":{"==":10}}]]#, [{"LENGTH":{"<=":1}}], [{"LENGTH":{">=":12}}]]

matcher.add("Query", pattern)
matches = matcher(doc)

indices = [[start,end] for _,start,end in matches]

dict_out = defaultdict(list)

for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0]:index_pair[1]])].append(index_pair)

for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

changeover found at location [[76, 77]].
intervenes found at location [[144, 145]].
innovative found at location [[202, 203]].
suppressed found at location [[281, 282], [304, 305]].
technology found at location [[420, 421], [573, 574]].
determines found at location [[547, 548]].
ultrasonic found at location [[554, 555]].
pedestrian found at location [[666, 667]].
highlights found at location [[700, 701]].
likelihood found at location [[717, 718]].
contribute found at location [[736, 737]].
supporting found at location [[739, 740]].
windscreen found at location [[859, 860]].
programmes found at location [[945, 946]].


In [19]:
# search for token pattern

matcher = Matcher(nlp.vocab)

# seach for the different types of cars with a "wildcard token pattern" leaving the last token empty
pattern = [[{"ORTH":"Audi"}, {"ORTH": "A"}, {}]] 

matcher.add("Query", pattern)

matches = matcher(doc)

indices = [[start,end] for _,start,end in matches]

for _, start, end in matches:
    indices.append([start, end])

dict_out = defaultdict(list)

for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0]:index_pair[1]])].append(index_pair)

for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

Audi A eight found at location [[14, 17], [97, 100], [124, 127], [180, 183], [232, 235], [472, 475], [629, 632], [690, 693], [771, 774], [788, 791], [14, 17], [97, 100], [124, 127], [180, 183], [232, 235], [472, 475], [629, 632], [690, 693], [771, 774], [788, 791]].
Audi A six found at location [[160, 163], [192, 195], [497, 500], [879, 882], [160, 163], [192, 195], [497, 500], [879, 882]].
Audi A seven found at location [[164, 167], [911, 914], [164, 167], [911, 914]].


In [46]:
# put it in a function

def search_text(query, nlp, doc):
    matcher = Matcher(nlp.vocab)

    matcher.add("Query", query)

    matches = matcher(doc)

    indices = [[start,end] for _,start,end in matches]

    dict_out = defaultdict(list)

    for index_pair in indices:
        dict_out["{}".format(doc[index_pair[0]:index_pair[1]])].append(index_pair)

    #for key in dict_out:
    #    print("{} found at location {}.".format(key, dict_out[key]))
    return dict_out


In [48]:
query = [[{"ORTH":"Audi"}, {"ORTH": "A"}, {}]] 

test1 = search_text(query, nlp, doc)

print(test1)

print('*'*50)

# can just add different queries together
query += terms

test2 = search_text(query, nlp, doc)

print(test2)

defaultdict(<class 'list'>, {'Audi A eight': [[14, 17], [97, 100], [124, 127], [180, 183], [232, 235], [472, 475], [629, 632], [690, 693], [771, 774], [788, 791]], 'Audi A six': [[160, 163], [192, 195], [497, 500], [879, 882]], 'Audi A seven': [[164, 167], [911, 914]]})
**************************************************
defaultdict(<class 'list'>, {'Audi': [[14, 15], [54, 55], [55, 56], [60, 61], [64, 65], [91, 92], [97, 98], [124, 125], [129, 130], [160, 161], [164, 165], [169, 170], [180, 181], [192, 193], [211, 212], [223, 224], [232, 233], [266, 267], [312, 313], [399, 400], [467, 468], [472, 473], [497, 498], [544, 545], [576, 577], [595, 596], [617, 618], [629, 630], [690, 691], [754, 755], [771, 772], [788, 789], [803, 804], [834, 835], [879, 880], [911, 912], [960, 961], [968, 969]], 'Audi A eight': [[14, 17], [97, 100], [124, 127], [180, 183], [232, 235], [472, 475], [629, 632], [690, 693], [771, 774], [788, 791]], 'Audi A six': [[160, 163], [192, 195], [497, 500], [879, 882]]

In [28]:
def get_unique(doc):
    """Get number of unique words in doc"""
    
    out = 0
    seen = set()
    for token in doc:
        if token.text not in seen:
            out+=1
        seen.add(token.text)
    return out

In [29]:
def search_token(query, nlp, doc):
    """search text for specific token and return all the found locations in dict."""

    matcher = Matcher(nlp.vocab)

    matcher.add("Query", query)

    matches = matcher(doc)

    indices = [[start,end] for _,start,end in matches]

    dict_out = defaultdict(list)

    dict_out["{}".format(doc[indices[0][0]:indices[0][1]])].append(indices)

    #for key in dict_out:
    #    print("{} found at location {}.".format(key, dict_out[key]))
    return dict_out


In [43]:
def named_entities(nlp, doc):
    """Get locations for all umique words in doc into a dictionary, case sensitive"""
    
    named_entities = None

    # get the number of unique tokens in text, may save some time in some cases
    # still worth it for large corpora?
    unique = get_unique(doc)

    for token in doc:
        
        if named_entities:
            # if the dictionary is not empty:
            if token.text in named_entities:
                # if the token is already in the dictionary:
                pass
            elif token.text not in named_entities:
                # if the token is not already in there add it:
                named_entities.update(search_token([[{"ORTH":"{}".format(token)}]], nlp, doc))
        else:
            # if the dictionary hasn't been initialized do so with first token
            named_entities = search_token([[{"ORTH":"{}".format(token)}]], nlp, doc)
        
        if len(named_entities) == unique:
            # if we have passed each unique word already there is no need to continue
            break

    return named_entities

In [44]:
named_ent = named_entities(nlp, doc)

In [45]:
for key in named_ent:
    print("{}: {}".format(key, named_ent[key]))

As: [[[0, 1], [369, 370], [727, 728]]]
part: [[[1, 2]]]
of: [[[2, 3], [9, 10], [20, 21], [35, 36], [275, 276], [289, 290], [342, 343], [423, 424], [466, 467], [487, 488], [537, 538], [550, 551], [560, 561], [662, 663], [718, 719], [829, 830]]]
the: [[[3, 4], [7, 8], [13, 14], [33, 34], [48, 49], [70, 71], [96, 97], [105, 106], [117, 118], [123, 124], [140, 141], [146, 147], [159, 160], [179, 180], [188, 189], [191, 192], [215, 216], [231, 232], [247, 248], [256, 257], [273, 274], [287, 288], [295, 296], [320, 321], [329, 330], [354, 355], [365, 366], [373, 374], [382, 383], [419, 420], [424, 425], [445, 446], [457, 458], [464, 465], [471, 472], [476, 477], [496, 497], [517, 518], [535, 536], [538, 539], [548, 549], [551, 552], [558, 559], [561, 562], [572, 573], [614, 615], [628, 629], [657, 658], [668, 669], [680, 681], [684, 685], [689, 690], [703, 704], [711, 712], [731, 732], [740, 741], [787, 788], [797, 798], [802, 803], [818, 819], [849, 850], [858, 859], [865, 866], [874, 875],

In [37]:
print(len(named_ent))

313
