-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepositional_phrases_callable.py
159 lines (136 loc) · 7.85 KB
/
prepositional_phrases_callable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
"""
Searches a sentence for prepositional phrases and prints them colour-coded
"""
import regex as re
from termcolor import colored
class PrepositionalPhrase:
    """Find prepositional phrases in a sentence and print them colour-coded.

    Creating an instance runs the search immediately: ``sentence`` is split
    on whitespace and, for every word found in ``preps``, the preposition is
    printed in red, followed by up to two of the words after it.  The colour
    of those words shows which rule accepted them:

    * red     -- pronoun, second preposition, or conjunction
    * yellow  -- demonstrative or strong adjective (plus noun)
    * green   -- weak adjective (plus noun)
    * magenta -- genitive or uninflected word (plus noun)

    Words are classified by word list or by ending only, so the output is a
    heuristic guess, not a parse.

    Attributes:
        sentence: The sentence passed to the constructor, unchanged.
    """

    # Word tables are frozensets: they are only ever used for membership
    # tests and must not be mutated through one instance for all the others.
    preps = frozenset({
        'abutan', 'abuten', 'abuton', 'æfter', 'æthindan',
        'amang', 'anforngean', 'begeonde', 'beheonan', 'beheonon',
        'behindan', 'behwon', 'beneoðan', 'betweoh', 'betweohs',
        'betwux', 'betweonan', 'betweonum', 'betweox', 'betweoxn',
        'betwih', 'betwisc', 'bewestan', 'binnan', 'butan', 'æt',
        'foranto', 'in', 'innan', 'into', 'mid', 'of', 'ofer', 'on',
        'onforan', 'oninnan', 'onmiddan', 'onufan', 'onuppan', 'uppon',
        'uppan', 'from', 'fram', 'to', 'toeacan', 'toforan', 'toforen',
        'toforon', 'towiðere', 'towiðre', 'ðurhut', 'þurh', 'ðurh', 'wið',
        'wiðer', 'wiðforan', 'wiðgeondan', 'wiðutan', 'ymbe', 'ymbutan',
    })
    datacc_pronouns = frozenset({
        'me', 'unc', 'us', 'þe', 'ðe', 'inc', 'eow', 'hine', 'him', 'hit',
        'hie', 'hiere', 'heo',
    })
    datacc_demonstratives = frozenset({
        'þisne', 'ðisne', 'þissum', 'ðissum', 'þis', 'ðis', 'þas', 'ðas',
        'þisse', 'ðisse', 'þone', 'ðone', 'þæm', 'þam', 'ðæm', 'ðam',
        'þæt', 'þat', 'ðæt', 'ðat', 'þa', 'ða', 'þære', 'þare', 'ðære',
        'ðare',
    })
    conjunctions = frozenset({'ond', 'and', 'oððe', 'oþþe', 'ac'})
    # The original list spelled 'ðisse' twice; every other entry comes as a
    # ð/þ pair, so the second one is taken to be 'þisse'.
    genitives = frozenset({
        'min', 'uncer', 'ure', 'his', 'hiere', 'þin', 'ðin', 'þinre',
        'ðinre', 'incer', 'eower', 'hiera', 'ðæs', 'þæs', 'ðære', 'þære',
        'ðisses', 'þisses', 'ðisse', 'þisse', 'ðara', 'þara', 'ðissa',
        'þissa',
    })

    # Ending-based guesses at part of speech.
    noun_strong = re.compile(r'[^r]e$|as$|u$|[^r]a$|um$|[bcdfghlmnprstw]$')
    noun_weak = re.compile(r'e$|an$|um$')
    adj_strong = re.compile(r'ne$|um$|e$|re$|ra$|u$|[^r]a$')
    adj_weak = re.compile(r'e$|an$|um$')
    genitive_pattern = re.compile(r'es$')
    # Endings that mark a word as inflected.  The original test wrapped this
    # alternation in ``[^...]``, turning it into a character class that
    # accepted almost any word and rejected uninflected ones such as "mann".
    _inflected_ending = re.compile(r'(?:e|um|an|ne|as|a)$')

    def __init__(self, sentence):
        """Scan *sentence* and print every prepositional phrase found.

        Args:
            sentence: Text to scan; it is split on whitespace.
        """
        self.sentence = sentence
        words = sentence.split()
        # Punctuation defeats both the list lookups and the ``$``-anchored
        # ending patterns, so every test runs on a stripped copy while the
        # words after the preposition are still printed as they were written.
        # ``\p{P}`` (any Unicode punctuation) needs the ``regex`` package,
        # which this module imports under the name ``re``.
        bare = [re.sub(r'\p{P}', '', token) for token in words]
        length = len(words)

        for index, word in enumerate(bare):
            if word not in self.preps:
                continue
            print('\n', colored(word, 'red'), end=' ')
            if index + 1 == length:  # the preposition is the last word
                break

            second, second_shown = bare[index + 1], words[index + 1]
            has_third = index + 2 < length
            third = bare[index + 2] if has_third else ''
            third_shown = words[index + 2] if has_third else ''

            # A dative/accusative pronoun completes the phrase on its own.
            if second in self.datacc_pronouns:
                print(colored(second_shown, 'red'), end=' ')
                continue

            # NOTE(review): the original comment speaks of the 2nd word being
            # another preposition, but the code tests the 3rd; kept as coded.
            if has_third and third in self.preps:
                print(colored(second_shown, 'red'), end='')
                print(colored(third_shown, 'red'))
                continue

            # TODO: add support for conjunctions (ond, oþþe, ac, etc.)
            # NOTE(review): the noun test is applied to the conjunction
            # itself, and every entry in ``conjunctions`` satisfies it, so it
            # never filters anything.  The original comments suggest the word
            # after the conjunction was meant; kept as coded until confirmed.
            if (has_third and third in self.conjunctions
                    and self.noun_strong.search(third)):
                print(colored(second_shown, 'red'), end=' ')
                print(colored(third_shown, 'red'), end=' ')
                continue

            # From here on, each branch that accepts the second word also
            # ends the phrase.  The original let some branches fall through,
            # which printed the same word again in another colour.
            if second in self.datacc_demonstratives:
                print(colored(second_shown, 'yellow'), end=' ')
                if has_third and self._looks_like_noun(third):
                    print(colored(third_shown, 'yellow'))
                continue

            if self.adj_weak.search(second):
                print(colored(second_shown, 'green'), end=' ')
                if has_third and self.noun_weak.search(third):
                    print(colored(third_shown, 'green'))
                continue

            if self.adj_strong.search(second):
                print(colored(second_shown, 'yellow'), end=' ')
                if has_third and self.noun_strong.search(third):
                    print(colored(third_shown, 'yellow'))
                continue

            if second in self.genitives:
                print(colored(second_shown, 'magenta'), end=' ')
                if has_third and self._looks_like_noun(third):
                    print(colored(third_shown, 'magenta'))
                continue

            if self.genitive_pattern.search(second):
                # A genitive in -es: whatever follows is printed unchecked.
                print(colored(second_shown, 'magenta'), end=' ')
                if has_third:
                    print(colored(third_shown, 'magenta'))
                continue

            if not self._inflected_ending.search(second):
                print(colored(second_shown, 'magenta'), end=' ')
                if has_third and self.noun_strong.search(third):
                    print(colored(third_shown, 'magenta'))

    def _looks_like_noun(self, word):
        """Return True if *word* has a strong or weak noun ending."""
        return bool(self.noun_strong.search(word)
                    or self.noun_weak.search(word))

    def maybesecondword(self, test2):
        """Return True if *test2* could directly follow a preposition.

        A word qualifies when it is a listed demonstrative or genitive, or
        when it has a strong-adjective, weak-adjective or ``-es`` genitive
        ending.

        Args:
            test2: The candidate word, without punctuation.

        Returns:
            bool: True when any of the tests above succeeds, else False.
        """
        if test2 in self.datacc_demonstratives or test2 in self.genitives:
            return True
        endings = (self.adj_strong, self.adj_weak, self.genitive_pattern)
        return any(pattern.search(test2) for pattern in endings)

    def isitanobject(self, test):
        """Return True if *test* could be the object of a preposition.

        A word qualifies when it is a listed dative/accusative pronoun or
        has a strong-noun ending.

        Args:
            test: The candidate word, without punctuation.

        Returns:
            bool: True when either test succeeds, else False.
        """
        return (test in self.datacc_pronouns
                or bool(self.noun_strong.search(test)))