-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepositional_phrases.py
167 lines (139 loc) · 7.1 KB
/
prepositional_phrases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#! usr/bin/env python3
"""
Identifies prepositional phrases in a sentence.
All prepositional phrases (PP) are composed of PRP + NP
where NP is optional [DEM | ADJ | DEM ADJ] and N
"""
import regex as re
from termcolor import colored
import get_text
# _____________________________________________ declarations
text = get_text.random_sentence('poetry') # gets a random sentence
# text = """x"""
words = text.split()
print(colored(text, 'blue'))
length = len(words)
print('\nNumber of words: ', length, '\n')
preps = ['abutan', 'abuten', 'abuton', 'æfter', 'æthindan',
'amang', 'anforngean', 'begeonde', 'beheonan', 'beheonon',
'behindan', 'behwon', 'beneoðan', 'betweoh', 'betweohs',
'betwux', 'betweonan', 'betweonum', 'betweox', 'betweoxn',
'betwih', 'betwisc', 'bewestan', 'binnan', 'butan', 'æt', 'for',
'foranto', 'in', 'innan', 'into', 'mid', 'of', 'ofer', 'on',
'onforan', 'oninnan', 'onmiddan', 'onufan', 'onuppan', 'uppon', 'uppan',
'from', 'fram', 'to', 'toeacan', 'toforan', 'toforen', 'toforon', 'towiðere',
'towiðre', 'ðurhut', 'þurh', 'ðurh', 'wið', 'wiðer', 'wiðforan',
'wiðgeondan', 'wiðutan', 'ymb', 'ymbe', 'ymbutan'
]
pronouns_nominative = ['ic', 'we', 'ðu', 'þu', 'eow', 'he', 'seo', 'hit', 'hi']
datacc_pronouns = ['me', 'unc', 'us', 'þe', 'ðe', 'inc', 'eow', 'hine', 'him', 'hit', 'hie', 'hy',
'hyra', 'hiere', 'heo']
datacc_demonstratives = ['þisne', 'ðisne', 'þissum', 'ðissum', 'þis',
'ðis', 'þas', 'ðas', 'þisse', 'ðisse', 'þas', 'ðas', 'þæs', 'ðæs',
'þone', 'ðone', 'þæm', 'þam', 'ðæm', 'ðam', 'þæt', 'þat',
'ðæt', 'ðat', 'þa', 'ða', 'þære', 'þare', 'ðære', 'ðare']
pro_adjs = ['oðre', 'oþre', 'oðrum', 'oþrum', 'oðra', 'oþra', 'eall', 'eallum', 'ealle', 'ealla',
'ealles', 'sum', 'sume', 'an', 'anne', 'anre']
conjunctions = ['ond', 'and', 'oððe', 'oþþe', 'ac']
noun_strong = re.compile(r'[^r]e$|as$|u$|[^r]a$|um$|æ$')
noun_weak = re.compile(r'an$')
adj_strong = re.compile(r'ne$|um$|e$|re$|ra$|u$|[^r]a$')
adj_weak = re.compile(r'e$|an$|um$')
genitives = ['min', 'minre', 'uncer', 'ure', 'his', 'hiere', 'þin', 'ðin', 'þinre', 'ðinre', 'incer', 'eower', 'hiera',
'ðæs', 'þæs', 'ðære', 'þære', 'ðisses', 'þisses', 'ðisse', 'ðisse', 'ðara', 'þara', 'ðissa', 'þissa']
genitive_pattern = re.compile(r'es$')
# ________________________________________________ defs
# define likely objects, but double-check them in the if statements
def maybe_theobject(test):
if test in datacc_pronouns:
return True
if test in pronouns_nominative:
return False
if re.search(noun_strong, test):
return True
if re.search(noun_weak, test):
return True
def check_next(test2):
if test2 in datacc_pronouns:
return True
if test2 in pronouns_nominative:
return False
if test2 in pro_adjs:
return True
if re.search(noun_strong, test2):
return True
if re.search(noun_weak, test2):
return True
if re.search(adj_strong, test2):
return True
if re.search(adj_weak, test2):
return True
def check_genitive(test3):
if test3 in genitives:
return True
if re.search(genitive_pattern, test3):
return True
for index, word in enumerate(words):
word = re.sub(r'\p{P}', '', word) # clean out punctuation
if word in preps: # found preposition, let's do this thing
print('\n', colored(word, 'red'), end=' ')
this_pp = word
# special cases
if index == length: # then PRP is the last word, so break
break
if index + 1 < length: # check for idiomatic forþamþe
if word == 'for' and re.search(r'(ð|þ)a(m|n)', words[index + 1]):
print(colored(words[index + 1], 'red'), end=' ')
if index + 2 < length:
if words[index + 2] == 'ðe' or words[index + 2] == 'þe':
print(colored(words[index + 2], 'red'), end=' ')
continue
if re.search(r'n(e|y|i)sse$', words[index +1]): # check for words ending in -nysse
print(colored(words[index + 1], 'red'), end=' ')
continue
if index + 2 < length: # check for reflexive pronoun
if re.search(r'selfe', words[index + 2]):
print(colored(words[index + 2], 'red'), end=' ')
# look right of the PRP
if index + 1 < length: # There is another word, so check PP-2nd word for POS
if words[index + 1] in datacc_pronouns: # a dat/acc pronoun always terminates a PP
print(colored(words[index + 1], 'red'))
continue
if words[index + 1] in datacc_demonstratives: # if PP-2nd is a demonstrative, check 3rd word etc
print(colored(words[index + 1], 'yellow'), end=' ')
if index + 2 < length: # ensure there is a 3rd word
if maybe_theobject(words[index + 2]):
print(colored(words[index + 2], 'yellow'), end=' ')
if index + 3 < length and check_next(words[index + 3]):
print(colored(words[index + 3], 'yellow'), end=' ')
continue
elif check_next(words[index + 2]):
print(colored(words[index + 2], 'yellow'))
continue
else:
continue
if maybe_theobject(words[index + 1]): # if 2nd word is the object of the preposition, check 3rd
print(colored(words[index + 1], 'cyan'), end=' ')
if index + 2 < length and check_next(words[index + 2]):
print(colored(words[index + 2], 'cyan'))
continue
continue
if check_next(words[index + 1]): # if 2nd word is not object, but something else, check 3rd
print(colored(words[index + 1], 'magenta'), end=' ')
if index + 2 < length and check_next(words[index + 2]):
print(colored(words[index + 2], 'magenta'))
continue
continue
if check_genitive(words[index + 1]): # if 2nd word genitive
print(colored(words[index + 1], 'green'), end=' ')
if index + 2 < length and check_next(words[index + 2]):
print(colored(words[index + 2], 'green'))
continue
if re.search(adj_weak, words[index + 1]): # if 2nd word weak adj, see if PP-3rd word is too
print(colored(words[index + 1], 'green'), end=' ')
if index + 2 < length and re.search(adj_weak, words[index + 2]):
print(colored(words[index + 2], 'green'))
continue
if index + 1 < length: # if all else fails, the 2nd word is likely the object
print(colored(words[index + 1], 'grey'), end=' ')
# TODO: Make this into a callable module