""" Script for extracting all prepositional phrases from SynTagRus, test subcorpus 
    (https://github.com/UniversalDependencies/UD_Russian-SynTagRus).
    Made to match pphrase (https://github.com/merionum/pphrase) output.
"""

### Setting up Token class for easier data parsing

# In [1]:
class Token:
    """One token row of a dependency-annotated sentence.

    The constructor parameters are filled positionally from the
    tab-separated columns of a single token line of the corpus file, so
    their order matters. All values are stored as given (strings when read
    from the file); only the surface form is normalised to lower case.

    Attributes:
        token_id: id of the token inside its sentence.
        token: lower-cased surface form, or None when not supplied.
        lemma: lemma of the token.
        POS: part-of-speech tag (e.g. 'ADP').
        empty: fifth column of the row; stored but not interpreted here.
        features: morphological features column.
        head: id of the syntactic head of this token.
        deprel: dependency relation to the head (e.g. 'case', 'fixed').
        deps: enhanced dependencies column.
        comment: last column of the row.
    """

    def __init__(self, token_id=None,
                 token=None, lemma=None,
                 POS=None, empty=None,
                 features=None, head=None,
                 deprel=None, deps=None, comment=None):

        self.token_id = token_id
        # `token` defaults to None, so guard the normalisation instead of
        # crashing with AttributeError on a partially specified token.
        self.token = token.lower() if token is not None else None
        self.lemma = lemma
        self.POS = POS
        self.empty = empty
        self.features = features
        self.head = head
        self.deprel = deprel
        self.deps = deps
        self.comment = comment

    def __repr__(self):
        return (f"{type(self).__name__}(token_id={self.token_id!r}, "
                f"token={self.token!r}, POS={self.POS!r}, "
                f"head={self.head!r}, deprel={self.deprel!r})")

### Parsing corpus data

# In [2]:
# Sentences (tuples of Token objects) that contain at least one adposition.
corpus = []
# Tokens of the sentence currently being read.
sent = []

filename = 'ru_syntagrus-ud-test.conllu'

with open(filename, 'r', encoding='utf8') as f:
    for line in f:
        if line.startswith('#'):
            # sentence-level metadata lines carry no token data
            continue

        if line.strip():
            # token row: tab-separated columns map positionally onto Token
            sent_data = line.rstrip('\n').split('\t')
            sent.append(Token(*sent_data))
            continue

        # blank line = sentence boundary
        if any(tok.POS == 'ADP' for tok in sent):
            corpus.append(tuple(sent))  # only load sentences with adpositions
        sent = []

# Flush the last sentence: without this it would be silently dropped when
# the file does not end with a blank line.
if any(tok.POS == 'ADP' for tok in sent):
    corpus.append(tuple(sent))
sent = []

### Extracting prepositional phrases of various structural types

# In [3]:
# First words of multiword prepositions whose *last* word is the adposition
# (the adposition is attached to them with the `fixed` relation).
PREP_FINAL_MULTIWORD_HEADS = frozenset(['несмотря', 'невзирая', 'вплоть', 'наравне'])

# One dict per extracted prepositional phrase, in corpus order, with the keys
# "phrase", "host", "prep" and "dependant".
prep_constructions = []

for sent_tuple in corpus:
    # Index the sentence once so heads can be resolved by id directly
    # instead of rescanning the whole sentence for every lookup
    # (token ids are expected to be unique within a sentence).
    tokens_by_id = {tok.token_id: tok for tok in sent_tuple}

    for item in sent_tuple:
        if item.POS != 'ADP':
            continue

        # simple preps & multiwords starting w/prep:
        if item.deprel == 'case':
            head_item = item

            # collect `fixed` children to rebuild a complex preposition:
            prep_parts = ' '.join(x.token for x in sent_tuple
                                  if x.head == item.token_id
                                  and x.deprel == 'fixed')
            if prep_parts:
                full_prep = ' '.join([item.token, prep_parts])
            else:
                full_prep = item.token

        # multiwords ending w/prep:
        elif item.deprel == 'fixed':
            head_item = tokens_by_id[item.head]  # first word of the multiword
            if head_item.token not in PREP_FINAL_MULTIWORD_HEADS:
                continue
            full_prep = ' '.join([head_item.token, item.token])

        else:
            continue

        # The word governed by the preposition is the head of the
        # preposition's head word; the host is that word's own head.
        head_id = head_item.token_id
        dep_item = tokens_by_id[head_item.head]
        if dep_item.deprel == 'root':
            continue  # a root dependant has no host to attach to
        host_item = tokens_by_id[dep_item.head]

        # restore original order within PP (ids compare numerically):
        ordered_ids = sorted((head_id, dep_item.token_id, host_item.token_id),
                             key=int)

        # Map ids back to tokens. The preposition slot is recognised by its
        # id (not by its dependency label), so the full preposition is
        # substituted for exactly that token and for no other phrase member.
        phrase = [full_prep if tok_id == head_id else tokens_by_id[tok_id].token
                  for tok_id in ordered_ids]

        # convert ordered constituents to pphrase format dict:
        phrase_dict = {
            "phrase": ' '.join(phrase),
            "host": host_item.token,
            "prep": full_prep,
            "dependant": dep_item.token,
        }

        # add PP to full list of PPs in corpus:
        prep_constructions.append(phrase_dict)

### Writing result to file

# In [5]:
# Dump the whole list of phrase dicts as its Python string representation.
serialized_phrases = str(prep_constructions)
with open('PPs_extracted.txt', mode='w+', encoding='utf8') as out_file:
    out_file.write(serialized_phrases)