""" Script for extracting all ADP tokens from UD-format corpus.
    Here, only token-based list is saved due to inconsistent lemmatization
    of prepositional variants such as 'в-во' which makes lemma list meaningless.
"""

In [1]:
import pandas as pd

### Opening & parsing data

In [2]:
data = pd.read_csv('ru_syntagrus-ud-test.conllu', sep = "\t", comment = "#",
                   skip_blank_lines = False, usecols = [0, 1, 2, 3, 5, 6, 7, 8],
                   names = ['token_id', 'token', 'lemma', 'POS', 'features', 'head', 'deprel', 'deps'],
                   dtype = {'token_id' : str})

data.insert(column = 'sent_id', loc = 0, value = None)

sent_id = 0
for idx in data.index:
    if not pd.isna(data.at[idx, 'token_id']):
        data.at[idx, 'sent_id'] = sent_id
    else:
        sent_id += 1
        
data.dropna(0, thresh = 3, inplace = True)

In [3]:
data

Unnamed: 0,sent_id,token_id,token,lemma,POS,features,head,deprel,deps
0,0,1,В,в,ADP,_,3,case,3:case
1,0,2,советский,советский,ADJ,Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|N...,3,amod,3:amod
2,0,3,период,период,NOUN,Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing,11,obl,11:obl
3,0,4,времени,время,NOUN,Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing,3,nmod,3:nmod
4,0,5,число,число,NOUN,Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing,11,obj,11:obj
...,...,...,...,...,...,...,...,...,...
124008,6490,21,революцию,революция,NOUN,Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing,15,obj,15:obj
124009,6490,22,\t,PUNCT,_,21,punct,21:punct,_
124010,6490,23,1933,1933,NUM,_,24,nummod,24:nummod
124011,6490,24,года,год,NOUN,Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing,21,nmod,21:nmod


In [4]:
ADP_lemma_list = data.loc[data['POS'] == 'ADP']['lemma'].unique().tolist()

ADP_tokens = data.loc[data['POS'] == 'ADP']['token'].unique().tolist()
ADP_token_list = sorted(set([x.lower() for x in ADP_tokens]))

In [5]:
with open('ADP_token_list.txt', 'w', encoding = 'utf8') as f:
    f.write('\n'.join(ADP_token_list))

In [6]:
# with open('ADP_lemma_list.txt', 'w', encoding = 'utf8') as f:
#     f.write('\n'.join(ADP_lemma_list))

In [7]:
# example of inconsistency with lemmas:
# data.loc[data['token'] == 'во']

Unnamed: 0,sent_id,token_id,token,lemma,POS,features,head,deprel,deps
106,5,2,во,во,ADP,_,1,case,1:case
850,44,6,во,в,ADP,_,9,case,9:case
882,46,10,во,в,ADP,_,12,case,12:case
1288,68,8,во,в,ADP,_,9,case,9:case
2015,104,15,во,в,ADP,_,17,case,17:case
...,...,...,...,...,...,...,...,...,...
117068,6141,17,во,во,ADP,_,19,case,19:case
118633,6232,20,во,во,ADP,_,22,case,22:case
121025,6345,12,во,во,ADP,_,18,case,18:case
121459,6362,14,во,во,ADP,_,13,case,13:case
