### annotation name="mb" - morpheme breaks

### annotation name="ps" - part of speech

### annotation name="gr" - gloss (russian)

In [1]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=75920f474590d9a1d2dd560c00efdd9e390992cc3a4d4769a361abe4a105bf68
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
from bs4 import BeautifulSoup
from zipfile import ZipFile
import pandas as pd
import wget
import re

In [3]:
def process_bases(full_morph_bases):
  '''
    Reduce every morpheme-segmented word to its base.

    Each word is split on "-" and only the first segment is kept.
    If the segment right after it is the suffix "r", an "n" is appended
    to the first segment.

    full_morph_bases: iterable of hyphen-segmented word strings.
    Returns a list of bases in the same order as the input.
  '''
  def base_of(segmented_word):
    segments = re.split('-', segmented_word)
    stem = segments[0]
    # Stem directly followed by the suffix "r" -> stem + "n".
    if len(segments) > 1 and segments[1] == 'r':
      return stem + 'n'
    return stem

  return [base_of(word) for word in full_morph_bases]

In [4]:
def degloss_ru_roots(full_ru_glosses):
  '''
    Extract the Russian root(s) from every gloss string.

    For each gloss, runs of lowercase Cyrillic letters are collected.
    If there are none, occurrences of "NEG" or single digits are collected
    instead. Several matches are joined with a dot (e.g. "верхняя.одежда").
    When nothing matches, None is stored so the row can be dropped later
    from the finished pandas table.

    full_ru_glosses: iterable of gloss strings.
    Returns a list (str or None per gloss) in input order.
  '''
  def root_of(gloss):
    matches = re.findall('[а-яё]+', gloss)
    if not matches:
      matches = re.findall(r'NEG|\d', gloss)
    # Joining a single match yields that match unchanged, so one
    # expression covers both the one-root and many-root cases.
    return '.'.join(matches) if matches else None

  return [root_of(gloss) for gloss in full_ru_glosses]

In [5]:
def process_soup(soup):
  '''
    Collect three parallel lists from a parsed document:
    1. morpheme-segmented words (annotation name="mb"); these are later
       handled by process_bases
    2. parts of speech (annotation name="ps")
    3. Russian glosses (annotation name="gr"); these are later handled by
       degloss_ru_roots

    soup: parsed document exposing find_all('annotation'); every annotation
          must expose get('name') and find_all('ta'), and every <ta> a
          .string attribute (BeautifulSoup tags do).
    Returns the tuple (full_morph_bases, pos_s, full_ru_glosses). Values are
    the .string of each <ta> element in document order.
  '''

  full_morph_bases = []
  pos_s = []
  full_ru_glosses = []

  # Route each annotation by its own "name" attribute. Searching the
  # serialized subtree for 'name="mb"' would also match that text inside
  # a gloss and re-serializes the whole annotation for every check.
  targets = {'mb': full_morph_bases, 'ps': pos_s, 'gr': full_ru_glosses}

  for annotation in soup.find_all('annotation'):
    target = targets.get(annotation.get('name'))
    if target is None:
      # Some other annotation tier; not needed here.
      continue
    target.extend(ta.string for ta in annotation.find_all('ta'))

  return full_morph_bases, pos_s, full_ru_glosses

In [6]:
def the_extraction(filename):
  '''
    Read one file with BeautifulSoup and turn it into a pandas table.

    process_soup collects the raw annotation values, degloss_ru_roots
    extracts the Russian roots from the glosses, process_bases extracts the
    word bases, and the three aligned lists become the columns
    morph_base / part_of_speech / russian_root.

    filename: path of the file to read.
    Returns a DataFrame with one row per morpheme-segmented word.
    Raises IndexError when the file has fewer part-of-speech or gloss
    values than segmented words (rows are paired purely by position).
  '''
  # Explicit encoding so the result does not depend on the platform default.
  # NOTE(review): assumes the files are UTF-8 - confirm against the corpus.
  with open(filename, encoding='utf-8') as file:
    # NOTE(review): no parser is named, so BeautifulSoup uses whichever
    # one is installed; left as is to keep parsing unchanged.
    soup = BeautifulSoup(file)

  full_morph_bases, pos_s, full_ru_glosses = process_soup(soup)

  ru_glosses = degloss_ru_roots(full_ru_glosses)
  morph_bases = process_bases(full_morph_bases)

  # Build all rows first and create the table once; appending to a
  # DataFrame row by row gets slower with every row added.
  rows = [
    (morph_bases[i], pos_s[i], ru_glosses[i])
    for i in range(len(full_morph_bases))
  ]

  return pd.DataFrame(rows, columns=['morph_base', 'part_of_speech', 'russian_root'])

In [7]:
def process_dataframe(df):
  '''
    Return a copy of df without duplicate rows and without rows that
    contain missing values. The input frame is left untouched.
  '''
  return df.drop_duplicates().dropna()

In [15]:
# URL of the zip archive to download (per its file name: version 1.0, "noaudio").
path = 'https://www.fdr.uni-hamburg.de/record/9628/files/evenki-1.0-noaudio.zip'

In [10]:
# Download the archive to a local file named 'files'; the value returned by
# wget.download is kept in `files`.
files = wget.download(path, out='files')

In [None]:
exs_filenames = []
problems = []  # (file, exception) pairs for files that could not be processed
frames = []    # one table per successfully processed file

with ZipFile('files', 'r') as myzip:
  # Only the .exs members of the archive are processed.
  exs_filenames.extend(name for name in myzip.namelist() if name.endswith('.exs'))

  for member in exs_filenames:
    # Start from the archive member name, so a failure inside extract()
    # is reported against the right file, not an unbound or stale name.
    myfile = member
    try:
      myfile = myzip.extract(member)
      frames.append(the_extraction(myfile))
    except Exception as error:
      # Best effort: record the failure and continue with the next file.
      problems.append((myfile, error))

# Concatenate once at the end; calling pd.concat inside the loop re-copies
# everything collected so far on every iteration.
if frames:
  big_df = pd.concat(frames, ignore_index = True)
else:
  big_df = pd.DataFrame({'morph_base':[], 'part_of_speech':[], 'russian_root':[]})

In [12]:
# Files that raised during extraction, as (path, exception) pairs.
problems

[('/content/flk/KS_1930_Ichegdyro_flk/KS_1930_Ichegdyro_flk_s.exs',
  IndexError('list index out of range')),
 ('/content/flk/YUK_2007_FoxBearSquirrel_flk/YUK_2007_FoxBearSquirrel_flk_s.exs',
  IndexError('list index out of range')),
 ('/content/nar/BTV_20190819_Father_nar/BTV_20190819_Father_nar_s.exs',
  IndexError('list index out of range')),
 ('/content/nar/NNR3_1913_AboutShaman_nar/NNR3_1913_AboutShaman_nar_s.exs',
  IndexError('list index out of range'))]

In [13]:
# Display the table without duplicate rows and rows with missing values.
# The result is only shown, not stored: big_df itself stays unchanged.
process_dataframe(big_df)

Unnamed: 0,morph_base,part_of_speech,russian_root
0,kak,interrog,как
1,dikti,n,чудо
2,aŋ,ptcp,этовать
3,ŋənə,ptcp,идти
4,Čulugdi,nprop,злой.дух
...,...,...,...
45972,ji,interj,йи
45973,eːwa,v,что
45983,kaladopka,n,кладовка
45986,jo,interj,йо


In [14]:
# Summary statistics of the unfiltered big_df (duplicates and missing values included).
big_df.describe()

Unnamed: 0,morph_base,part_of_speech,russian_root
count,45992,45992,45662
unique,5939,27,2577
top,bi,n,тот
freq,1569,14554,2061
