# Setup

## Libraries

In [1]:
import re
import xmltodict
import pandas as pd
import os
import requests
import bz2

In [2]:
import logging
logging.basicConfig( format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger()

## Conf

In [3]:
extract_fields = {'sin': 'sinonimi',
                  'ant': 'contrari',
                  'rel': 'relazioni',
                  'verb form': 'verbi',
                  'der': 'derivati',
                  'alter': 'alterati',
                  'ipon': 'iponomi',
                  'iperon': 'iperonimi',
                  'var': 'varianti',
                  'noconf': 'nonconfondere',
                  'acron': 'acronimo',
                  'coni': 'coniugazione'
                 }
exclude_fields = ['raw', 'text', 'proc_text']
part_sentence = {'agg': 'aggettivo',
                 'agg form': 'aggettivo',
                 'sost': 'sostantivo',
                 'sost form': 'sostantivo',
                 'verb form': 'forma verbale',
                 'verb': 'verbo',
                 'avv': 'avverbio',
                 'nome': 'nome',
                 'loc nom': 'locuzione nominale',
                 'acron': 'acronimo',
                 'inter': 'interiezione',
                 'cong': 'congiunzione',
                 'prep': 'preposizione',
                 'art': 'articolo',
                 'pron': 'pronome',
                 'espr': 'espressione',
                 'loc inter': 'locuzione',
                 'agg num': 'aggettivo',
                 'loc cong': 'locuzione',
                 'loc avv': 'locuzione',
                 'log agg': 'locuzione',
                 'agg poss': 'aggettivo',
                 'pron poss': 'pronome',
                 'pron form': 'pronome',
                 'agg dim': 'aggettivo',
                 'pron dim': 'pronome',
                 'pref': 'prefisso',
                 'loc verb': 'locuzione',
                 'loc prep': 'locuzione',
                 'suff': 'suffisso',
                 'conf': 'confisso',
                 'prefissoide': 'prefissoide'
                }

## Functions

In [4]:
def create_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        logger.info("Created directory '{}'".format(directory))

In [5]:
def download_unzip_dataset(version="latest"):

    url = 'https://dumps.wikimedia.org/itwiktionary/' \
        '{version}/itwiktionary-{version}-pages-articles-multistream.xml.bz2'
    url = url.format(version=version)
    
    create_dir('data')
    compressed_fname = 'data/itwiktionary.xml.bz2'
    
    logger.info("Downloading from {}".format(url))
    r = requests.get(url, allow_redirects=True)

    logger.info("Saving to {}".format(compressed_fname))
    open(compressed_fname, 'wb').write(r.content)

    logger.info("Uncompressing")
    zip_file = bz2.BZ2File(compressed_fname)
    data = zip_file.read()
    out_fname = compressed_fname[:-4]
    open(out_fname, 'wb').write(data)
    
    logger.info("Output in {}".format(out_fname))

In [6]:
def read_xml(fname):
    with open(fname) as fd:
        wikitionary = xmltodict.parse(fd.read())
    
    logger.info("Read {} items from {}".format(len(wikitionary), fname))

    return wikitionary

In [7]:
def filter_pages(wikitionary):
    
    excluded_pages = ['Pagina principale']
    italian_str = "{{-it-}}"
    
    pages = [{'raw': x}
             for x in wikitionary.get('mediawiki').get('page')
             if (not re.match("\w*:.*", x['title']))
                and (x['title'] not in excluded_pages)
                and (italian_str in x.get('revision', {}).get('text', {}).get('#text', ''))
            ]

    logger.info("Kept {} out of {} pages".format(len(pages), len(wikitionary.get('mediawiki').get('page'))))
    
    return pages

In [8]:
def extract_title(doc):
    
    doc = doc.copy()
    doc['title'] = doc['raw']['title']
    doc['multiword'] = len(doc['title'].split(' ')) > 1
    
    return doc

In [9]:
def extract_text(doc):
    doc['text'] = doc['raw'].get('revision', {}).get('text', {}).get('#text', '')
    doc['text'] = re.sub('<[^<]+?>', '', doc['text'])
    
    return doc

In [10]:
def extract_vedi(doc):
    
    ret = []
    
    all_vedi = re.findall(r'{{Vedi\|\w*}}', doc['text'])
    for single in all_vedi:
        ret.append(single.replace('{{Vedi|', '').replace('}}', ''))
    
    if len(ret) > 0:
        doc['vedi'] = ret
        
    return doc

In [11]:
def preprocess_text(doc):

    text_info = re.sub(r'[A-Za-z0-9\s{}\|\n]*==\s*{{-it-}}\s*==\n', '', doc['text'])
    text_info = re.sub('==\s*{{-\w+-}}\s*==', 'miotestodasostituire', text_info)\
        .split('miotestodasostituire')[0]
    
    proc_text = dict()
    for section in text_info.split('\n\n'):
        if section.startswith('[[File:'):
            continue
        sec_title = re.search("{\s*-\s*(.+)\s*-", section.split('\n')[0])
        if sec_title:
            proc_text[sec_title.group(1)] = '\n'.join(section.split('\n')[1:])
    
    doc['proc_text'] = proc_text
    
    return doc

In [12]:
def extract_links(text):
    
    m = re.findall(r'\[\[(\w+)\]\]', text)
    if m:
        return m
    else:
        return None

In [13]:
def extract_fields_link(doc, ext_fields):
    
    for k, v in ext_fields.items():
        temp = extract_links(doc.get('proc_text', {}).get(k, ''))
        if temp:
            doc[v] = list(set(temp))
    
    return doc

In [14]:
def extract_pos(doc, fields):
    
    pos = set()
    
    for k in doc.get('proc_text', {}).keys():
        temp = fields.get(k)
        if temp:
            pos.add(temp)
    
    doc['pos'] = list(pos)
    
    return doc

In [15]:
def filter_output(doc, exclude_fields):

    return {k: v for k, v in doc.items() if k not in exclude_fields}

# Get data

In [16]:
if not os.path.isfile("data/itwiktionary.xml"):
    download_unzip_dataset()

2020-06-02 15:14:37,535 - INFO - Created directory 'data'
2020-06-02 15:14:37,536 - INFO - Downloading from https://dumps.wikimedia.org/itwiktionary/latest/itwiktionary-latest-pages-articles-multistream.xml.bz2
2020-06-02 15:15:16,435 - INFO - Saving to data/itwiktionary.xml.bz2
2020-06-02 15:15:16,474 - INFO - Uncompressing
2020-06-02 15:15:28,419 - INFO - Output in data/itwiktionary.xml


# Read data

In [17]:
wikitionary = read_xml("data/itwiktionary.xml")

2020-06-02 15:16:58,982 - INFO - Read 1 items from data/itwiktionary.xml


In [18]:
pages = filter_pages(wikitionary)

2020-06-02 15:17:00,360 - INFO - Kept 361992 out of 533815 pages


# Extract single

In [19]:
doc = [p for p in pages if p['raw']['title']=='giovedì'][0]
doc.keys()

dict_keys(['raw'])

In [20]:
doc = extract_title(doc)
doc.keys()

dict_keys(['raw', 'title', 'multiword'])

In [21]:
doc = extract_text(doc)

In [22]:
doc = extract_vedi(doc)
doc.get('vedi')

In [23]:
doc = preprocess_text(doc)

In [24]:
doc = extract_fields_link(doc, extract_fields)

In [25]:
doc = extract_pos(doc, part_sentence)

In [26]:
doc.keys()

dict_keys(['raw', 'title', 'multiword', 'text', 'proc_text', 'relazioni', 'varianti', 'pos'])

In [27]:
doc = filter_output(doc, exclude_fields)

In [28]:
doc

{'title': 'giovedì',
 'multiword': False,
 'relazioni': ['settimana',
  'anno',
  'mercoledì',
  'notte',
  'mese',
  'sabato',
  'martedì',
  'giorno',
  'domenica',
  'venerdì',
  'lunedì'],
 'varianti': ['giovedí'],
 'pos': ['sostantivo']}

# Extract pipeline

In [29]:
create_dir('data')

In [30]:
parsed = []
for doc in pages:
    doc = extract_title(doc)
    doc = extract_text(doc)
    doc = extract_vedi(doc)
    doc = preprocess_text(doc)
    doc = extract_fields_link(doc, extract_fields)
    doc = extract_pos(doc, part_sentence)
    doc = filter_output(doc, exclude_fields)
    parsed.append(doc)

In [31]:
pd.DataFrame(parsed).to_csv('data/wiktionary.csv', index=False)

# Extract pipeline (Dask)