# JSON Conversion
Author: Pierre Nugues

Granska's lexical resources are stored in a slightly inconsistent TSV format. In this notebook, we convert them to the more maintainable JSON format without altering their structure.

## The modules

In [None]:
import json
import regex as re
import copy

## The resource names and locations

The source and destination folders

In [None]:
# Root directories of the TSV sources and of the generated JSON files.
# Both currently point at the same location.
dest_folder = '../../lex/'
src_folder = '../../lex/'

The `morfs` folder and its files

In [None]:
# Sub-folder (appended to the root folders) and names of the `morfs` files
morfs_folder = 'morfs/'
morfs_files = [
    'cw',
    'cwt',
]

The `tags` folder and files

In [None]:
# Sub-folder (appended to the root folders) and names of the `tags` files
tags_folder = 'tags/'
tags_files = [
    'ct',
    'ctm',
    'ctt',
    'cttt',
    'features',
    'taginfo',
]

And the `words` folder

In [None]:
# Sub-folder (appended to the root folders) and names of the `words` files
words_folder = 'words/'
words_files = [
    'bitransitivaverb',
    'compound-begin-ok.w',
    'compound-end-stop.w',
    'cw',
    'cwtl',
    'feminina',
    'foreign.w',
    'inflection.lex',
    'inflection.rules',
    'intransitivaverb',
    'opt_space_words',
    'spellNotOK',
    'spellOK',
]

## JSON conversion

### `morfs`

The structure of the `morfs` files is easy. Each line has a number and one or two tokens. We store them as a list.

In [None]:
# Convert every `morfs` file: each line becomes a list whose first field is
# converted to an integer and whose other fields are the tab-separated tokens.
for file in morfs_files:
    # The context manager closes the source file even if reading fails
    with open(src_folder + morfs_folder + file, encoding='utf-8') as src_fp:
        data = src_fp.read().strip()
    # An empty file yields no lines instead of failing on int('')
    lines = re.split('[\r\n]+', data) if data else []
    new_lines = []
    for line in lines:
        fields = [field.strip() for field in re.split('[\t]+', line.strip())]
        fields[0] = int(fields[0])
        new_lines.append(fields)
    # The file name is the single top-level key of the JSON document
    new_lines = {file: new_lines}
    with open(dest_folder + morfs_folder + file + '.json', 'w',
              encoding='utf-8') as fp:
        json.dump(new_lines, fp, indent=2, ensure_ascii=False)

### `tags`

Apart from `features`, the files have a flat structure, and we convert them in the same way as the `morfs` files.

In [None]:
# `features` has a two-level structure and is converted separately.
# Guarding the removal keeps this cell re-runnable: an unconditional
# remove() raises ValueError the second time the cell is executed.
if 'features' in tags_files:
    tags_files.remove('features')
for file in tags_files:
    # The context manager closes the source file even if reading fails
    with open(src_folder + tags_folder + file, encoding='utf-8') as src_fp:
        data = src_fp.read().strip()
    # An empty file yields no lines instead of failing on int('')
    lines = re.split('[\r\n]+', data) if data else []
    new_lines = []
    for line in lines:
        # Fields are separated by runs of spaces and/or tabs
        fields = [field.strip() for field in re.split('[ \t]+', line.strip())]
        # In every file except `taginfo`, the first field is stored as an integer
        if file != 'taginfo':
            fields[0] = int(fields[0])
        new_lines.append(fields)
    # The file name is the single top-level key of the JSON document
    new_lines = {file: new_lines}
    with open(dest_folder + tags_folder + file + '.json', 'w',
              encoding='utf-8') as fp:
        json.dump(new_lines, fp, indent=4, ensure_ascii=False)

The `features` file has a two-level structure. We parse it with a two-pass procedure. In the first pass, we store the indices of the first level and in the second pass we process the second level. The first level starts with a `*` and corresponds to a feature category and its translation in Swedish.

In [None]:
# First pass over `features`: record the indices of the category lines.
# The context manager closes the source file even if reading fails.
with open(src_folder + tags_folder + 'features', encoding='utf-8') as src_fp:
    data = src_fp.read().strip()
# An empty file yields no lines at all
lines = re.split('[\r\n]+', data) if data else []
# startswith() is safe on an empty string, where line[0] raises IndexError
first_level_idx = [i for i, line in enumerate(lines) if line.startswith('*')]

We now process the second level that consists of pairs. We encode them as dictionaries.

In [None]:
# Second pass over `features`: build one entry per category line and attach
# the following value lines to it as key/value pairs.
features = {}
# Set membership is O(1); scanning the index list for every line is not
first_level = set(first_level_idx)
pos_feat_name = None
for i, line in enumerate(lines):
    fields = [field.strip() for field in re.split('\t+', line.strip())]
    if i in first_level:
        # Category line: drop the leading marker character from the name
        name = pos_feat_name = fields[0][1:]
        features[name] = {
            'swedish_transl': fields[1],
            'values': {},
        }
    elif pos_feat_name is None:
        # Fail with a clear message instead of an obscure NameError/KeyError
        raise ValueError(
            'line {}: value found before any feature category'.format(i + 1))
    else:
        features[pos_feat_name]['values'][fields[0]] = fields[1]

And we store the JSON file

In [None]:
# Write the feature dictionary; the context manager closes the file even if
# json.dump raises.
with open(dest_folder + tags_folder + 'features.json', 'w',
          encoding='utf-8') as fp:
    json.dump(features, fp, indent=2, ensure_ascii=False)

### `words`

In this folder, some files are just lists of words or phrases. We encode them as JSON lists

In [None]:
# The `words` files that are plain lists of words or phrases
words_files = [
    'bitransitivaverb',
    'compound-begin-ok.w',
    'compound-end-stop.w',
    'feminina',
    'foreign.w',
    'intransitivaverb',
    'opt_space_words',
    'spellNotOK',
    'spellOK',
]

In [None]:
# Convert the plain word lists: every line becomes one string of a JSON list.
for file in words_files:
    # The context manager closes the source file even if reading fails
    with open(src_folder + words_folder + file, encoding='utf-8') as src_fp:
        data = src_fp.read().strip()
    # An empty file yields an empty list rather than a list holding ''
    lines = re.split('[\r\n]+', data) if data else []
    new_lines = [line.strip() for line in lines]
    # The file name is the single top-level key of the JSON document
    new_lines = {file: new_lines}
    with open(dest_folder + words_folder + file + '.json', 'w',
              encoding='utf-8') as fp:
        json.dump(new_lines, fp, indent=2, ensure_ascii=False)

Three other files have fields separated by tabulations. We encode them as a list of lists. For these files, a few lines need to be corrected manually, as the original files sometimes use tabulations as separators and sometimes spaces. There is also a character that is not in the Latin-1 character set.

In [None]:
# The `words` files whose lines consist of several fields
words_files = [
    'cw',
    'cwtl',
    'inflection.lex',
]

In [None]:
# Convert the tabular `words` files: each line becomes a list of fields.
for file in words_files:
    # The context manager closes the source file even if reading fails
    with open(src_folder + words_folder + file, encoding='utf-8') as src_fp:
        data = src_fp.read().strip()
    # An empty file yields no lines instead of failing on int('')
    lines = re.split('[\r\n]+', data) if data else []
    new_lines = []
    for line in lines:
        # Fields are separated by runs of spaces and/or tabs
        fields = [field.strip() for field in re.split('[ \t]+', line.strip())]
        # In every file except `inflection.lex`, the first field is stored
        # as an integer
        if file != 'inflection.lex':
            fields[0] = int(fields[0])
        new_lines.append(fields)
    # The file name is the single top-level key of the JSON document
    new_lines = {file: new_lines}
    with open(dest_folder + words_folder + file + '.json', 'w',
              encoding='utf-8') as fp:
        json.dump(new_lines, fp, indent=2, ensure_ascii=False)

Finally, `inflection.rules` has a two-level structure that we convert with a two-pass procedure. The first-level lines start with a `$`.

In [None]:
# First pass over `inflection.rules`: record the indices of the rule headers.
# The context manager closes the source file even if reading fails.
with open(src_folder + words_folder + 'inflection.rules',
          encoding='utf-8') as src_fp:
    data = src_fp.read().strip()
# An empty file yields no lines at all
lines = re.split('[\r\n]+', data) if data else []
# startswith() is safe on an empty string, where line[0] raises IndexError
first_level_idx = [i for i, line in enumerate(lines) if line.startswith('$')]

We process the second level. We create one dictionary per rule: the `feat_infl` key holds the fields of the first-level line and the `paradigm` key holds the list of second-level lines that follow it.

In [None]:
# Second pass over `inflection.rules`: build one dictionary per rule header
# and collect the lines that follow it under its 'paradigm' key.
rule_list = []
# Set membership is O(1); scanning the index list for every line is not
first_level = set(first_level_idx)
new_dict = None
for i, line in enumerate(lines):
    # A fresh list is built for every line, so it can be stored without copying
    fields = [field.strip() for field in re.split('\t+', line.strip())]
    if i in first_level:
        # Header line: drop the leading marker character from the first field
        fields[0] = fields[0][1:]
        new_dict = {'feat_infl': fields, 'paradigm': []}
        # Registering the rule as soon as it is created removes the need to
        # flush the previous rule on the next header and after the loop
        rule_list.append(new_dict)
    elif new_dict is None:
        # Fail with a clear message instead of an obscure NameError
        raise ValueError(
            'line {}: paradigm line found before any rule header'.format(i + 1))
    else:
        new_dict['paradigm'].append(fields)

We create the top level

In [None]:
# Wrap the rule list under the file name, which is the single top-level key
inflection_rules = {'inflection.rules': rule_list}

And we store the rules in a JSON file

In [None]:
# Write the rules; the context manager closes the file even if json.dump
# raises.
with open(dest_folder + words_folder + 'inflection.rules.json', 'w',
          encoding='utf-8') as fp:
    json.dump(inflection_rules, fp, indent=2, ensure_ascii=False)