In [83]:
import sys
import csv
import json
import random
from typing import List
from pathlib import Path

In [84]:
# Raise the csv module's per-field size limit as high as the platform allows.
# csv.field_size_limit raises OverflowError when the value is too large for
# the platform, so start from sys.maxsize and shrink by a factor of 10 until
# a value is accepted.
maxInt = sys.maxsize
decrement = True

while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Floor division keeps the value an exact int; the previous
        # int(maxInt / 10) went through a float and lost precision.
        maxInt //= 10
    else:
        decrement = False

# Last expression of the cell: sets the limit again and displays the value
# that was in effect before this call.
csv.field_size_limit(maxInt)


9223372036854775807

### Random HTML generation

In [85]:
def add_random_html_to_data(doc: str):
    """Wrap a random run of words in ``doc`` with a randomly chosen HTML tag pair.

    The document is split on single spaces. An opening tag is inserted in
    front of a randomly chosen word, and the matching closing tag is inserted
    at a random later position, so at least one word always sits between the
    two tags. The pieces are then joined back together with single spaces.

    :param doc: text to decorate
    :return: the text with one ``<tag>`` / ``</tag>`` pair inserted
    """
    tag_name = random.choice(['p', 'div', 'html', 'body', 'span'])

    words = doc.split(' ')
    word_count = len(words)

    # Positions are drawn before any insertion; the closing index already
    # accounts for the slot the opening tag will occupy.
    open_at = random.randrange(0, word_count)
    close_at = random.randrange(open_at + 2, word_count + 2)

    words[open_at:open_at] = [f'<{tag_name}>']
    words[close_at:close_at] = [f'</{tag_name}>']

    return ' '.join(words)

### Read files and transform

In [86]:
def read_file(path: Path):
    """Read a space-delimited CSV file and return one HTML-decorated string per row.

    Each row is parsed with ``delimiter=' '`` and ``quotechar='"'``, its fields
    are joined with ``', '``, and the joined text is passed through
    ``add_random_html_to_data``.

    :param path: location of the CSV file
    :return: list of decorated row strings, or ``None`` when ``path`` does not
        exist or is not a regular file
    """
    # Diagnostic line shown in the cell output: exists / is_dir / is_file.
    print(path.exists(), path.is_dir(), path.is_file())
    if path.exists() and path.is_file():
        docs = []
        # newline='' hands line endings to the csv module untouched, as the
        # csv documentation requires; otherwise "\r\n" inside quoted fields is
        # rewritten before the reader sees it.
        # NOTE(review): the file is still decoded with the platform default
        # encoding - confirm that matches the input files.
        with path.open(newline='') as f:
            spamreader = csv.reader(f, delimiter=' ', quotechar='"')
            for row in spamreader:
                html_doc = add_random_html_to_data(', '.join(row))
                docs.append(html_doc)
        return docs
    # Missing path or not a file: callers receive None rather than an error.
    return None

In [87]:
# Input CSV files, one per data set; the file names carry the et / ru / en codes.
et, ru, en = Path('test_et.csv'), Path('test_ru.csv'), Path('test_en.csv')

In [88]:
# Load each CSV into a list of HTML-decorated row strings.
# read_file prints one exists / is_dir / is_file line per call.
est_docs = read_file(path=et)
ru_docs = read_file(path=ru)
en_docs = read_file(path=en)

True False True
True False True
True False True


### Add HTML pages to the docs

In [89]:
def read_html_page(path: Path):
    """Return the full text content of the file at ``path``.

    :param path: location of the page to load
    :return: the file content as a single string
    :raises Exception: when ``path`` does not exist or is not a regular file;
        the message reports both checks
    """
    # Guard clause: reject anything that is not an existing regular file.
    if not (path.exists() and path.is_file()):
        raise Exception(f'Path exists: {path.exists()}, path is a file: {path.is_file()}')
    return path.read_text()

In [103]:
# Load the sample HTML pages, in this order, as raw strings.
html_pages = [
    read_html_page(Path(page_file))
    for page_file in (
        'html/audio.htm',
        'html/canvas.htm',
        'html/google.htm',
        'html/examples.htm',
        'html/plugins.htm',
        'html/yt.htm',
    )
]


In [104]:
def add_docs_and_key(docs: List[str], other_docs: List[str], key: str):
    """Mix two document lists, shuffle them, and nest each document under ``key``.

    ``key`` is a dot-separated path: ``'foo.bar.text'`` turns a document ``d``
    into ``{'foo': {'bar': {'text': d}}}``.

    Neither input list is modified. The combined documents are copied into a
    new list before shuffling, so calling this repeatedly with the same lists
    does not accumulate ``other_docs`` inside ``docs``.

    :param docs: primary documents
    :param other_docs: extra documents to mix in
    :param key: dot-separated path under which each document is nested
    :return: new list of nested dicts, in shuffled order
    """
    # Fresh list: the previous `docs += other_docs` extended and shuffled the
    # caller's list in place.
    pool = list(docs) + list(other_docs)
    random.shuffle(pool)

    # Innermost key first, so wrapping proceeds from the leaf outwards.
    key_parts = key.split('.')[::-1]

    new_docs = []
    for doc in pool:
        for part in key_parts:
            doc = {part: doc}
        new_docs.append(doc)
    return new_docs

In [105]:
# keyed_et_docs = add_docs_and_key(est_docs, html_pages, 'est.mlp.text')
# keyed_ru_docs = add_docs_and_key(ru_docs, html_pages, 'ru.text.mlp.text')
# keyed_en_docs = add_docs_and_key(en_docs, html_pages, 'en-text.mlp.lemmas.text')

In [118]:
# All three document sets plus the HTML pages, shuffled together and nested
# under the single key path foo.bar.text.
combined_docs = add_docs_and_key(
    docs=est_docs + ru_docs + en_docs,
    other_docs=html_pages,
    key='foo.bar.text',
)

##### est.mlp.text | ru.text.mlp.text | en-text.mlp.lemmas.text

In [106]:
def combine_to_jl(docs1, docs2):
    """Merge two sequences of dicts pairwise into a list of combined records.

    The i-th result holds the entries of ``docs1[i]`` followed by the entries
    of ``docs2[i]``; on a key collision the value from ``docs2`` wins. Pairing
    uses ``zip``, so the result is as long as the shorter input.

    The input dicts are not modified: each record is a new (shallow) dict.

    :param docs1: sequence of dicts providing the base entries
    :param docs2: sequence of dicts whose entries are added on top
    :return: list of merged dicts
    """
    result = []
    for doc1, doc2 in zip(docs1, docs2):
        # Copy first; the previous doc1.update(doc2) altered the caller's dicts.
        merged = dict(doc1)
        merged.update(doc2)
        result.append(merged)
    return result
            
        
# Merge the three keyed document lists record by record.
# NOTE(review): in the visible part of the notebook, keyed_et_docs,
# keyed_ru_docs and keyed_en_docs are only assigned in a cell that is
# currently commented out, so this cell raises NameError on a fresh kernel.
result = combine_to_jl(docs1=keyed_et_docs, docs2=keyed_ru_docs)
result = combine_to_jl(docs1=result, docs2=keyed_en_docs)


In [107]:
# Sanity check: top-level keys of the first merged record, and the record count.
(result[0].keys(), len(result))

(dict_keys(['est', 'ru', 'en-text']), 1013)

### Save the result

In [108]:
def save(path: Path, docs: List[str]):
    """Write ``docs`` to ``path`` in JSON Lines form, one record per line.

    Every element of ``docs`` is serialised with ``json.dumps`` and followed
    by a newline. The file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced.

    :param path: output file location
    :param docs: records to write; each must be JSON-serialisable (the
        notebook passes dicts)
    """
    with path.open(mode='w', encoding='utf-8') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in docs)

In [109]:
# Write the first 200 merged records as a small JSON Lines sample.
result_path = Path('result_small.jl')
save(path=result_path, docs=result[:200])

In [117]:
# Top-level keys of the first merged record.
(result[0].keys())

dict_keys(['est', 'ru', 'en-text'])

In [124]:
# Write the first 1000 single-key combined documents as JSON Lines.
result_path = Path('combined_small.jl')
save(path=result_path, docs=combined_docs[:1000])