# Import Libs

In [5]:
import json
import os
import re

from nltk import sent_tokenize, word_tokenize

# EN

# Text preprocessing

In [2]:
# File stems (the part before ".txt") of the English chapter texts.
# Chapters belonging to the first part.
chapter1 = [
    "the_home_coming",
    "the_whirlwind",
    "on_the_road",
    "in_the_thickets",
    "along_the_foothills",
    "the_ascent",
    "in_the_heights",
]
# Chapters belonging to the second part.
chapter2 = [
    "the_ford",
    "the_foothills",
    "over_the_ruts",
    "through_the_pass",
    "at_the_crossroads",
    "at_the_summit",
    "epilogue",
]

In [35]:
def tokenize_text(text):
    """Split *text* into sentences and each sentence into word tokens.

    Each sentence found by ``sent_tokenize`` is stripped of every character
    that is neither an ASCII letter nor whitespace (digits and punctuation
    are dropped) and then passed to ``word_tokenize``.

    Returns a list with one list of tokens per sentence.
    """
    non_letters = r'[^a-zA-Z\s]'
    return [
        list(word_tokenize(re.sub(non_letters, '', raw_sentence)))
        for raw_sentence in sent_tokenize(text)
    ]

In [46]:
def chapters_to_json(dir: str):
    """Tokenize the English foreword and chapter files found under *dir*.

    Reads ``<dir>/foreword.txt`` plus one ``<dir>/<chapter>.txt`` file for
    every name in the module-level ``chapter1`` and ``chapter2`` lists and
    tokenizes each with ``tokenize_text``.

    Args:
        dir: Directory that contains the ``.txt`` files.

    Returns:
        A dict with the keys ``'foreword'`` (tokenized text), ``'part1'``
        and ``'part2'`` (each a dict mapping chapter name to tokenized text).

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    def _tokenize_file(name):
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(f'{dir}/{name}.txt', 'r', encoding='utf-8') as f:
            return tokenize_text(f.read())

    # ``result`` instead of ``dict``: the original local shadowed the builtin.
    result = {'foreword': _tokenize_file('foreword')}
    for part_name, chapter_names in (('part1', chapter1), ('part2', chapter2)):
        result[part_name] = {
            chapter: _tokenize_file(chapter) for chapter in chapter_names
        }
    return result

In [47]:
chapters_json = chapters_to_json('en')
# Serialize with json.dump: str() of a dict is a Python repr (single quotes),
# which JSON parsers reject even though the file is named data.json.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(chapters_json, f, ensure_ascii=False)

# KAZ

# Text preprocessing

In [7]:
# Matches every character that is neither a Kazakh Cyrillic letter nor
# whitespace.  ``ё``/``Ё`` lie outside the ``а-я``/``А-Я`` code-point ranges
# and ``һ``/``Һ`` is a Kazakh-specific letter, so all four are listed
# explicitly; without them those letters were deleted from inside words.
_KAZ_NON_LETTERS = re.compile(r'[^а-яёА-ЯЁәӘғҒқҚңҢөӨұҰүҮһҺіІ\s]')


def tokenize_text_kaz(text):
    """Split Kazakh *text* into sentences and each sentence into word tokens.

    Each sentence found by ``sent_tokenize`` is stripped of every character
    that is not a Kazakh Cyrillic letter or whitespace (digits, Latin letters
    and punctuation, including hyphens, are dropped) and then passed to
    ``word_tokenize``.

    NOTE(review): ``sent_tokenize`` is called with its default language
    model; confirm that its sentence splitting is acceptable for Kazakh.

    Returns a list with one list of tokens per sentence.
    """
    return [
        list(word_tokenize(_KAZ_NON_LETTERS.sub('', sentence)))
        for sentence in sent_tokenize(text)
    ]

In [8]:
# Part 1 chapters: transliterated file stem -> chapter name in Kazakh.
part1 = {
    "kaitkanda": "қайтқанда",
    "kat_kabatta": "қат-қабатта",
    "zholda": "жолда",
    "shytyrmanda": "шытырманда",
    "bel_beleste": "бел-белесте",
    "orde": "өрде",
    "kyiada": "қияда",
}

In [9]:
# Part 2 chapters: transliterated file stem -> chapter name in Kazakh.
part2 = {
    "taigakta": "тайғақта",
    "zhailauda": "жайлауда",
    "enyste": "еңісте",
    "okapta": "оқапта",
    "asuda": "асуда",
    "tarauda": "тарауда",
    "biykte": "биікте",
    "epilogue": "эпилог",
}

In [10]:
# Part 3 chapters: transliterated file stem -> chapter name in Kazakh.
part3 = {
    "abai_aga": "абай аға",
    "kek_zholynda": "кек жолында",
    "karashygyn": "қарашығын",
    "okinishte": "өкініште",
    "kaktygysta": "қақтығыста",
    "korshauda": "қоршауда",
}

In [11]:
# Part 4 chapters: transliterated file stem -> chapter name in Kazakh.
part4 = {
    "tun_tunekte": "түн-түнекте",
    "kuz_kiyada": "құз-қияда",
    "kapada": "қапада",
    "kastykta": "қастықта",
    "shaikasta": "шайқаста",
    "zhutta": "жұтта",
    "epilogue": "эпилог",
}

In [12]:
def create_files(path, part):
    """Make sure a ``<name>.txt`` file exists in *path* for every name in *part*.

    Missing files are created empty.  Files that already exist are left
    untouched, so re-running this after chapter text has been pasted in does
    not erase it (the previous ``'w'`` mode truncated existing files).

    Args:
        path: Directory in which to create the files; created if missing.
        part: Iterable of file stems (iterating a dict yields its keys).
    """
    os.makedirs(path, exist_ok=True)
    for name in part:
        # Append mode creates a missing file but never truncates an
        # existing one.
        with open(f"{path}/{name}.txt", 'a', encoding='utf-8'):
            pass

In [27]:
# Make sure a .txt file exists under kaz/part3 for every part3 chapter key.
create_files(path='kaz/part3', part=part3)

In [13]:
def chapters_to_json(dir: str, part1, part2, part3, part4):
    """Tokenize every Kazakh chapter file of the four parts under *dir*.

    For each part ``partN`` and each chapter key in it, reads
    ``<dir>/partN/<chapter>.txt`` and tokenizes it with
    ``tokenize_text_kaz``.

    Args:
        dir: Root directory containing the ``part1`` .. ``part4`` folders.
        part1, part2, part3, part4: Mappings whose keys are the chapter file
            stems of the respective part; stored as-is under
            ``'chapters_in_kaz'``.

    Returns:
        A dict with the key ``'chapters_in_kaz'`` (the four input mappings
        keyed by part name) and the keys ``'part1'`` .. ``'part4'``, each a
        dict mapping chapter key to its tokenized text.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    parts = {
        'part1': part1,
        'part2': part2,
        'part3': part3,
        'part4': part4,
    }

    # ``result`` instead of ``dict``: the original local shadowed the builtin.
    result = {'chapters_in_kaz': dict(parts)}

    # One loop for all parts.  The copy-pasted original stored the part2 and
    # part3 chapters in part1's dict, leaving 'part2' and 'part3' empty.
    for part_name, chapters in parts.items():
        tokenized = {}
        for chapter in chapters:
            # Explicit UTF-8: the Cyrillic text must not depend on the
            # platform's default encoding.
            with open(f'{dir}/{part_name}/{chapter}.txt', 'r', encoding='utf-8') as f:
                tokenized[chapter] = tokenize_text_kaz(f.read())
        result[part_name] = tokenized

    return result

In [14]:
# Tokenize all four Kazakh parts stored under the kaz/ directory.
kaz = chapters_to_json(
    dir='kaz',
    part1=part1,
    part2=part2,
    part3=part3,
    part4=part4,
)

In [15]:
# Serialize with json.dump: str() of a dict is a Python repr (single quotes),
# which JSON parsers reject.  UTF-8 plus ensure_ascii=False keeps the
# Cyrillic text readable and independent of the platform default encoding.
with open('kaz/kaz.json', 'w', encoding='utf-8') as f:
    json.dump(kaz, f, ensure_ascii=False)