In [124]:
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
import json
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd

In [125]:
# Load the three tokenized editions of the book (English, Russian, Kazakh).
# The encoding is pinned to UTF-8: the Russian and Kazakh files contain
# Cyrillic text, and without an explicit encoding open() falls back to the
# platform locale, which can fail or mis-decode on non-UTF-8 systems.
with open('../dataset/en.json', 'r', encoding='utf-8') as file:
    en = json.load(file)
with open('../dataset/ru.json', 'r', encoding='utf-8') as file:
    ru = json.load(file)
with open('../dataset/kaz.json', 'r', encoding='utf-8') as file:
    kz = json.load(file)

In [126]:
# One stop-word set per language, built from NLTK's stopwords corpus.
# Sets are used so membership tests are constant-time.
stop_words_en, stop_words_ru, stop_words_kz = (
    set(stopwords.words(corpus_language))
    for corpus_language in ('english', 'russian', 'kazakh')
)

In [127]:
def get_chapters(dictionary):
    """Collect the chapter entries of every part into one flat list.

    Each value of ``dictionary`` (one per part) is iterated directly, in the
    mapping's own order. When a part is itself a dict, iterating it yields
    its keys, so those keys are what ends up in the result.
    """
    return [
        chapter
        for part_content in dictionary.values()
        for chapter in part_content
    ]
# Flatten each book once, then cut the flat chapter list into its two parts.
kz_all_chapters = get_chapters(kz)
en_all_chapters = get_chapters(en)
ru_all_chapters = get_chapters(ru)

# The Kazakh slices skip the entries at indices 0, 1 and 9.
# NOTE(review): presumably those are non-chapter entries in kaz.json - confirm.
kz_chapters_part1 = kz_all_chapters[2:9]
kz_chapters_part2 = kz_all_chapters[10:]

# English and Russian: the first seven entries are part 1, the rest part 2.
en_chapters_part1 = en_all_chapters[:7]
en_chapters_part2 = en_all_chapters[7:]
ru_chapters_part1 = ru_all_chapters[:7]
ru_chapters_part2 = ru_all_chapters[7:]


In [136]:
def calculate_collocations(book_data, part, top_n, language, window_size=5, min_freq=1):
    """Return the ``top_n`` strongest word pairs found in one part of a book.

    All tokens of the part are pooled into a single lowercased stream, stop
    words are dropped, and the remaining words are paired within a sliding
    window. Pairs are ranked with the chi-squared association measure.

    Because stop words are removed *before* pairing, two words of a returned
    pair may have been separated by stop words in the original text.

    Args:
        book_data: Mapping of part name -> mapping of chapters, where every
            chapter is an iterable of token lists.
        part: Key of the part to analyse (e.g. ``"part1"``).
        top_n: Number of best-scoring pairs to return.
        language: ``'english'``, ``'russian'`` or ``'kazakh'``; selects the
            NLTK stop-word list.
        window_size: Width, in words, of the window inside which two words
            count as co-occurring. Defaults to 5.
        min_freq: Pairs seen fewer than this many times are discarded before
            ranking. The default of 1 keeps every pair; raising it suppresses
            one-off pairs, which otherwise tend to dominate chi-squared
            rankings.

    Returns:
        A list of at most ``top_n`` ``(word1, word2)`` tuples, best first.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
        KeyError: If ``part`` is not a key of ``book_data``.
    """
    supported_languages = ('english', 'russian', 'kazakh')

    # Pool the whole part into one lowercased token stream:
    # chapters -> token lists -> tokens.
    part_tokens_lower = [
        token.lower()
        for chapter in book_data[part].values()
        for token_list in chapter
        for token in token_list
    ]

    if language not in supported_languages:
        raise ValueError("Invalid language. Supported languages are 'english', 'russian', and 'kazakh'.")
    stop_words = set(stopwords.words(language))

    # Remove stop words
    part_words = [word for word in part_tokens_lower if word not in stop_words]

    # Build windowed bigram counts over the filtered stream
    finder = BigramCollocationFinder.from_words(part_words, window_size=window_size)
    if min_freq > 1:
        finder.apply_freq_filter(min_freq)

    # Rank pairs by chi-squared association and keep the best top_n
    return finder.nbest(BigramAssocMeasures.chi_sq, top_n)

In [137]:
# Top-10 collocations per language, unpacked as (part 1, part 2).
en_part1, en_part2 = [
    calculate_collocations(en, part, 10, 'english') for part in ("part1", "part2")
]
kz_part1, kz_part2 = [
    calculate_collocations(kz, part, 10, 'kazakh') for part in ("part1", "part2")
]
ru_part1, ru_part2 = [
    calculate_collocations(ru, part, 10, 'russian') for part in ("part1", "part2")
]

# One two-column DataFrame per language and part; the column suffix names the language.
df_en_part1, df_en_part2 = [
    pd.DataFrame(pairs, columns=['word1_en', 'word2_en']) for pairs in (en_part1, en_part2)
]
df_kz_part1, df_kz_part2 = [
    pd.DataFrame(pairs, columns=['word1_kz', 'word2_kz']) for pairs in (kz_part1, kz_part2)
]
df_ru_part1, df_ru_part2 = [
    pd.DataFrame(pairs, columns=['word1_ru', 'word2_ru']) for pairs in (ru_part1, ru_part2)
]

# Place the three languages side by side. Rows line up by rank only:
# row i holds the i-th best pair of each language, not translations of each other.
book_1 = pd.concat([df_en_part1, df_kz_part1, df_ru_part1], axis="columns")
book_2 = pd.concat([df_en_part2, df_kz_part2, df_ru_part2], axis="columns")

In [138]:
# Display the side-by-side part-1 table. head(25) is more than the 10 pairs
# requested per language, so every row is shown.
book_1.head(25)

Unnamed: 0,word1_en,word2_en,word1_kz,word2_kz,word1_ru,word2_ru
0,aga,sped,төбеңнен,ұрсын,ата,лепетала
1,kiss,himshe,қорқа,бұға,главой,дуана
2,wider,stalked,азды,қуайық,аулаул,курке
3,ata,angen,алуда,оңайдан,больном,простуда
4,ata,sped,бақа,жібердім,входите,стоявшего
5,enongn,arasha,бақа,көйлегіңе,выскочили,налетай
6,scorpion,lustily,бұға,етекбасты,действовала,айт
7,apa,azheh,бәрәқалла,архамәррахимин,дразнить,трус
8,aa,rights,бәрәқалла,бирахматиқа,желкуйин,кличка
9,aad,thewe,елім,елімауелім,задумчивый,побежден


In [139]:
# Display the side-by-side part-2 table. head(25) is more than the 10 pairs
# requested per language, so every row is shown.
book_2.head(25)

Unnamed: 0,word1_en,word2_en,word1_kz,word2_kz,word1_ru,word2_ru
0,chicks,ata,түф,түкрген,стебли,шелестят
1,inshalla,amen,апарам,уәдем,лезь,брюхо
2,accuse,grandfather,бердк,әдлңд,агатай,спрыгивает
3,accuse,personally,болыппыз,ашуыңды,благословенна,иншалла
4,becomes,hotter,брбрақ,тұрғыға,богачи,властвовать
5,creation,woe,брңнен,малдарың,боль,агатай
6,limb,darkemhbai,бүрктшнң,көшке,дои,сбивай
7,lord,woe,бөрктер,шалбарын,заходи,приятель
8,mason,accomplished,жүрш,аба,защищаю,целился
9,seize,limb,мүдде,брмз,испуг,агатай


In [140]:
def calculate_collocations_by_chapter(book_data, part, top_n, language, window_size=5, min_freq=1):
    """Return the ``top_n`` strongest word pairs for every chapter of a part.

    Each chapter is processed on its own: its tokens are lowercased, stop
    words are dropped, the remaining words are paired within a sliding window,
    and the pairs are ranked with the chi-squared association measure.

    Because stop words are removed *before* pairing, two words of a returned
    pair may have been separated by stop words in the original text.

    Args:
        book_data: Mapping of part name -> mapping of chapter key -> chapter,
            where every chapter is an iterable of token lists.
        part: Key of the part to analyse (e.g. ``"part1"``).
        top_n: Number of best-scoring pairs to keep per chapter.
        language: ``'english'``, ``'russian'`` or ``'kazakh'``; selects the
            NLTK stop-word list.
        window_size: Width, in words, of the window inside which two words
            count as co-occurring. Defaults to 5.
        min_freq: Pairs seen fewer than this many times in a chapter are
            discarded before ranking. The default of 1 keeps every pair.

    Returns:
        A dict mapping each chapter key to a list of at most ``top_n``
        ``(word1, word2)`` tuples, best first.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
        KeyError: If ``part`` is not a key of ``book_data``.
    """
    supported_languages = ('english', 'russian', 'kazakh')

    # Validate the language and read the stop-word list once, not per chapter.
    # This also rejects an unsupported language when the part has no chapters.
    if language not in supported_languages:
        raise ValueError("Invalid language. Supported languages are 'english', 'russian', and 'kazakh'.")
    stop_words = set(stopwords.words(language))

    # Dictionary to store collocations by chapter
    collocations_by_chapter = {}

    for chapter_num, chapter_data in book_data[part].items():
        # Lowercase the chapter's tokens and drop stop words in one pass
        chapter_words = [
            token.lower()
            for token_list in chapter_data
            for token in token_list
            if token.lower() not in stop_words
        ]

        # Build windowed bigram counts over the filtered chapter
        finder = BigramCollocationFinder.from_words(chapter_words, window_size=window_size)
        if min_freq > 1:
            finder.apply_freq_filter(min_freq)

        # Rank pairs by chi-squared association and keep the best top_n
        collocations_by_chapter[chapter_num] = finder.nbest(BigramAssocMeasures.chi_sq, top_n)

    return collocations_by_chapter

In [141]:
# Per-chapter top-10 collocations, unpacked as (part 1, part 2).
# Each value is a dict keyed by chapter, as returned by
# calculate_collocations_by_chapter.
en_part1, en_part2 = [
    calculate_collocations_by_chapter(en, part, 10, 'english') for part in ("part1", "part2")
]
kz_part1, kz_part2 = [
    calculate_collocations_by_chapter(kz, part, 10, 'kazakh') for part in ("part1", "part2")
]
ru_part1, ru_part2 = [
    calculate_collocations_by_chapter(ru, part, 10, 'russian') for part in ("part1", "part2")
]
# # Create DataFrames for each part and language
# df_en_part1 = pd.DataFrame(en_part1, columns=['word1_en', 'word2_en'])
# df_en_part2 = pd.DataFrame(en_part2, columns=['word1_en', 'word2_en'])
# df_kz_part1 = pd.DataFrame(kz_part1, columns=['word1_kz', 'word2_kz'])
# df_kz_part2 = pd.DataFrame(kz_part2, columns=['word1_kz', 'word2_kz'])
# df_ru_part1 = pd.DataFrame(ru_part1, columns=['word1_ru', 'word2_ru'])
# df_ru_part2 = pd.DataFrame(ru_part2, columns=['word1_ru', 'word2_ru'])

# book_1 = pd.concat([df_en_part1, df_kz_part1, df_ru_part1], axis=1)
# book_2 = pd.concat([df_en_part2, df_kz_part2, df_ru_part2], axis=1)

In [142]:
# Inspect the per-chapter collocations for part 2 of the Russian text
# (a dict keyed by chapter name).
ru_part2

{'перед_бродом': [('иншалла', 'аминь'),
  ('лезь', 'брюхо'),
  ('стебли', 'шелестят'),
  ('задрожал', 'выстрелил'),
  ('аминь', 'подхватили'),
  ('аминь', 'старейшины'),
  ('ата', 'приедет'),
  ('бедная', 'виновата'),
  ('благословенна', 'иншалла'),
  ('божью', 'тропу')],
 'на_жайляу': [('агатай', 'спрыгивает'),
  ('аминь', 'бабушка'),
  ('аминь', 'благоговейно'),
  ('богачи', 'властвовать'),
  ('боевой', 'иду'),
  ('бок', 'начинают'),
  ('бок', 'расходиться'),
  ('боль', 'агатай'),
  ('весны', 'бок'),
  ('властвовать', 'незабываемым')],
 'взгорьями': [('овцы', 'башка'),
  ('башка', 'рабы'),
  ('будьте', 'счастливы'),
  ('всеобщей', 'истории'),
  ('жадной', 'жди'),
  ('жалости', 'жди'),
  ('жди', 'губит'),
  ('жди', 'жесток'),
  ('жди', 'приход'),
  ('идеальный', 'справедлив')],
 'по_рытвинам': [('аксарбас', 'шумной'),
  ('болтай', 'бесы'),
  ('болтай', 'подсказывают'),
  ('везде', 'живут'),
  ('везде', 'казахи'),
  ('волна', 'безысходности'),
  ('волна', 'непреодолимой'),
  ('горящей'