In [25]:
def read_poem(file_name):
    """Read a UTF-8 encoded text file and return its lines as a list.

    The whole file is read at once and split with ``str.splitlines``, so the
    returned strings carry no line terminators.

    Args:
        file_name: path of the file to read.

    Returns:
        list of str, one entry per line of the file.
    """
    with open(file_name, mode='r', encoding="utf-8") as poem_file:
        contents = poem_file.read()
    return contents.splitlines()

# Load the training verses of each poet; every list holds one string per file line.
ferdowsi_lines, hafez_lines, molana_lines = map(
    read_poem,
    (
        'train_set/ferdowsi_train.txt',
        'train_set/hafez_train.txt',
        'train_set/molavi_train.txt',
    ),
)

In [26]:
# Report how many lines were loaded for each poet, one figure per output line.
print(
    f"Ferdowsi length: {len(ferdowsi_lines)}",
    f"Hafez length: {len(hafez_lines)}",
    f"Molana length: {len(molana_lines)}",
    sep="\n",
)

Ferdowsi length: 9000
Hafez length: 7700
Molana length: 8000


In [27]:
# Pool the three corpora and compute each poet's prior probability as that
# poet's share of the pooled training lines.
all_lines = ferdowsi_lines + hafez_lines + molana_lines
total_line_count = len(all_lines)  # computed once instead of once per prior

p_ferdowsi = len(ferdowsi_lines) / total_line_count
p_hafez = len(hafez_lines) / total_line_count
p_molana = len(molana_lines) / total_line_count

# The original name was misspelled; keep it as an alias so cells that still
# reference `p_ferdowsdi` continue to work.
p_ferdowsdi = p_ferdowsi

In [28]:
# Show the prior probability computed for each poet, one per output line.
print(
    f"Ferdowsi probability: {p_ferdowsdi}",
    f"Hafez probability: {p_hafez}",
    f"Molana probability: {p_molana}",
    sep="\n",
)

Ferdowsi probability: 0.3643724696356275
Hafez probability: 0.3117408906882591
Molana probability: 0.32388663967611336


In [43]:
# Wrap every line in start (<S>) and end (</S>) markers, for the pooled corpus
# and for each poet separately. The four lists are built in the same order as
# the names on the left-hand side.
(
    all_lines_with_token,
    ferdowsi_lines_with_token,
    hafez_lines_with_token,
    molana_lines_with_token,
) = (
    [f"<S> {verse} </S>" for verse in corpus]
    for corpus in (all_lines, ferdowsi_lines, hafez_lines, molana_lines)
)

In [44]:
# Peek at the first three marked-up lines of each corpus, one list per output line.
print(
    all_lines_with_token[:3],
    ferdowsi_lines_with_token[:3],
    hafez_lines_with_token[:3],
    molana_lines_with_token[:3],
    sep="\n",
)

['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> الا یا ایها الساقی ادر کاسا و ناولها </S>', '<S> که عشق آسان نمود اول ولی افتاد مشکل\u200cها </S>', '<S> به بوی نافه\u200cای کاخر صبا زان طره بگشاید </S>']
['<S> بشنو از نی ، چون حکایت می\u200cکند </S>', '<S> واز جدائی\u200cها شکایت می\u200cکند </S>', '<S> کز نیستان تا مرا ببریده اند </S>']


In [45]:
# Flatten each corpus into one token list: the marked-up lines are joined with
# a single space and split on single spaces again, so the <S> / </S> markers
# appear as ordinary tokens.
all_words, ferdowsi_words, hafez_words, molana_words = (
    " ".join(tokenised_lines).split(" ")
    for tokenised_lines in (
        all_lines_with_token,
        ferdowsi_lines_with_token,
        hafez_lines_with_token,
        molana_lines_with_token,
    )
)

In [46]:
# Peek at the first ten tokens of each corpus, one list per output line.
print(
    all_words[:10],
    ferdowsi_words[:10],
    hafez_words[:10],
    molana_words[:10],
    sep="\n",
)

['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'الا', 'یا', 'ایها', 'الساقی', 'ادر', 'کاسا', 'و', 'ناولها', '</S>']
['<S>', 'بشنو', 'از', 'نی', '،', 'چون', 'حکایت', 'می\u200cکند', '</S>', '<S>']


In [34]:
# Distinct tokens of the pooled corpus; the printed number is the raw vocabulary size.
all_words_set = {*all_words}
print(len(all_words_set))

17158


In [35]:
from collections import Counter

# Keep only tokens that occur at least twice in the pooled corpus.
# Frequencies are tallied in a single pass; calling `all_words.count(w)` for
# every vocabulary entry rescans the whole token list each time.
all_word_counts = Counter(all_words)
dictionary = [w for w in all_words_set if all_word_counts[w] >= 2]
len(dictionary)

8075

In [50]:
from collections import Counter


def _unigram_probabilities(words, vocabulary):
    """Map each vocabulary word to its relative frequency in ``words``.

    Words absent from ``words`` get probability 0.0. Frequencies are tallied
    once with a Counter instead of calling ``list.count`` per vocabulary word.
    """
    counts = Counter(words)
    total = len(words)
    return {w: counts[w] / total for w in vocabulary}


# 0: ferdowsi, 1: hafez, 2: molana
unigram_p = [
    _unigram_probabilities(poet_words, dictionary)
    for poet_words in (ferdowsi_words, hafez_words, molana_words)
]

In [52]:
def create_bigrams(lines):
    """Return every pair of adjacent tokens found in ``lines``.

    Each line is split on single spaces and pairs never span two lines.

    Args:
        lines: iterable of strings.

    Returns:
        list of ``(token, next_token)`` tuples, in order of appearance.
    """
    pairs = []
    for sentence in lines:
        tokens = sentence.split(" ")
        # zip stops at the shorter argument, so the last token only ever
        # appears as the second element of a pair.
        pairs.extend(zip(tokens, tokens[1:]))
    return pairs
    
# 0: ferdowsi, 1: hafez, 2: molana
# One bigram list per poet, built from that poet's marked-up lines.
bigrams = [
    create_bigrams(tokenised_lines)
    for tokenised_lines in (
        ferdowsi_lines_with_token,
        hafez_lines_with_token,
        molana_lines_with_token,
    )
]

In [61]:
def calculate_bigram_probability(lines_with_token, bigram, dictionary):
    """Estimate conditional bigram probabilities P(second | first) for a corpus.

    Counts are taken over whole tokens: each line is split on single spaces,
    and adjacent pairs are counted within a line only. Substring counting on
    the joined text is avoided because ``str.count`` also matches a word inside
    longer words and skips overlapping occurrences.

    Args:
        lines_with_token: iterable of strings whose tokens are separated by
            single spaces.
        bigram: iterable of ``(first, second)`` token pairs to score; a pair
            that appears several times is scored once.
        dictionary: collection of in-vocabulary tokens. Pairs with either
            token outside it are skipped.

    Returns:
        dict mapping each retained pair to
        ``count(first, second) / count(first)``, in order of first appearance
        in ``bigram``. Pairs whose first token never occurs in
        ``lines_with_token`` are omitted.
    """
    from collections import Counter

    # Set membership is O(1); `dictionary` is typically a list.
    vocabulary = set(dictionary)

    token_counts = Counter()
    pair_counts = Counter()
    for line in lines_with_token:
        tokens = line.split(" ")
        token_counts.update(tokens)
        pair_counts.update(zip(tokens, tokens[1:]))

    bigram_p = {}
    for couple in bigram:
        if couple in bigram_p:
            continue
        if couple[0] not in vocabulary or couple[1] not in vocabulary:
            continue
        denominator = token_counts[couple[0]]
        if denominator != 0:
            bigram_p[couple] = pair_counts[couple] / denominator

    return bigram_p

# One bigram-probability table per poet, in the same order as `bigrams`
# (index 0, 1, 2 pair up with the three corpora listed below).
bigram_p = [
    calculate_bigram_probability(tokenised_lines, bigrams[poet_index], dictionary)
    for poet_index, tokenised_lines in enumerate(
        (
            ferdowsi_lines_with_token,
            hafez_lines_with_token,
            molana_lines_with_token,
        )
    )
]