In [25]:
def read_poem(file_name):
    with open(file_name, 'r', encoding="utf-8") as f:
        return f.read().splitlines()

ferdowsi_lines = read_poem('train_set/ferdowsi_train.txt')
hafez_lines = read_poem('train_set/hafez_train.txt')
molana_lines = read_poem('train_set/molavi_train.txt')

In [26]:
print(f"Ferdowsi length: {len(ferdowsi_lines)}")
print(f"Hafez length: {len(hafez_lines)}")
print(f"Molana length: {len(molana_lines)}")

Ferdowsi length: 9000
Hafez length: 7700
Molana length: 8000


In [27]:
all_lines = ferdowsi_lines + hafez_lines + molana_lines
p_ferdowsdi = len(ferdowsi_lines) / len(all_lines)
p_hafez = len(hafez_lines) / len(all_lines)
p_molana = len(molana_lines) / len(all_lines)

In [28]:
print(f"Ferdowsi probability: {p_ferdowsdi}")
print(f"Hafez probability: {p_hafez}")
print(f"Molana probability: {p_molana}")

Ferdowsi probability: 0.3643724696356275
Hafez probability: 0.3117408906882591
Molana probability: 0.32388663967611336


In [43]:
all_lines_with_token = [f"<S> {l} </S>" for l in all_lines]
ferdowsi_lines_with_token = [f"<S> {l} </S>" for l in ferdowsi_lines]
hafez_lines_with_token = [f"<S> {l} </S>" for l in hafez_lines]
molana_lines_with_token = [f"<S> {l} </S>" for l in molana_lines]

In [44]:
print(all_lines_with_token[:3])
print(ferdowsi_lines_with_token[:3])
print(hafez_lines_with_token[:3])
print(molana_lines_with_token[:3])

['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> الا یا ایها الساقی ادر کاسا و ناولها </S>', '<S> که عشق آسان نمود اول ولی افتاد مشکل\u200cها </S>', '<S> به بوی نافه\u200cای کاخر صبا زان طره بگشاید </S>']
['<S> بشنو از نی ، چون حکایت می\u200cکند </S>', '<S> واز جدائی\u200cها شکایت می\u200cکند </S>', '<S> کز نیستان تا مرا ببریده اند </S>']


In [45]:
all_words = " ".join(all_lines_with_token).split(" ")
ferdowsi_words = " ".join(ferdowsi_lines_with_token).split(" ")
hafez_words = " ".join(hafez_lines_with_token).split(" ")
molana_words = " ".join(molana_lines_with_token).split(" ")

In [46]:
print(all_words[:10])
print(ferdowsi_words[:10])
print(hafez_words[:10])
print(molana_words[:10])

['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'الا', 'یا', 'ایها', 'الساقی', 'ادر', 'کاسا', 'و', 'ناولها', '</S>']
['<S>', 'بشنو', 'از', 'نی', '،', 'چون', 'حکایت', 'می\u200cکند', '</S>', '<S>']


In [34]:
all_words_set = set(all_words)
print(len(all_words_set))

17158


In [35]:
dictionary = [w for w in all_words_set if all_words.count(w) >= 2]
len(dictionary)

8075

In [50]:
# 0: ferdowsi, 1: hafez, 2: molana
unigram_p = [{}, {}, {}]
for w in dictionary:
    unigram_p[0][w] = ferdowsi_words.count(w) / len(ferdowsi_words)
    unigram_p[1][w] = hafez_words.count(w) / len(hafez_words)
    unigram_p[2][w] = molana_words.count(w) / len(molana_words)

In [52]:
def create_bigrams(lines):
    return [b for l in lines for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
    
# 0: ferdowsi, 1: hafez, 2: molana
bigrams = []
bigrams.append(create_bigrams(ferdowsi_lines_with_token))
bigrams.append(create_bigrams(hafez_lines_with_token))
bigrams.append(create_bigrams(molana_lines_with_token))

In [61]:
def calculate_bigram_probability(lines_with_token, bigram, dictionary):
    bigram_p = {}
    lines_joined = " ".join(lines_with_token)
    for couple in bigram:
        if (couple[0] in dictionary) and (couple[1] in dictionary) and (couple not in bigram_p):
            denominator = lines_joined.count(couple[0])
            if denominator != 0:
                bigram_p[couple] = lines_joined.count(" ".join(couple)) / denominator
    
    return bigram_p

bigram_p = []
bigram_p.append(calculate_bigram_probability(ferdowsi_lines_with_token, bigrams[0], dictionary))
bigram_p.append(calculate_bigram_probability(hafez_lines_with_token, bigrams[1], dictionary))
bigram_p.append(calculate_bigram_probability(molana_lines_with_token, bigrams[2], dictionary))

In [110]:
X_test = []
Y_test = []

with open("test_set/test_file.txt", 'r', encoding="utf-8") as f:
    for line in f.read().splitlines():
        y, x = line.strip().split("\t")
        X_test.append(f"<S> {x} </S>")
        Y_test.append(int(y) - 1)

In [111]:
print(X_test[:3])
print(Y_test[:3])

['<S> وزان جایگه نالهٔ گاودم </S>', '<S> شنیدند و آواز رویینه خم </S>', '<S> جهاندار بیدار لشکر براند </S>']
[0, 0, 0]


In [98]:
def backoff_model(line_with_token, unigram_p, bigram_p, lambdas, epsilon):
    assert abs(lambdas[0] + lambdas[1] + lambdas[2] - 1) < 1e-5
    
    splitted_line = line_with_token.split(" ")
    unigram = splitted_line
    bigram = [b for b in zip(splitted_line[:-1], splitted_line[1:])]
    
    probabilities = [1 for i in range(3)]
    for couple in bigram:
        for i in range(3):
            probabilities[i] *= lambdas[0] * bigram_p[i].get(couple, 0) + lambdas[1] * unigram_p[i].get(couple[1], 0) + lambdas[2] * epsilon
    
    return probabilities

In [99]:
def arg_max(input_list):
    return max(zip(input_list, range(len(input_list))))[1]

In [100]:
def predict(line_with_token, unigram_p, bigram_p, lambdas, epsilon):
    probabilities = backoff_model(line_with_token, unigram_p, bigram_p, lambdas, epsilon)
    return arg_max(probabilities)

In [104]:
predict(X_test[0], unigram_p, bigram_p, (0.8, 0.15, 0.05), 1e-3)

0

In [105]:
def model_accuracy(X_test, Y_test, unigram_p, bigram_p, lambdas, epsilon):
    correct = 0
    for index, line in enumerate(X_test):
        if predict(line, unigram_p, bigram_p, lambdas, epsilon) == Y_test[index]:
            correct += 1
    
    return correct / len(X_test)

## Hyperparameters #1:
$\lambda_3 = 0.8, \lambda_2 = 0.15, \lambda_1 = 0.02, \epsilon = 1e-03$


In [112]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.8, 0.15, 0.05), 1e-3)

0.8393895348837209

In [126]:
for lambdas in [(0.5, 0.48, 0.02), (0.6, 0.38, 0.02), (0.7, 0.28, 0.02), (0.8, 0.18, 0.02), (0.9, 0.08, 0.02)]:
# for lambdas in [(0.8, 0.15, 0.05), (0.8, 0.16, 0.04), (0.8, 0.17, 0.03), (0.8, 0.18, 0.02), (0.8, 0.19, 0.01)]:
# for lambdas in [(0.8, 0.18, 0.02), (0.82, 0.16, 0.02), (0.84, 0.14, 0.02), (0.86, 0.12, 0.02), (0.88, 0.10, 0.02), (0.9, 0.08, 0.02)]:
    for epsilon in [1, 3e-1, 1e-2, 3e-2, 1e-2, 3e-3, 1e-3, 3e-4, 1e-4]:
#     for epsilon in [3e-4, 1e-4, 3e-5, 1e-5, 3e-6, 1e-6]:
#     for epsilon in [1e-5, 3e-6]:
        acc = model_accuracy(X_test, Y_test, unigram_p, bigram_p, lambdas, epsilon)
        print(f"Lambdas: {lambdas}, Epsilon: {epsilon}\t\t, Accuracy: {acc}")

Lambdas: (0.5, 0.48, 0.02), Epsilon: 1		, Accuracy: 0.7285610465116279
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.3		, Accuracy: 0.7605377906976745
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.01		, Accuracy: 0.840843023255814
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.03		, Accuracy: 0.8201308139534884
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.01		, Accuracy: 0.840843023255814
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.003		, Accuracy: 0.8499273255813954
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.001		, Accuracy: 0.862281976744186
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.0003		, Accuracy: 0.8659156976744186
Lambdas: (0.5, 0.48, 0.02), Epsilon: 0.0001		, Accuracy: 0.8677325581395349
Lambdas: (0.6, 0.38, 0.02), Epsilon: 1		, Accuracy: 0.7300145348837209
Lambdas: (0.6, 0.38, 0.02), Epsilon: 0.3		, Accuracy: 0.7579941860465116
Lambdas: (0.6, 0.38, 0.02), Epsilon: 0.01		, Accuracy: 0.8375726744186046
Lambdas: (0.6, 0.38, 0.02), Epsilon: 0.03		, Accuracy: 0.8132267441860465
Lambdas: (0.6, 0.38, 0.02), Epsilon: 0.01		

## Hyperparameters #2:
$\lambda_3 = 0.8, \lambda_2 = 0.18, \lambda_1 = 0.02, \epsilon = 1e-05$

In [125]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.8, 0.18, 0.02), 1e-05)

0.873546511627907