# 1. Training with train set

In [1]:
def read_poem(file_name):
    with open(file_name, 'r', encoding="utf-8") as f:
        return f.read().splitlines()

ferdowsi_lines = read_poem('train_set/ferdowsi_train.txt')
hafez_lines = read_poem('train_set/hafez_train.txt')
molana_lines = read_poem('train_set/molavi_train.txt')

In [2]:
print(f"Ferdowsi length: {len(ferdowsi_lines)}")
print(f"Hafez length: {len(hafez_lines)}")
print(f"Molana length: {len(molana_lines)}")

Ferdowsi length: 9000
Hafez length: 7700
Molana length: 8000


In [3]:
all_lines = ferdowsi_lines + hafez_lines + molana_lines
p_ferdowsdi = len(ferdowsi_lines) / len(all_lines)
p_hafez = len(hafez_lines) / len(all_lines)
p_molana = len(molana_lines) / len(all_lines)
lang_probabilities = [p_ferdowsdi, p_hafez, p_molana]

In [4]:
print(f"Ferdowsi probability: {p_ferdowsdi}")
print(f"Hafez probability: {p_hafez}")
print(f"Molana probability: {p_molana}")

Ferdowsi probability: 0.3643724696356275
Hafez probability: 0.3117408906882591
Molana probability: 0.32388663967611336


In [5]:
all_lines_with_token = [f"<S> {l} </S>" for l in all_lines]
ferdowsi_lines_with_token = [f"<S> {l} </S>" for l in ferdowsi_lines]
hafez_lines_with_token = [f"<S> {l} </S>" for l in hafez_lines]
molana_lines_with_token = [f"<S> {l} </S>" for l in molana_lines]

In [6]:
print(all_lines_with_token[:3])
print(ferdowsi_lines_with_token[:3])
print(hafez_lines_with_token[:3])
print(molana_lines_with_token[:3])

['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> جهان چون بزاری برآید همی </S>', '<S> بدو نیک روزی سرآید همی </S>', '<S> چو بستی کمر بر در راه آز </S>']
['<S> الا یا ایها الساقی ادر کاسا و ناولها </S>', '<S> که عشق آسان نمود اول ولی افتاد مشکل\u200cها </S>', '<S> به بوی نافه\u200cای کاخر صبا زان طره بگشاید </S>']
['<S> بشنو از نی ، چون حکایت می\u200cکند </S>', '<S> واز جدائی\u200cها شکایت می\u200cکند </S>', '<S> کز نیستان تا مرا ببریده اند </S>']


In [7]:
all_words = " ".join(all_lines_with_token).split(" ")
ferdowsi_words = " ".join(ferdowsi_lines_with_token).split(" ")
hafez_words = " ".join(hafez_lines_with_token).split(" ")
molana_words = " ".join(molana_lines_with_token).split(" ")

In [8]:
print(all_words[:10])
print(ferdowsi_words[:10])
print(hafez_words[:10])
print(molana_words[:10])

['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'جهان', 'چون', 'بزاری', 'برآید', 'همی', '</S>', '<S>', 'بدو', 'نیک']
['<S>', 'الا', 'یا', 'ایها', 'الساقی', 'ادر', 'کاسا', 'و', 'ناولها', '</S>']
['<S>', 'بشنو', 'از', 'نی', '،', 'چون', 'حکایت', 'می\u200cکند', '</S>', '<S>']


In [9]:
all_words_set = set(all_words)
print(len(all_words_set))

17158


## 1.1 Creating dictionary

In [10]:
dictionary = [w for w in all_words_set if all_words.count(w) >= 2]
len(dictionary)

8075

## 1.2 Unigram model

In [11]:
# 0: ferdowsi, 1: hafez, 2: molana
unigram_p = [{}, {}, {}]
for w in dictionary:
    unigram_p[0][w] = ferdowsi_words.count(w) / len(ferdowsi_words)
    unigram_p[1][w] = hafez_words.count(w) / len(hafez_words)
    unigram_p[2][w] = molana_words.count(w) / len(molana_words)

## 1.3 Bigram model

In [12]:
def create_bigrams(lines):
    return [b for l in lines for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
    
# 0: ferdowsi, 1: hafez, 2: molana
bigrams = []
bigrams.append(create_bigrams(ferdowsi_lines_with_token))
bigrams.append(create_bigrams(hafez_lines_with_token))
bigrams.append(create_bigrams(molana_lines_with_token))

In [13]:
def calculate_bigram_probability(lines_with_token, bigram, dictionary):
    bigram_p = {}
    lines_joined = " ".join(lines_with_token)
    for couple in bigram:
        if (couple[0] in dictionary) and (couple[1] in dictionary) and (couple not in bigram_p):
            denominator = lines_joined.count(couple[0])
            if denominator != 0:
                bigram_p[couple] = lines_joined.count(" ".join(couple)) / denominator
    
    return bigram_p

bigram_p = []
bigram_p.append(calculate_bigram_probability(ferdowsi_lines_with_token, bigrams[0], dictionary))
bigram_p.append(calculate_bigram_probability(hafez_lines_with_token, bigrams[1], dictionary))
bigram_p.append(calculate_bigram_probability(molana_lines_with_token, bigrams[2], dictionary))

## 1.4 Backoff model

In [14]:
def backoff_model(line_with_token, unigram_p, bigram_p, lambdas, epsilon):
    assert abs(lambdas[0] + lambdas[1] + lambdas[2] - 1) < 1e-5
    
    splitted_line = line_with_token.split(" ")
    unigram = splitted_line
    bigram = [b for b in zip(splitted_line[:-1], splitted_line[1:])]
    
    probabilities = [p for p in lang_probabilities]
    for couple in bigram:
        for i in range(3):
            probabilities[i] *= lambdas[0] * bigram_p[i].get(couple, 0) + lambdas[1] * unigram_p[i].get(couple[1], 0) + lambdas[2] * epsilon
    
    return probabilities

# 2. Testing with test set

In [15]:
X_test = []
Y_test = []

with open("test_set/test_file.txt", 'r', encoding="utf-8") as f:
    for line in f.read().splitlines():
        y, x = line.strip().split("\t")
        X_test.append(f"<S> {x} </S>")
        Y_test.append(int(y) - 1)

In [16]:
print(X_test[:3])
print(Y_test[:3])

['<S> وزان جایگه نالهٔ گاودم </S>', '<S> شنیدند و آواز رویینه خم </S>', '<S> جهاندار بیدار لشکر براند </S>']
[0, 0, 0]


In [17]:
def arg_max(input_list):
    return max(zip(input_list, range(len(input_list))))[1]

In [18]:
def predict(line_with_token, unigram_p, bigram_p, lambdas, epsilon):
    probabilities = backoff_model(line_with_token, unigram_p, bigram_p, lambdas, epsilon)
    return arg_max(probabilities)

In [19]:
predict(X_test[0], unigram_p, bigram_p, (0.8, 0.15, 0.05), 1e-3)

0

In [20]:
def model_accuracy(X_test, Y_test, unigram_p, bigram_p, lambdas, epsilon):
    correct = 0
    for index, line in enumerate(X_test):
        if predict(line, unigram_p, bigram_p, lambdas, epsilon) == Y_test[index]:
            correct += 1
    
    return correct / len(X_test)

## Hyperparameters #1:
$\lambda_3 = 0.5, \lambda_2 = 0.48, \lambda_1 = 0.02, \epsilon = 0.01$

$Accuracy: 0.8404$

In [21]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.5, 0.48, 0.02), 0.01)

0.8404796511627907

In [22]:
# for lambdas in [(0.5, 0.48, 0.02), (0.6, 0.38, 0.02), (0.7, 0.28, 0.02), (0.8, 0.18, 0.02), (0.9, 0.08, 0.02)]:
# for lambdas in [(0.8, 0.13, 0.07), (0.8, 0.14, 0.06), (0.8, 0.15, 0.05), (0.8, 0.16, 0.04), (0.8, 0.17, 0.03), (0.8, 0.18, 0.02), (0.8, 0.19, 0.01)]:
for lambdas in [(0.8, 0.15, 0.05), (0.82, 0.13, 0.05), (0.84, 0.11, 0.05), (0.86, 0.09, 0.05), (0.88, 0.07, 0.05), (0.9, 0.05, 0.05)]:
#     for epsilon in [1, 3e-1, 1e-2, 3e-2, 1e-2, 3e-3, 1e-3, 3e-4, 1e-4]:
#     for epsilon in [3e-4, 1e-4, 3e-5, 1e-5, 3e-6, 1e-6]:
    for epsilon in [1e-5, 3e-6]:
        acc = model_accuracy(X_test, Y_test, unigram_p, bigram_p, lambdas, epsilon)
        print(f"Lambdas: {lambdas}, Epsilon: {epsilon}\t\t, Accuracy: {acc}")

Lambdas: (0.8, 0.15, 0.05), Epsilon: 1e-05		, Accuracy: 0.8710029069767442
Lambdas: (0.8, 0.15, 0.05), Epsilon: 3e-06		, Accuracy: 0.873546511627907
Lambdas: (0.82, 0.13, 0.05), Epsilon: 1e-05		, Accuracy: 0.8699127906976745
Lambdas: (0.82, 0.13, 0.05), Epsilon: 3e-06		, Accuracy: 0.873546511627907
Lambdas: (0.84, 0.11, 0.05), Epsilon: 1e-05		, Accuracy: 0.8695494186046512
Lambdas: (0.84, 0.11, 0.05), Epsilon: 3e-06		, Accuracy: 0.8728197674418605
Lambdas: (0.86, 0.09, 0.05), Epsilon: 1e-05		, Accuracy: 0.8680959302325582
Lambdas: (0.86, 0.09, 0.05), Epsilon: 3e-06		, Accuracy: 0.8717296511627907
Lambdas: (0.88, 0.07, 0.05), Epsilon: 1e-05		, Accuracy: 0.8666424418604651
Lambdas: (0.88, 0.07, 0.05), Epsilon: 3e-06		, Accuracy: 0.8706395348837209
Lambdas: (0.9, 0.05, 0.05), Epsilon: 1e-05		, Accuracy: 0.8626453488372093
Lambdas: (0.9, 0.05, 0.05), Epsilon: 3e-06		, Accuracy: 0.8702761627906976


## Hyperparameters #2:
$\lambda_3 = 0.8, \lambda_2 = 0.15, \lambda_1 = 0.05, \epsilon = 0.01$

$Accuracy: 0.8077$

In [23]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.8, 0.15, 0.05), 0.01)

0.8077761627906976

## Hyperparameters #3:
$\lambda_3 = 0.5, \lambda_2 = 0.48, \lambda_1 = 0.02, \epsilon = 3e-06$

$Accuracy: 0.8611$

In [24]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.5, 0.48, 0.02), 3e-06)

0.8611918604651163

## Hyperparameters #4:
$\lambda_3 = 0.8, \lambda_2 = 0.15, \lambda_1 = 0.05, \epsilon = 3e-06$

$Accuracy: 0.8735$

In [25]:
model_accuracy(X_test, Y_test, unigram_p, bigram_p, (0.8, 0.15, 0.05), 3e-06)

0.873546511627907

به نظر می‌رسد بهترین پارامترها را [حالت چهارم](#Hyperparameters-%234%3A) هستند که در این حالت دقت مدل در حدود ۸۷.۳۵ درصد است که دقت مناسبی است.


علت مناسب بودن این پارامترها این است که در این حالت برای کلمه‌هایی که در واژه‌نامه وجود ندارند، تنها یک احتمال بسیار کوچک (برای صفر نشدن احتمال کل جمله) در نظر گرفته می‌شود. هم‌چنین در این حالت وزن بالایی به مدل بایگرام داده می‌شود. از آنجا که تقریب بایگرام تقریب دقیق‌تری است، وزن بیشتر این تقریب نسبت به تقریب یونی‌گرام، باعث بهتر شدن دقت مدل شده است.