In [1]:
import numpy as np

In [9]:
set_o_consonants = {
    'ক', 'খ', 'গ', 'ঘ', 'ঙ',
    'চ', 'ছ', 'জ', 'ঝ', 'ঞ',
    'ট', 'ঠ', 'ড', 'ঢ', 'ণ',
    'ত', 'থ', 'দ', 'ধ', 'ন',
    'প', 'ফ', 'ব', 'ভ', 'ম',
    'য', 'য়', 'র', 'ল',
    'শ', 'ষ', 'স', 'হ',
    'ক়', 'খ়', 'গ়', 'জ়', 'ফ়'
}
# set_o_punctuations = {
#     '!', '।', '\n', ',', '-', '(', ')', '?', '.'
# }
set_o_vowels = {
    'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'ৠ', 'ঌ', 'ৡ', 'এ', 'ঐ', 'ও', 'ঔ',
    'ং', 'ঁ', 'ঃ','ৎ', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', ' ো', ' ৌ', 'া', 'ি', 'ী'
}
set_o_special_tokens = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}
set_o_independent_vowels = {'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'ৠ', 'ঌ', 'ৡ', 'এ', 'ঐ', 'ও', 'ঔ'}
set_o_dependent_vowels = { 'ং', 'ঁ', 'ঃ','ৎ', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', ' ো', ' ৌ', 'া', 'ি', 'ী'}

In [2]:
def white_space_tokenizer(text):
    return text.split(' ') #Splitting Based Upon White Space

In [3]:
with open('bn_100.txt', "r", errors = 'ignore') as f:
    n_lines = len(f.readlines())
    
def find_unigram_freq(corpus):
    # Find Uni-gram Frequency of the elements of the corpus
    uni_dict = {}
    for i in range(len(corpus)):
        line = corpus[i]
        for j in range(len(line)):
            if line[j] != ' ':
                if line[j] in uni_dict.keys():
                    uni_dict[line[j]] = uni_dict[line[j]] + 1
                else:
                    uni_dict[line[j]] = 0

    keys_list = list(uni_dict.keys())
    values_list = list(uni_dict.values())
    sorted_value_index = np.argsort(values_list)
    sorted_char_dict = {keys_list[i]: values_list[i] for i in sorted_value_index}

    return (keys_list, sorted_value_index)
    
def form_bigram_freq(corpus):
    # Find Bi-gram Frequency of the elements of the corpus
    bigram_dict = {}
    for i in range(len(corpus)):
        line = corpus[i]
        for j in range(len(line) - 1):
            if line[j] != ' ' and line[j+1] != ' ' and line[j] != '#':
                if (line[j], line[j+1]) in bigram_dict.keys():
                    bigram_dict[(line[j], line[j+1])] = bigram_dict[(line[j], line[j+1])] + 1
                else:
                    bigram_dict[(line[j], line[j+1])] = 0

    keys_list = list(bigram_dict.keys())
    values_list = list(bigram_dict.values())
    sorted_value_index = np.argsort(values_list)
    sorted_char_dict = {keys_list[i]: values_list[i] for i in sorted_value_index}

    return (keys_list, sorted_value_index)

def syllable_form_from_line(text):

    #Split the text into Syllable
    
    syllable_list = []
    i = 0

    while i < len(text):

        n = len(text[i])
        if n == 1:
            syllable_list.append(text[i])
            i = i+ 1
        else:
            j = 0
            while j<len(text[i]):
                syllable = ''
                while text[i][j] == '#' and (j+1)<len(text[i]):
                    j = j + 1
                if text[i][j] in set_o_independent_vowels:
                    syllable = syllable + text[i][j]
                elif text[i][j] in set_o_consonants:
                    syllable = syllable + text[i][j]
                    if (j+1) < len(text[i]) :
                        if text[i][j+1] in set_o_consonants or text[i][j+1] == 'অ':
                            syllable = syllable + text[i][j+1]
                            j = j + 1
                j = j + 1
                while j < len(text[i]) and text[i][j] in set_o_dependent_vowels:
                    syllable = syllable + text[i][j]
                    j = j + 1
                if syllable != '':
                    syllable_list.append(syllable)
            i = i + 1

    return syllable_list

def syllable_form_from_whole_corpus(unicode_corrected_corpus):
    """
    Args: A List of List of Unicode corrected Chars corresponding to each sentence, words are separated by ' '
    """
    syllable_corpus = []
    for i in range(len(unicode_corrected_corpus)):
        syllable_corpus.append(syllable_form_from_line(unicode_corrected_corpus[i]))
    return syllable_corpus
    
def form_char_corpus_from_token(token_list_1k):

    char_corpus = []
    for i in range(len(token_list_1k)):
        char_line_corpus = []
        for j in range(len(token_list_1k[i])):
            if token_list_1k[i][j] not in set_o_special_tokens:
                text = token_list_1k[i][j].replace('▁', '')
                char_line_corpus += list(text)
        char_corpus.append(char_line_corpus)

    return char_corpus

In [6]:
token_list_corpus = []
max_len = 1000
with open('bn_100.txt', "r", errors = 'ignore') as f:
    for i in range(n_lines):
        line = f.readline()
        token_list_corpus.append(white_space_tokenizer(line))
# Uni-gram Tokens
( keys_list, sorted_value_index ) = find_unigram_freq(token_list_corpus)
print('The Top-20 Frequent Tokens are: ')
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = '  ')

The Top-20 Frequent Tokens are: 
ও  করে  এই  থেকে  এ  এবং  করা  তার  না  এক  তিনি  জন্য  আর  নিয়ে  করতে  একটি  সঙ্গে  যে  কিন্তু  বলেন,  

In [7]:
# Bi-Gram Tokens
( keys_list, sorted_value_index ) = form_bigram_freq(token_list_corpus)
print('The Top-20 Frequent Tokens are: ')
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = '  ')

The Top-20 Frequent Tokens are: 
('', '')  ('.', '.')  ('তিনি', 'বলেন,')  ('করা', 'হয়।\n')  ('করা', 'হয়েছে।\n')  ('সাধারণ', 'সম্পাদক')  ('আওয়ামী', 'লীগের')  ('এ', 'সময়')  ('শেষ', 'আপডেট:')  ('করা', 'হয়।')  ('করা', 'হয়েছে।')  ('এর', 'আগে')  ('করার', 'জন্য')  ('কাছ', 'থেকে')  ('করা', 'হয়েছে')  ('প্রধানমন্ত্রী', 'শেখ')  ('পক্ষ', 'থেকে')  ('শুরু', 'করে')  ('আওয়ামী', 'লীগ')  ('এ', 'বিষয়ে')  

In [10]:
syllable_corpus = syllable_form_from_whole_corpus( token_list_corpus )

In [11]:
# Bi-gram Syllables
( keys_list, sorted_value_index ) = form_bigram_freq(syllable_corpus)
print('The top 20 Bi-gram Frequency Syllable:')
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = ' ')

The top 20 Bi-gram Frequency Syllable:
('দে', 'র') ('ে', 'ছে') ('ও', 'া') ('হ', 'ে') ('ছে', 'ন') ('রে', 'র') ('থে', 'কে') ('র', 'আ') ('এ', 'ই') ('নে', 'র') ('কা', 'র') ('বা', 'র') ('আ', 'র') ('র', 'এ') ('তা', 'র') ('া', 'র') ('র', 'প') ('র', 'অ') ('নি', 'ে') ('র', 'বি') 

In [12]:
char_corpus = form_char_corpus_from_token(token_list_corpus)

In [13]:
# Bi-gram Characters
( keys_list, sorted_value_index ) = form_bigram_freq(char_corpus)
print('The top 20 Frequent Characters:')
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = ' ')

The top 20 Frequent Characters:
('া', 'র') ('ে', 'র') ('্', 'র') ('া', 'ন') ('্', 'য') ('র', 'া') ('র', 'ে') ('ন', '্') ('ে', 'ন') ('ব', 'া') ('।', '\n') ('র', '্') ('ক', 'র') ('ক', 'া') ('ত', 'া') ('ন', 'া') ('্', 'ত') ('ল', 'ে') ('ন', 'ি') ('ক', 'ে') 

## Question 5

In [21]:
# Tokenizing the earlier given corpus by mBERT Tokenizer
with open('cs689_assignment.txt', 'r') as f:
    n_lines = len(f.readlines())
max_len = 1000
token_list_1k = []
with open('cs689_assignment.txt', 'r') as f:
    for i in range(n_lines):
        line = f.readline()
        if i % 3 == 0:
            token_list_1k.append(white_space_tokenizer( line ))

In [26]:
print(token_list_1k[4])

['5.', 'স্থির', 'হয়,', 'কোনও', 'ক্ষেত্রে', 'মতবিরোধ', 'থাকলে', 'উভয়পক্ষই', 'সংযম', 'দেখাবে।\n']


In [37]:
# Calculating the precision, recall and F1 Score
def metric_per_line(output_token, ground_truth_token):
    output_token = set(output_token)
    ground_truth_token = set(ground_truth_token)
    true_positive = len( output_token & ground_truth_token)
    false_positive = len( output_token - ground_truth_token)
    false_negative = len(ground_truth_token - output_token)

    if (true_positive + false_positive) > 0:
        precision = true_positive / ( true_positive + false_positive )
    else:
        precision = 0
    if ( true_positive + false_negative) > 0:
        recall = true_positive / (true_positive + false_negative)
    else:
        recall = 0
    if precision > 0 and recall > 0:
        f1_score = 2 / ((1/precision) + (1 / recall))
    else:
        f1_score = 0

    return (precision, recall, f1_score)

def mean_metric_calculate(output_tokens, ground_truth_txt_filename = 'cs689_assignment.txt'):

    n_lines = len(output_tokens) # Num of Lines
    (sum_precision, sum_recall, sum_f1_score) = (0, 0, 0)
    with open(ground_truth_txt_filename, 'r') as f:
        for i in range(n_lines):
            temp = f.readline() # Removing the text line
            ground_truth = f.readline()  # Reading the even number sentences of the labels
            temp = f.readline() # Removing Exra-ine
            # print(ground_truth)
            # print(type
            (t_p, t_r, t_f) = metric_per_line(output_tokens[i], ground_truth)
            sum_precision += t_p
            sum_recall += t_r
            sum_f1_score += t_f
    (mean_precision, mean_recall, mean_f1_score) = (sum_precision / n_lines, sum_recall / n_lines, sum_f1_score / n_lines)

    return (mean_precision, mean_recall, mean_f1_score)
                        

In [38]:
(preision, recall, f1_score) = mean_metric_calculate(token_list_1k)

In [39]:
print(f'Precision: {preision}, Recall: {recall}, F1 Score: {f1_score}' )

Precision: 0.032878510378510376, Recall: 0.015375640161614917, F1 Score: 0.0208486781247428
