In [2]:
import numpy as np
import transformers

In [2]:
# Bengali consonants used by the syllable splitter.
# NOTE(review): the nukta forms ('য়' and the last row) may be stored as
# base letter + U+09BC, i.e. two code points; such entries can never equal a
# single character taken from a token — confirm and normalise if needed.
set_o_consonants = {
    'ক', 'খ', 'গ', 'ঘ', 'ঙ',
    'চ', 'ছ', 'জ', 'ঝ', 'ঞ',
    'ট', 'ঠ', 'ড', 'ঢ', 'ণ',
    'ত', 'থ', 'দ', 'ধ', 'ন',
    'প', 'ফ', 'ব', 'ভ', 'ম',
    'য', 'য়', 'র', 'ল',
    'শ', 'ষ', 'স', 'হ',
    'ক়', 'খ়', 'গ়', 'জ়', 'ফ়'
}
# set_o_punctuations = {
#     '!', '।', '\n', ',', '-', '(', ')', '?', '.'
# }
set_o_special_tokens = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}
set_o_independent_vowels = {'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'ৠ', 'ঌ', 'ৡ', 'এ', 'ঐ', 'ও', 'ঔ'}
# Dependent signs that may trail a consonant or vowel inside a syllable.
# U+09CB (o-kar) and U+09CC (ou-kar) are written as escapes: the previous
# literals carried a leading space, which made them two-character strings
# that never matched a single character during lookup.
set_o_dependent_vowels = {
    'ং', 'ঁ', 'ঃ', 'ৎ', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ',
    '\u09cb', '\u09cc',
    'া', 'ি', 'ী'
}
# Every vowel symbol: derived from the two sets above so they cannot drift apart.
set_o_vowels = set_o_independent_vowels | set_o_dependent_vowels

In [3]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def mBERT_tokenize_text(line, max_len):
    """Tokenize one line of text with the multilingual BERT tokenizer.

    Args:
        line: the text to tokenize.
        max_len: maximum number of token ids to keep; longer inputs are truncated.

    Returns:
        A list of token strings. Special tokens are included because the line
        is encoded with ``add_special_tokens=True``.
    """
    token_ids = tokenizer.encode(
        line,
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
    )
    # Map the whole id sequence back to token strings in one call.
    return tokenizer.convert_ids_to_tokens(token_ids)

In [4]:
# Tokenize every line of the corpus in a single pass over the file.
# The encoding is passed explicitly because open() otherwise uses a
# platform-dependent default; errors='ignore' is kept from the original.
# NOTE(review): 'bn_100.txt' is assumed to be UTF-8 encoded — confirm.
max_len = 1000
token_list_1k = []
with open('bn_100.txt', "r", encoding = 'utf-8', errors = 'ignore') as f:
    for line in f:
        token_list_1k.append(mBERT_tokenize_text(line, max_len))
# One entry is appended per line, so this equals the line count of the file.
# The name stays defined for any later cell that still relies on it.
n_lines = len(token_list_1k)

In [5]:
# Number of tokenized lines (one entry is appended per line read from the file)
len(token_list_1k)

339479

In [6]:
# Inspect the tokens produced for the first line of the corpus
print(token_list_1k[0])

['[CLS]', 'গ', '##্রে', '##প্ত', '##ার', '##ক', '##ৃত', '##রা', 'হলো', ',', 'ক', '##ু', '##মি', '##ল', '##্', '##লা', 'জেলার', 'সদর', 'ক', '##ো', '##ত', '##য়া', '##লী', 'মডেল', 'থ', '##ানা', '##র', 'শ', '##াস', '##ন', '##গ', '##া', '##ছ', '##া', 'প', '##াল', '##প', '##াড়া', '[UNK]', 'স', '[UNK]', 'ম', '##িল', '##স', 'এলাকা', '##র', 'ই', '##উ', '##ন', '##ু', '##স', 'ম', '##িয়ার', 'ছ', '##েলে', 'ম', '##হ', '##াস', '##ীন', '(', '২৮', ')', ',', 'ম', '##ৃত', 'বা', '##হার', 'ম', '##িয়ার', 'স', '##্ত্রী', 'ফ', '##াতে', '##মা', 'ব', '##ে', '##গ', '##ম', 'আল', '##োন', '##ী', '(', '৪', '##২', ')', '।', '[SEP]']


In [7]:
def find_unigram_freq(corpus):
    """Count how often each item occurs in a corpus and rank the items.

    Args:
        corpus: a sequence of lines, each line being a sequence of items
            (tokens, syllables or characters). Items equal to ' ' are ignored.

    Returns:
        A tuple ``(keys_list, sorted_value_index)``. ``keys_list`` holds the
        distinct items in first-seen order. ``sorted_value_index`` is a NumPy
        array of indices into ``keys_list`` ordered by ascending count, so the
        most frequent items are at the end.
    """
    uni_dict = {}
    for line in corpus:
        for item in line:
            if item != ' ':
                # The first occurrence counts as 1 (it used to be stored as 0).
                uni_dict[item] = uni_dict.get(item, 0) + 1

    keys_list = list(uni_dict)
    sorted_value_index = np.argsort(list(uni_dict.values()))

    return (keys_list, sorted_value_index)


In [17]:
# Uni-gram frequency of the mBERT tokens (max length 1000)
keys_list, sorted_value_index = find_unigram_freq(token_list_1k)
print('The Top-20 Frequent Tokens are: ')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for token_index in top_indices:
    print(keys_list[token_index], end = '  ')

The Top-20 Frequent Tokens are: 
।  ##র  [CLS]  [SEP]  ##ি  ##ে  স  ব  ##া  ##ন  প  ##ু  ,  ক  ##ম  ##ো  ##্  আ  ##ের  ##ক  

In [9]:
def form_bigram_freq(corpus):
    """Count adjacent pairs in each line of a corpus and rank the pairs.

    Args:
        corpus: a sequence of lines, each line being a sequence of items
            (tokens, syllables or characters). A pair is skipped when either
            member is ' ' or when its first member is '#'.

    Returns:
        A tuple ``(keys_list, sorted_value_index)``. ``keys_list`` holds the
        distinct ``(first, second)`` pairs in first-seen order.
        ``sorted_value_index`` is a NumPy array of indices into ``keys_list``
        ordered by ascending count, so the most frequent pairs are at the end.
    """
    bigram_dict = {}
    for line in corpus:
        # Pair every item with its right-hand neighbour within the same line.
        for first, second in zip(line, line[1:]):
            if first != ' ' and second != ' ' and first != '#':
                pair = (first, second)
                # The first occurrence counts as 1 (it used to be stored as 0).
                bigram_dict[pair] = bigram_dict.get(pair, 0) + 1

    keys_list = list(bigram_dict)
    sorted_value_index = np.argsort(list(bigram_dict.values()))

    return (keys_list, sorted_value_index)

In [10]:
 # Bi-gram frequency of the mBERT tokens (max length 1000)
keys_list, sorted_value_index = form_bigram_freq(token_list_1k)
print('The Top-20 Bi-gram Tokens are: ')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The Top-20 Bi-gram Tokens are: 
('।', '[SEP]') ('প', '##্র') ('ব', '##ি') ('স', '##ম') ('##ম', '##্') ('ক', '##র') ('##ন', ',') ('স', '##ং') ('ন', '##ির') ('##্', '##প') ('পর', '##ি') ('দ', '##ে') ('##চ', '##্') ('প', '##া') ('ম', '##ু') ('##্', '##ষ') ('##া', '##ঁ') ('##ন', '।') ('জ', '##ানা') ('.', '.') 

In [11]:
# Splitting sentence into syllables
def syllable_form_from_line(text):
    """Split the tokens of one line into syllable-like chunks.

    Args:
        text: a list of token strings for one line (WordPiece continuation
            tokens may start with '#').

    Returns:
        A flat list of chunks. Single-character tokens are copied as they are,
        tokens found in ``set_o_special_tokens`` are skipped, and every other
        token is scanned character by character.
    """
    syllable_list = []

    for token in text:
        size = len(token)

        if size == 1:
            syllable_list.append(token)
            continue
        if token in set_o_special_tokens:
            continue

        pos = 0
        while pos < size:
            current = ''
            # Step over leading '#' markers, but never past the last character.
            while token[pos] == '#' and (pos + 1) < size:
                pos += 1

            head = token[pos]
            if head in set_o_independent_vowels:
                current += head
            elif head in set_o_consonants:
                current += head
                # A directly following consonant (or 'অ') joins the same chunk.
                if (pos + 1) < size:
                    follower = token[pos + 1]
                    if follower in set_o_consonants or follower == 'অ':
                        current += follower
                        pos += 1
            # NOTE(review): a character in neither set contributes nothing and
            # is dropped here — confirm this is intended.
            pos += 1

            # Attach any run of dependent vowel signs that follows.
            while pos < size and token[pos] in set_o_dependent_vowels:
                current += token[pos]
                pos += 1

            if current != '':
                syllable_list.append(current)

    return syllable_list

def syllable_form_from_whole_corpus(unicode_corrected_corpus):
    """Apply ``syllable_form_from_line`` to every line of a corpus.

    Args:
        unicode_corrected_corpus: a list of lines, each line being the list
            of items that ``syllable_form_from_line`` accepts.

    Returns:
        A list with one syllable list per input line, in the same order.
    """
    return [syllable_form_from_line(line) for line in unicode_corrected_corpus]

In [12]:
# Build the syllable corpus from the tokens produced with max length 1000
syllable_corpus = syllable_form_from_whole_corpus( token_list_1k )

In [13]:
# Bi-gram frequency of the syllables (max length 1000)
keys_list, sorted_value_index = form_bigram_freq(syllable_corpus)
print('The top 20 Frequency Syllable:')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The top 20 Frequency Syllable:
('য', 'ে') ('য', 'া') ('প', 'র') ('ক', 'র') ('র', 'স') ('র', 'প') ('র', 'ক') ('ত', 'র') ('র', 'ব') ('দে', 'র') ('ন', 'ত') ('ন', 'র') ('স', 'ম') ('ন', '।') ('র', 'ম') ('ে', 'ছে') ('ও', 'য') ('এ', 'ক') ('দ', 'র') ('র', 'র') 

In [18]:
def form_char_corpus_from_token(token_list_1k):
    """Flatten tokenized lines into per-line character lists.

    Args:
        token_list_1k: a list of lines, each line being a list of token strings.

    Returns:
        A list with one character list per input line. Tokens found in
        ``set_o_special_tokens`` are skipped and every '##' is removed from
        the remaining tokens before they are split into characters.
    """
    char_corpus = []
    for tokens in token_list_1k:
        line_chars = []
        for token in tokens:
            if token in set_o_special_tokens:
                continue
            line_chars.extend(token.replace('##', ''))
        char_corpus.append(line_chars)

    return char_corpus

# Character-level corpus derived from the tokens produced with max length 1000
char_corpus = form_char_corpus_from_token(token_list_1k)

In [19]:
# Bi-gram frequency of the characters (max length 1000)
keys_list, sorted_value_index = form_bigram_freq(char_corpus)
print('The top 20 Frequent Characters:')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The top 20 Frequent Characters:
('য', '়') ('া', 'র') ('ে', 'র') ('্', 'র') ('া', 'ন') ('্', 'য') ('র', 'া') ('র', 'ে') ('ন', '্') ('ে', 'ন') ('়', 'ে') ('ব', 'া') ('র', '্') ('ক', 'র') ('ক', 'া') ('ত', 'া') ('ন', 'া') ('্', 'ত') ('ন', 'ি') ('ল', 'ে') 

## Running the mBERT tokenizer with a maximum length of 2k tokens

In [21]:
# Tokenize the same corpus again with a larger truncation limit.
# The file is iterated directly rather than via a line count taken from
# another cell, so this cell reads the whole file even if n_lines has been
# rebound elsewhere in the notebook.
# NOTE(review): 'bn_100.txt' is assumed to be UTF-8 encoded — confirm.
token_list_2k = []
max_len = 2000
with open('bn_100.txt', "r", encoding = 'utf-8', errors = 'ignore') as f:
    for line in f:
        token_list_2k.append(mBERT_tokenize_text(line, max_len))


In [22]:
# Uni-gram frequency of the mBERT tokens (max length 2000)
keys_list, sorted_value_index = find_unigram_freq(token_list_2k)
print('The Top-20 Frequent Tokens are: ')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for token_index in top_indices:
    print(keys_list[token_index], end = '  ')

The Top-20 Frequent Tokens are: 
।  ##র  [CLS]  [SEP]  ##ি  ##ে  স  ব  ##া  ##ন  প  ##ু  ,  ক  ##ম  ##ো  ##্  আ  ##ের  ##ক  

In [23]:
# Bi-gram frequency of the mBERT tokens (max length 2000)
keys_list, sorted_value_index = form_bigram_freq(token_list_2k)
print('The Top-20 Bi-gram Tokens are: ')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The Top-20 Bi-gram Tokens are: 
('।', '[SEP]') ('প', '##্র') ('ব', '##ি') ('স', '##ম') ('##ম', '##্') ('ক', '##র') ('##ন', ',') ('স', '##ং') ('##্', '##প') ('ন', '##ির') ('পর', '##ি') ('দ', '##ে') ('##চ', '##্') ('প', '##া') ('ম', '##ু') ('##্', '##ষ') ('##া', '##ঁ') ('##ন', '।') ('জ', '##ানা') ('.', '.') 

In [24]:
# Rebuild the syllable corpus from the tokens produced with max length 2000
syllable_corpus = syllable_form_from_whole_corpus( token_list_2k )

In [25]:
# Bi-gram frequency of the syllables (max length 2000)
keys_list, sorted_value_index = form_bigram_freq(syllable_corpus)
print('The top 20 Bi-gram Frequency Syllable:')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The top 20 Bi-gram Frequency Syllable:
('য', 'ে') ('য', 'া') ('প', 'র') ('ক', 'র') ('র', 'স') ('র', 'প') ('র', 'ক') ('ত', 'র') ('র', 'ব') ('দে', 'র') ('ন', 'ত') ('ন', 'র') ('স', 'ম') ('ন', '।') ('র', 'ম') ('ে', 'ছে') ('ও', 'য') ('এ', 'ক') ('দ', 'র') ('র', 'র') 

In [26]:
# Character-level corpus derived from the tokens produced with max length 2000
char_corpus = form_char_corpus_from_token(token_list_2k)

In [27]:
# Bi-gram frequency of the characters (max length 2000)
keys_list, sorted_value_index = form_bigram_freq(char_corpus)
print('The top 20 Frequent Characters:')
# The index array is ordered by ascending count: reverse it, then keep 20.
top_indices = sorted_value_index[::-1][:20]
for pair_index in top_indices:
    print(keys_list[pair_index], end = ' ')

The top 20 Frequent Characters:
('য', '়') ('া', 'র') ('ে', 'র') ('্', 'র') ('া', 'ন') ('্', 'য') ('র', 'া') ('র', 'ে') ('ন', '্') ('ে', 'ন') ('়', 'ে') ('ব', 'া') ('র', '্') ('ক', 'র') ('ক', 'া') ('ত', 'া') ('ন', 'া') ('্', 'ত') ('ন', 'ি') ('ল', 'ে') 

## Question 5: (Tokenization)

In [9]:
# Tokenize the assignment corpus with the mBERT tokenizer.
# Only every third line (index 0, 3, 6, ...) is tokenized; the evaluation
# code reads the line after each of them as its ground truth.
# The file is read once, with an explicit encoding because open() otherwise
# uses a platform-dependent default.
# NOTE(review): 'cs689_assignment.txt' is assumed to be UTF-8 encoded — confirm.
# NOTE(review): this rebinds n_lines, max_len and token_list_1k, which earlier
# cells use for 'bn_100.txt'.
max_len = 1000
token_list_1k = []
n_lines = 0  # total number of lines in the file, as before
with open('cs689_assignment.txt', 'r', encoding = 'utf-8') as f:
    for line_index, line in enumerate(f):
        n_lines += 1
        if line_index % 3 == 0:
            token_list_1k.append(mBERT_tokenize_text(line, max_len))

In [10]:
# Inspect the tokens produced for the first tokenized line of the assignment file
print(token_list_1k[0])

['[CLS]', '1', '.', 'কিন্তু', 'সে', '##টার', 'গ', '##ায়', '##ে', 'হ', '##াত', 'দিয়ে', 'তিনি', 'আ', '##ঁ', '##ত', '##কে', 'ও', '##ঠে', '##ন', '।', '[SEP]']


In [11]:
# Calculating the precision, recall and F1 Score
def metric_per_line(output_token, ground_truth_token):
    """Compute set-based precision, recall and F1 for one line.

    Both arguments are converted with ``set()``, so duplicates are ignored;
    a string argument therefore becomes the set of its characters.

    Args:
        output_token: iterable of predicted items.
        ground_truth_token: iterable of reference items.

    Returns:
        A tuple ``(precision, recall, f1_score)``. Each value is 0 when its
        denominator would be zero; F1 is 0 unless both precision and recall
        are positive.
    """
    predicted = set(output_token)
    reference = set(ground_truth_token)

    hits = len(predicted & reference)
    spurious = len(predicted - reference)
    missed = len(reference - predicted)

    precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0
    recall = hits / (hits + missed) if (hits + missed) > 0 else 0

    if precision > 0 and recall > 0:
        # Harmonic mean of precision and recall.
        f1_score = 2 / ((1 / precision) + (1 / recall))
    else:
        f1_score = 0

    return (precision, recall, f1_score)

def mean_metric_calculate(output_tokens, ground_truth_txt_filename = 'cs689_assignment.txt', ground_truth_parser = None):
    """Average per-line precision, recall and F1 against a ground-truth file.

    The file is consumed three lines at a time: the first and third line of
    each group are discarded and the second is used as the ground truth for
    the corresponding entry of ``output_tokens``.

    Args:
        output_tokens: list with one list of predicted tokens per evaluated line.
        ground_truth_txt_filename: path of the ground-truth text file.
        ground_truth_parser: optional callable that turns the raw ground-truth
            line into a collection of tokens. When None (the default) the raw
            line is passed to ``metric_per_line`` unchanged, as before.

    Returns:
        A tuple ``(mean_precision, mean_recall, mean_f1_score)``;
        ``(0, 0, 0)`` when ``output_tokens`` is empty.
    """
    n_lines = len(output_tokens) # Num of Lines
    if n_lines == 0:
        # Nothing to average; avoids dividing by zero below.
        return (0, 0, 0)

    (sum_precision, sum_recall, sum_f1_score) = (0, 0, 0)
    # Explicit encoding: open() otherwise uses a platform-dependent default.
    # NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
    with open(ground_truth_txt_filename, 'r', encoding = 'utf-8') as f:
        for predicted in output_tokens:
            f.readline()  # skip the text line
            ground_truth = f.readline()  # the label line of this group
            f.readline()  # skip the third line of the group
            # NOTE(review): without a parser the raw line (including its
            # newline) reaches metric_per_line as a string and is compared as
            # a set of characters — confirm the label format and pass a
            # parser such as str.split if the labels are delimited tokens.
            if ground_truth_parser is not None:
                ground_truth = ground_truth_parser(ground_truth)
            (t_p, t_r, t_f) = metric_per_line(predicted, ground_truth)
            sum_precision += t_p
            sum_recall += t_r
            sum_f1_score += t_f

    return (sum_precision / n_lines, sum_recall / n_lines, sum_f1_score / n_lines)
                        

In [12]:
# Mean precision / recall / F1 of the mBERT tokens against the assignment labels.
# NOTE: `preision` is misspelled; it is left as is because the following cell
# reads the same name.
(preision, recall, f1_score) = mean_metric_calculate(token_list_1k)

In [13]:
# Report the averaged metrics (`preision` is the name bound by the previous cell)
print(f'Precision: {preision}, Recall: {recall}, F1 Score: {f1_score}' )

Precision: 0.16535584613130255, Recall: 0.19240626222627186, F1 Score: 0.17692844282523296
