In [2]:
import numpy as np
from transformers import AutoTokenizer


In [17]:
# Character classes used by the syllable splitter. Membership is always tested
# one character at a time, so every entry that is meant to match must be a
# single code point.
set_o_consonants = {
    'ক', 'খ', 'গ', 'ঘ', 'ঙ',
    'চ', 'ছ', 'জ', 'ঝ', 'ঞ',
    'ট', 'ঠ', 'ড', 'ঢ', 'ণ',
    'ত', 'থ', 'দ', 'ধ', 'ন',
    'প', 'ফ', 'ব', 'ভ', 'ম',
    'য', 'য়', 'র', 'ল',
    'শ', 'ষ', 'স', 'হ',
    # NOTE(review): the nukta forms below look like base letter + combining
    # nukta (two code points); if so, a single-character lookup never matches
    # them — confirm whether they need dedicated handling.
    'ক়', 'খ়', 'গ়', 'জ়', 'ফ়'
}
# set_o_punctuations = {
#     '!', '।', '\n', ',', '-', '(', ')', '?', '.'
# }
# Tokens inserted by the tokenizer that carry no text of their own.
set_o_special_tokens = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}
# Vowel letters that can stand on their own.
set_o_independent_vowels = {'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'ৠ', 'ঌ', 'ৡ', 'এ', 'ঐ', 'ও', 'ঔ'}
# Signs that attach to the preceding syllable. The o-kar and au-kar signs are
# written as escapes so no stray leading space can sneak into the literal
# (the previous ' ো' / ' ৌ' entries were two-character strings that could
# never equal a single character of the input).
set_o_dependent_vowels = {
    'ং', 'ঁ', 'ঃ', 'ৎ', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ',
    '\u09cb',  # BENGALI VOWEL SIGN O
    '\u09cc',  # BENGALI VOWEL SIGN AU
    'া', 'ি', 'ী'
}
# Derived instead of retyped so the three sets cannot drift apart.
set_o_vowels = set_o_independent_vowels | set_o_dependent_vowels

In [22]:
# Count the lines of the corpus. The encoding is pinned to UTF-8 so decoding
# does not depend on the platform locale; bytes that still fail to decode are
# dropped, as before.
with open('bn_100.txt', "r", encoding = 'utf-8', errors = 'ignore') as f:
    # Stream the file instead of materialising every line just to count them.
    n_lines = sum(1 for _ in f)
    
def find_unigram_freq(corpus):
    """Rank the distinct items of a corpus by how often they occur.

    Args:
        corpus: iterable of lines, each line an iterable of items (tokens,
            syllables or characters). Items equal to ' ' are not counted.

    Returns:
        A tuple ``(keys_list, sorted_value_index)`` where ``keys_list`` holds
        the distinct items in first-seen order and ``sorted_value_index`` is a
        NumPy array of positions into ``keys_list`` ordered by ascending
        frequency (read it backwards for most-frequent-first).
    """
    uni_dict = {}
    for line in corpus:
        for item in line:
            if item != ' ':  # spaces are separators, not items
                # The first sighting counts as 1 (it used to be stored as 0,
                # leaving every frequency one short).
                uni_dict[item] = uni_dict.get(item, 0) + 1

    keys_list = list(uni_dict)
    sorted_value_index = np.argsort(list(uni_dict.values()))
    return (keys_list, sorted_value_index)
    
def form_bigram_freq(corpus):
    """Rank adjacent item pairs of a corpus by how often they occur.

    Args:
        corpus: iterable of lines, each line a sliceable sequence of items
            (tokens, syllables or characters).

    Returns:
        A tuple ``(keys_list, sorted_value_index)`` where ``keys_list`` holds
        the distinct ``(first, second)`` pairs in first-seen order and
        ``sorted_value_index`` is a NumPy array of positions into
        ``keys_list`` ordered by ascending frequency.

    A pair is skipped when either member is ' ' or when its first member is
    exactly '#'. Pairs never span two lines.
    """
    bigram_dict = {}
    for line in corpus:
        for first, second in zip(line, line[1:]):
            if first != ' ' and second != ' ' and first != '#':
                pair = (first, second)
                # The first sighting counts as 1 (it used to be stored as 0,
                # leaving every frequency one short).
                bigram_dict[pair] = bigram_dict.get(pair, 0) + 1

    keys_list = list(bigram_dict)
    sorted_value_index = np.argsort(list(bigram_dict.values()))

    return (keys_list, sorted_value_index)

def syllable_form_from_line(text):
    """Regroup the tokens of one line into syllable-like chunks.

    Args:
        text: sequence of token strings for a single line.

    Returns:
        A flat list of strings. A token of length one is copied through
        untouched. A longer token is scanned left to right and cut into
        chunks: an independent vowel starts a chunk; a consonant starts a
        chunk and absorbs one directly following consonant or 'অ'; any run of
        dependent vowel signs that follows is appended to the chunk. Characters
        belonging to none of the sets contribute nothing themselves, and empty
        chunks are discarded.
    """
    syllables = []

    for token in text:
        size = len(token)
        if size == 1:
            # Single-character tokens are kept verbatim, whatever they are.
            syllables.append(token)
            continue

        pos = 0
        while pos < size:
            # Step over '#' markers, but never past the last character.
            while token[pos] == '#' and pos + 1 < size:
                pos += 1

            current = token[pos]
            chunk = ''
            if current in set_o_independent_vowels:
                chunk = current
            elif current in set_o_consonants:
                chunk = current
                if pos + 1 < size:
                    follower = token[pos + 1]
                    if follower in set_o_consonants or follower == 'অ':
                        # A consonant (or 'অ') right after a consonant stays
                        # in the same chunk.
                        chunk += follower
                        pos += 1
            pos += 1

            # Trailing dependent signs belong to the chunk just started.
            while pos < size and token[pos] in set_o_dependent_vowels:
                chunk += token[pos]
                pos += 1

            if chunk != '':
                syllables.append(chunk)

    return syllables

def syllable_form_from_whole_corpus(unicode_corrected_corpus):
    """Apply ``syllable_form_from_line`` to every line of a corpus.

    Args:
        unicode_corrected_corpus: list of lines, each line being the sequence
            accepted by ``syllable_form_from_line``.

    Returns:
        A list with one syllable list per input line, in the same order.
    """
    return [syllable_form_from_line(line) for line in unicode_corrected_corpus]
    
def form_char_corpus_from_token(token_list_1k):
    """Explode every token of every line into its individual characters.

    Args:
        token_list_1k: list of lines, each line a list of token strings.

    Returns:
        A list with one flat character list per input line. Tokens found in
        ``set_o_special_tokens`` are skipped, and every '▁' is removed from
        the remaining tokens before they are split.
    """
    char_corpus = []
    for token_line in token_list_1k:
        chars_in_line = []
        for token in token_line:
            if token in set_o_special_tokens:
                continue  # special tokens carry no characters of the text
            # Drop the '▁' marker, then add what is left character by character.
            chars_in_line.extend(token.replace('▁', ''))
        char_corpus.append(chars_in_line)

    return char_corpus

In [3]:

# Load the tokenizer stored under the "ai4bharat/indic-bert" checkpoint name.
# This single instance is what the later cells pass to tokenize_text().
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert") # IndicBERT tokenizer


In [4]:
def tokenize_text(tokenizer, text, max_length):
    """Encode ``text`` and map the resulting ids back to token strings.

    Args:
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...)``
            and ``convert_ids_to_tokens(token_id)``.
        text: the string to tokenize.
        max_length: forwarded to ``encode`` together with ``truncation=True``.

    Returns:
        A list with one converted token per id returned by ``encode``.
    """
    encoded_ids = tokenizer.encode(text, max_length=max_length, truncation=True)
    tokens = []
    for single_id in encoded_ids:
        # Convert id by id, exactly one lookup per encoded position.
        tokens.append(tokenizer.convert_ids_to_tokens(single_id))
    return tokens


In [15]:
token_list_corpus = []
# NOTE(review): max_len is forwarded as the tokenizer's `max_length` together
# with truncation, i.e. it caps the number of tokens kept per line. It does not
# appear to change the vocabulary size, despite the "1k" naming — confirm this
# is the intended experiment.
max_len = 1000 # Maxlen = 1k
with open('bn_100.txt', "r", encoding = 'utf-8', errors = 'ignore') as f:
    # Iterate the file itself so the cell no longer relies on an `n_lines`
    # value computed by some other cell (possibly for a different file).
    for line in f:
        # Pass `max_len`; the name `max_length` is not defined at notebook
        # level and raised NameError on a fresh kernel.
        token_list_corpus.append(tokenize_text(tokenizer, line, max_len))
# Unigram Tokens
keys_list, sorted_value_index = find_unigram_freq(token_list_corpus)
print('The Top-20 Frequent Uni-gram Tokens are: ')
# The index array is ascending by count, so walk it backwards for the top 20.
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = '  ')

The Top-20 Frequent Tokens are: 
য  র  ▁  ।  [SEP]  [CLS]  ন  ক  ত  ▁পর  ,  ট  ষ  ▁স  ▁কর  ▁দ  ব  ▁ব  ল  ড  

In [16]:
# Bigram tokens: rank adjacent token pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(token_list_corpus)
print('The Top-20 Frequent Tokens are: ')
# The index array is ascending by count: its last 20 entries, walked
# backwards, give the most frequent pairs first.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = '  ')

The Top-20 Frequent Tokens are: 
('▁', '[SEP]')  ('।', '▁')  ('▁হ', 'য')  ('য', 'র')  ('▁থ', 'ক')  ('▁', 'র')  ('য', 'ন')  ('য', 'ছ')  ('▁ন', 'য')  ('য', '।')  ('ছ', '।')  ('▁ব', 'য')  ('ন', '।')  ('ন', ',')  ('▁জন', 'য')  ('র', '▁পর')  ('▁দ', 'য')  ('র', '▁')  ('ঙ', 'গ')  ('▁এক', 'ট')  

In [18]:
# Regroup the tokens of every tokenized line into syllable-like chunks.
syllable_corpus = syllable_form_from_whole_corpus( token_list_corpus )

In [19]:
# Bi-gram syllables: rank adjacent syllable pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(syllable_corpus)
print('The top 20 Bi-gram Frequency Syllable:')
# Ascending index array, so take the tail and walk it backwards.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = ' ')

The top 20 Bi-gram Frequency Syllable:
('।', '▁') ('য', 'র') ('হ', 'য') ('ন', 'য') ('য', 'ন') ('এ', 'ক') ('থ', 'ক') ('ন', '।') ('▁', 'র') ('ব', 'য') ('ছ', '।') ('ও', 'য') ('অ', 'ন') ('য', 'ছ') ('আ', 'ম') ('য', 'য') ('র', 'পর') ('দ', 'য') ('য', '।') ('ন', ',') 

In [23]:
# Split every token into characters (special tokens skipped, '▁' removed).
char_corpus = form_char_corpus_from_token(token_list_corpus)

In [24]:
# Bi-gram characters: rank adjacent character pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(char_corpus)
print('The top 20 Frequent Characters:')
# Ascending index array, so take the tail and walk it backwards.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = ' ')

The top 20 Frequent Characters:
('ক', 'র') ('প', 'র') ('ত', 'র') ('র', 'ক') ('র', 'ব') ('র', 'স') ('র', 'ম') ('ব', 'র') ('ন', 'ত') ('র', 'ত') ('ন', 'য') ('ন', 'র') ('র', 'প') ('দ', 'র') ('ম', 'ন') ('জ', 'ন') ('র', 'র') ('র', 'ন') ('ব', 'ল') ('ন', 'ক') 

## IndicBERT with 2k Vocabulary

In [25]:
token_list_corpus = []
# NOTE(review): max_len is forwarded as the tokenizer's `max_length` together
# with truncation, i.e. it caps the number of tokens kept per line. It does not
# appear to change the vocabulary size, despite the "2k" naming — confirm this
# is the intended experiment.
max_len = 2000
with open('bn_100.txt', "r", encoding = 'utf-8', errors = 'ignore') as f:
    # Iterate the file itself so the cell no longer relies on an `n_lines`
    # value computed by some other cell (possibly for a different file).
    for line in f:
        # Pass `max_len`; the name `max_length` is not defined at notebook
        # level and raised NameError on a fresh kernel.
        token_list_corpus.append(tokenize_text(tokenizer, line, max_len))
# Uni-gram Tokens
keys_list, sorted_value_index = find_unigram_freq(token_list_corpus)
print('The Top-20 Frequent Tokens are: ')
# The index array is ascending by count, so walk it backwards for the top 20.
for i in sorted_value_index[::-1][:20]:
    print(keys_list[i], end = '  ')

The Top-20 Frequent Tokens are: 
য  র  ▁  ।  [SEP]  [CLS]  ন  ক  ত  ▁পর  ,  ট  ষ  ▁স  ▁কর  ▁দ  ব  ▁ব  ল  ড  

In [26]:
# Bi-gram tokens: rank adjacent token pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(token_list_corpus)
print('The Top-20 Frequent Tokens are: ')
# The index array is ascending by count: its last 20 entries, walked
# backwards, give the most frequent pairs first.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = '  ')

The Top-20 Frequent Tokens are: 
('▁', '[SEP]')  ('।', '▁')  ('▁হ', 'য')  ('য', 'র')  ('▁থ', 'ক')  ('▁', 'র')  ('য', 'ন')  ('য', 'ছ')  ('▁ন', 'য')  ('য', '।')  ('ছ', '।')  ('▁ব', 'য')  ('ন', '।')  ('ন', ',')  ('▁জন', 'য')  ('র', '▁পর')  ('▁দ', 'য')  ('র', '▁')  ('ঙ', 'গ')  ('▁এক', 'ট')  

In [27]:
# Regroup the tokens of every tokenized line into syllable-like chunks.
syllable_corpus = syllable_form_from_whole_corpus( token_list_corpus )

In [28]:
# Bi-gram syllables: rank adjacent syllable pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(syllable_corpus)
print('The top 20 Bi-gram Frequency Syllable:')
# Ascending index array, so take the tail and walk it backwards.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = ' ')

The top 20 Bi-gram Frequency Syllable:
('।', '▁') ('য', 'র') ('হ', 'য') ('ন', 'য') ('য', 'ন') ('এ', 'ক') ('থ', 'ক') ('ন', '।') ('▁', 'র') ('ব', 'য') ('ছ', '।') ('ও', 'য') ('অ', 'ন') ('য', 'ছ') ('আ', 'ম') ('য', 'য') ('র', 'পর') ('দ', 'য') ('য', '।') ('ন', ',') 

In [29]:
# Split every token into characters (special tokens skipped, '▁' removed).
char_corpus = form_char_corpus_from_token(token_list_corpus)

In [30]:
# Bi-gram characters: rank adjacent character pairs and show the 20 most frequent.
keys_list, sorted_value_index = form_bigram_freq(char_corpus)
print('The top 20 Frequent Characters:')
# Ascending index array, so take the tail and walk it backwards.
for rank_idx in reversed(sorted_value_index[-20:]):
    print(keys_list[rank_idx], end = ' ')

The top 20 Frequent Characters:
('ক', 'র') ('প', 'র') ('ত', 'র') ('র', 'ক') ('র', 'ব') ('র', 'স') ('র', 'ম') ('ব', 'র') ('ন', 'ত') ('র', 'ত') ('ন', 'য') ('ন', 'র') ('র', 'প') ('দ', 'র') ('ম', 'ন') ('জ', 'ন') ('র', 'র') ('র', 'ন') ('ব', 'ল') ('ন', 'ক') 

## Question 5:

In [12]:
# Tokenize the sentences of the assignment file with the shared `tokenizer`.
# NOTE(review): this comment used to say "mBERT", but the only tokenizer
# created in the visible notebook is loaded from "ai4bharat/indic-bert" —
# confirm which model was intended.
max_len = 1000
token_list_1k = []
with open('cs689_assignment.txt', 'r', encoding='utf-8') as f:
    # One pass over the file; the global `n_lines` is deliberately left alone
    # so it keeps describing 'bn_100.txt' for the cells that read that file.
    for line_no, line in enumerate(f):
        if line_no % 3 == 0:
            # Only every third line (0, 3, 6, ...) is a sentence to tokenize.
            token_list_1k.append(tokenize_text(tokenizer, line, max_len))

In [13]:
# Show the token list built for the first tokenized sentence as a quick check.
print(token_list_1k[0])

['[CLS]', '▁1.', '▁কন', 'ত', '▁স', 'টর', '▁গ', 'য', '▁হত', '▁দ', 'য', '▁তন', '▁আত', 'ক', '▁ওঠ', 'ন', '।', '▁', '[SEP]']


In [14]:
# Precision, recall and F1 for a single line
def metric_per_line(output_token, ground_truth_token):
    """Score one prediction against one reference using set overlap.

    Both arguments are converted with ``set()``, so duplicates and order are
    ignored (and a plain string is treated as the set of its characters).

    Args:
        output_token: iterable of predicted items.
        ground_truth_token: iterable of reference items.

    Returns:
        ``(precision, recall, f1_score)``. A score whose denominator would be
        zero is reported as 0, and F1 is 0 unless both precision and recall
        are positive.
    """
    predicted = set(output_token)
    reference = set(ground_truth_token)
    hits = len(predicted & reference)  # true positives

    # |predicted| = TP + FP and |reference| = TP + FN, so the set sizes are
    # the two denominators.
    precision = hits / len(predicted) if predicted else 0
    recall = hits / len(reference) if reference else 0

    if precision > 0 and recall > 0:
        # Harmonic mean of precision and recall.
        f1_score = 2 / ((1 / precision) + (1 / recall))
    else:
        f1_score = 0

    return (precision, recall, f1_score)

def mean_metric_calculate(output_tokens, ground_truth_txt_filename = 'cs689_assignment.txt'):
    """Average the per-line precision, recall and F1 over a whole file.

    The ground-truth file is read in groups of three lines; the second line
    of each group is scored against the matching entry of ``output_tokens``.

    Args:
        output_tokens: list with one token list per scored sentence.
        ground_truth_txt_filename: path of the three-lines-per-record file.

    Returns:
        ``(mean_precision, mean_recall, mean_f1_score)``. An empty
        ``output_tokens`` yields ``(0.0, 0.0, 0.0)`` instead of dividing by zero.
    """
    n_lines = len(output_tokens) # Num of Lines
    if n_lines == 0:
        # Nothing to score: avoid ZeroDivisionError and skip the file entirely.
        return (0.0, 0.0, 0.0)

    sum_precision = sum_recall = sum_f1_score = 0
    # UTF-8 is pinned so decoding does not depend on the platform locale.
    with open(ground_truth_txt_filename, 'r', encoding='utf-8') as f:
        for predicted in output_tokens:
            f.readline()                 # first line of the record: not scored
            ground_truth = f.readline()  # second line of the record: the labels
            f.readline()                 # third line of the record: not scored
            # NOTE(review): `ground_truth` is the raw line, so metric_per_line
            # compares the predicted tokens against the set of its characters
            # (newline included). If the labels are meant to be token-level,
            # the line needs to be split first — confirm the file format.
            (t_p, t_r, t_f) = metric_per_line(predicted, ground_truth)
            sum_precision += t_p
            sum_recall += t_r
            sum_f1_score += t_f

    return (sum_precision / n_lines, sum_recall / n_lines, sum_f1_score / n_lines)
                        

In [15]:
# Average precision / recall / F1 over all tokenized sentences.
# NOTE(review): `preision` is a misspelling of `precision`; the print cell
# that follows reads the same name, so rename both together.
(preision, recall, f1_score) = mean_metric_calculate(token_list_1k)

In [16]:
# Report the averaged scores (`preision` is the misspelled name bound by the
# cell that calls mean_metric_calculate).
print(f'Precision: {preision}, Recall: {recall}, F1 Score: {f1_score}' )

Precision: 0.23911910735306321, Recall: 0.22415086687487434, F1 Score: 0.2304098277473331
