In [1]:
import pandas as pd
import numpy as np 

import warnings
warnings.filterwarnings('ignore')



In [2]:
from collections import defaultdict
from typing import List

class Tokenizer:
    def __init__(self):
        self.vocab = set()
        self.vocab.add("$")
        self.frequency_corpus = defaultdict(int)
        self.merge_rules = []

    def learn_vocabulary(self, corpus: List[str], num_merges: int):
        # Initialize vocabulary with characters and their frequencies
        for sentence in corpus:
            for word in sentence.split():
#                 print("word is = ", word)
                modified_word = ""
                for char in word:
#                     print("char is = ", char)
                    self.vocab.add(char)
                    modified_word = modified_word + char + "-"
                modified_word = modified_word + "$"
                self.frequency_corpus[modified_word] += 1

        for merge in range(num_merges):
            pair_frequency = defaultdict(int)
            for word in self.frequency_corpus:
                symbols = word.split("-")
                for i in range(len(symbols) - 1):
                    pair_frequency[(symbols[i], symbols[i + 1])] += 1

            if not pair_frequency:
                break

            best_pair = max(pair_frequency, key=pair_frequency.get)
            self.vocab.add(best_pair[0]+best_pair[1])
            
            if best_pair not in self.merge_rules:
                self.merge_rules.append(best_pair)

            new_frequency_corpus = defaultdict(int)
            for word in self.frequency_corpus:
                new_word = word.replace(best_pair[0]+"-"+best_pair[1], best_pair[0]+best_pair[1])
                new_frequency_corpus[new_word] = self.frequency_corpus[word]

            self.frequency_corpus = new_frequency_corpus
            

    def tokenize(self, sample: str) -> List[str]:
        actual_tokens = []
        intermediate_tokens = set()

        sample_list = sample.split()

        for word in sample_list:
            word = word+"$"

            wordpart_list = []
            for letter in word:
                wordpart_list.append(letter)
                intermediate_tokens.add(letter)

            for rule in self.merge_rules:
                i=0
                while i<len(wordpart_list)-1:
                    if rule[0]+rule[1] == wordpart_list[i]+wordpart_list[i+1]:
                        intermediate_tokens.add(wordpart_list[i]+wordpart_list[i+1])
                        wordpart_list[i] = wordpart_list[i]+wordpart_list[i+1]
                        del wordpart_list[i+1]
                    i = i+1

            for subpart in wordpart_list:
                actual_tokens.append(subpart)
#                 if subpart not in actual_tokens:
#                     actual_tokens.append(subpart)

        print(intermediate_tokens)
        return actual_tokens
                


# # Example Usage
# corpus = ["hello", "world", "he", "she", "hers"]
# tokenizer = Tokenizer()
# tokenizer.learn_vocabulary(corpus, num_merges=10)
# sample_text = "helloworld"
# tokens = tokenizer.tokenize(sample_text)
# print(tokens)

In [3]:
# Read corpus from a text file
corpus_file_path = r"C:\Users\shree\OneDrive\Desktop\IIITD Sem 4\NLP\A1\corpus.txt"
with open(corpus_file_path, 'r', encoding='utf-8') as file:
    corpus = file.read().splitlines()

tokenizer = Tokenizer()
tokenizer.learn_vocabulary(corpus, 10)

In [7]:
tokenizer.tokenize("i love nlp")

{'i', 'l', 'e', 'v', 'n', 'e$', 'p', '$', 'o'}


['i', '$', 'l', 'o', 'v', 'e$', 'n', 'l', 'p', '$']

In [8]:
tokenizer.vocab

{'$',
 'a',
 'b',
 'c',
 'd',
 'd$',
 'e',
 'e$',
 'ed$',
 'er',
 'f',
 'g',
 'h',
 'i',
 'in',
 'ing',
 'ing$',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'on',
 'p',
 'q',
 'r',
 's',
 's$',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'y$',
 'z'}

In [9]:
tokenizer.merge_rules

[('s', '$'),
 ('i', 'n'),
 ('e', '$'),
 ('d', '$'),
 ('e', 'r'),
 ('e', 'd$'),
 ('in', 'g'),
 ('y', '$'),
 ('ing', '$'),
 ('o', 'n')]

In [10]:
tokenizer.frequency_corpus

defaultdict(int,
            {'i-$': 3789,
             's-t-a-n-d$': 10,
             'h-er-e$': 37,
             'f-e-e-l-$': 1637,
             'e-m-p-t-y$': 9,
             'a-$': 916,
             'c-l-a-s-s$': 13,
             'p-o-s-t-$': 14,
             'c-o-u-n-t-$': 4,
             'l-in-k-$': 2,
             'h-r-e-f-$': 25,
             'h-t-t-p-$': 30,
             'm-o-o-s-h-i-l-u-$': 1,
             'l-i-t-er-a-l-l-y$': 4,
             'j-u-s-t-$': 226,
             't-e-x-t-$': 3,
             't-y-c-h-e-l-l-e$': 1,
             't-o-$': 1340,
             's-e-e$': 45,
             'i-f-$': 134,
             's-h-e$': 72,
             'w-a-n-t-s$': 7,
             'h-a-n-g-$': 1,
             'o-u-t-$': 113,
             'b-e-c-a-u-s-e$': 183,
             'r-e-a-d-ing$': 13,
             'w-h-a-t-$': 125,
             'w-r-o-t-e$': 4,
             'a-b-o-u-t-$': 298,
             'm-y$': 642,
             'n-on-e-x-i-s-t-e-n-t-$': 1,
             's-o-c-i-a-l-$': 7,


# Dry Run on independent set of merge_rules 

In [15]:
merge_rules = [('r', '$'), ('e', 'r$'), ('e', 'w'), ('n', 'ew'), ('l', 'o'), ('lo', 'w'), ('new', 'er$'), ('low', '$')] 


def tokenize(sample: str) -> List[str]:
    actual_tokens = []
    intermediate_tokens = set()

    sample_list = sample.split()

    for word in sample_list:
        word = word+"$"

        wordpart_list = []
        for letter in word:
            wordpart_list.append(letter)
            intermediate_tokens.add(letter)

        for rule in merge_rules:
            i=0
            while i<len(wordpart_list)-1:
                if rule[0]+rule[1] == wordpart_list[i]+wordpart_list[i+1]:
                    intermediate_tokens.add(wordpart_list[i]+wordpart_list[i+1])
                    wordpart_list[i] = wordpart_list[i]+wordpart_list[i+1]
                    del wordpart_list[i+1]
                i = i+1
        
        print("wordpart_list = ", wordpart_list)
        for subpart in wordpart_list:
            actual_tokens.append(subpart)
        print("actual_tokens list = ", actual_tokens)
#             if subpart not in actual_tokens:
#                 actual_tokens.append(subpart)
        
    print(intermediate_tokens)
    return actual_tokens

In [17]:
tokenize('i love nlp')

wordpart_list =  ['i', '$']
actual_tokens list =  ['i', '$']
wordpart_list =  ['lo', 'v', 'e', '$']
actual_tokens list =  ['i', '$', 'lo', 'v', 'e', '$']
wordpart_list =  ['n', 'l', 'p', '$']
actual_tokens list =  ['i', '$', 'lo', 'v', 'e', '$', 'n', 'l', 'p', '$']
{'i', 'l', 'e', 'lo', 'v', 'n', 'p', '$', 'o'}


['i', '$', 'lo', 'v', 'e', '$', 'n', 'l', 'p', '$']