In [169]:
# Import all the dependencies
import string
from tqdm.auto import tqdm
import json
import os

In [170]:
# Dataset location, resolved relative to the parent of the current working directory
path = os.path.split(os.getcwd())[0]
fileName = os.path.join(
    path,
    'stemmingbengaliwords',
    'Datasets',
    'bengaliWords.tsv',
)

In [171]:
# Noise cancellation
# Drop blank lines, comment lines and entries that start with an ASCII letter
def RemoveNoiseFromDictionary(fileName):
    """Build the initial word dictionary from a raw word-list file.

    A line is skipped when it is blank, or when its first non-whitespace
    character is "#" or an ASCII letter. For every other line the first
    whitespace-separated token becomes a key mapped to a new empty list.

    Args:
        fileName: Path of the word list to read.

    Returns:
        dict mapping each kept token to an empty list.
    """
    dictionary = {}
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere. "utf-8-sig" also strips a leading BOM so
    # it cannot end up glued to the first token.
    # NOTE(review): assumes the word list is stored as UTF-8 - confirm.
    with open(fileName, "r", encoding="utf-8-sig") as rawFile:
        lines = rawFile.readlines()
    for line in tqdm(lines):
        tokens = line.split()
        if not tokens:
            # Blank or whitespace-only line
            continue
        # Test the first visible character so that indented comments and
        # indented ASCII entries are filtered as well.
        firstCharacter = tokens[0][0]
        if firstCharacter == "#" or firstCharacter in string.ascii_letters:
            continue
        dictionary[tokens[0]] = []
    return dictionary


We can verify whether `StringB` is a valid suffix of `StringA` with a simple slice comparison:
`StringA[len(StringA)-len(StringB):] == StringB`

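This is a much cheaper replacement for the recursive algorithm below. Note that the two are not strictly equivalent: the algorithm below accepts `StringB` whenever it is a subsequence of `StringA`, whereas the slice comparison accepts it only when it is a true suffix: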

```Python
def GetLCSLength(stringA, stringB, stringALength, stringBlength):
    """Return the length of the longest common subsequence of
    stringA[:stringALength] and stringB[:stringBlength].

    Plain recursion without memoisation: a mismatch spawns two further calls.
    """
    # An empty prefix has nothing in common with anything.
    if stringALength == 0 or stringBlength == 0:
        return 0
    lastOfA = stringA[stringALength - 1]
    lastOfB = stringB[stringBlength - 1]
    if lastOfA != lastOfB:
        # Drop the last character of either prefix and keep the better result.
        withoutLastB = GetLCSLength(stringA, stringB, stringALength, stringBlength - 1)
        withoutLastA = GetLCSLength(stringA, stringB, stringALength - 1, stringBlength)
        return max(withoutLastB, withoutLastA)
    # Matching last characters extend the common subsequence by one.
    return 1 + GetLCSLength(stringA, stringB, stringALength - 1, stringBlength - 1)

def IsSuffix(stringA, stringB,stringALength, stringBlength):
    """Return True when the longest common subsequence of the two prefixes
    is exactly stringBlength long, False otherwise."""
    commonLength = GetLCSLength(stringA, stringB, stringALength, stringBlength)
    return commonLength == stringBlength
```


In [172]:
# A function to generate iterations
## Using a dictionary to avoid duplicate entries and for O(1) average-case lookups
def GenerateIteration(dictionary):
    """Fill every key's list with the leading parts of the keys that end in it.

    For each pair of distinct keys (stringA, stringB) where stringB ends with
    stringA, the part of stringB that precedes stringA is appended to
    dictionary[stringA]. The dictionary is modified in place and nothing is
    returned.

    Args:
        dictionary: dict mapping each word (str) to a list.
    """
    # Rather than comparing every pair of keys (quadratic in the number of
    # words), cut each word at every position and look the trailing part up
    # in the dictionary. The appended values and their order are the same as
    # with the pairwise scan, because both visit the longer word in dictionary
    # order.
    for stringB in tqdm(dictionary.keys()):
        # cut starts at 1 so a word is never matched against itself
        for cut in range(1, len(stringB) + 1):
            stringA = stringB[cut:]
            if stringA in dictionary:
                dictionary[stringA].append(stringB[:cut])


In [173]:
# Prunes entries that are not worth keeping
def RemoveUnnecessaryEntries(dictionary):
    """Delete, in place, every entry whose key is shorter than two characters
    or whose value is empty. Nothing is returned."""
    # Loop over a snapshot so the dictionary itself can shrink meanwhile
    snapshot = dictionary.copy()
    for word, entries in tqdm(snapshot.items()):
        # Single characters are not stemmed and empty collections carry no data
        keepEntry = len(word) >= 2 and len(entries) != 0
        if not keepEntry:
            del dictionary[word]

# Generates JSON file for saving the output
## Don't add file extension
def GenerateJSON(dictionary, fileName):
    """Save `dictionary` as <path>/stemmingbengaliwords/Output/<fileName>.json.

    `path` is the module-level base directory defined in the configuration
    cell.

    Args:
        dictionary: JSON-serialisable object to write.
        fileName: Name of the output file, without the ".json" extension.
    """
    outputDirectory = os.path.join(path, 'stemmingbengaliwords', 'Output')
    # Create the output folder when it is missing instead of failing on open()
    os.makedirs(outputDirectory, exist_ok=True)
    outputFile = os.path.join(outputDirectory, fileName + '.json')
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must
    # be opened with an encoding that can represent them; the platform default
    # is not guaranteed to.
    with open(outputFile, 'w', encoding='utf-8') as jsonObject:
        json.dump(dictionary, jsonObject, ensure_ascii=False)

# Inverts a dictionary of lists
def GenerateReversedDictionary(dictionary):
    """Return a new dict that maps every list element of `dictionary` to the
    list of keys under which it appears, in iteration order."""
    dictionaryReversed = {}
    for source in tqdm(dictionary.keys()):
        for item in dictionary[source]:
            # First sighting creates the list, later ones extend it
            dictionaryReversed.setdefault(item, []).append(source)
    return dictionaryReversed

# Generates the origin word - derived word dictionary
def GenerateWordMap(dictionary):
    """Map every key to the strings formed by appending each of its values.

    Args:
        dictionary: Mapping from a word (str) to an iterable of strings that
            can follow it.

    Returns:
        dict mapping each key to the flat list [key + value, ...].
    """
    wordMap = {}
    for key, value in tqdm(dictionary.items()):
        words = [key + suffix for suffix in value]
        # extend (not append) keeps the list flat should a key ever be seen
        # more than once; append would nest a whole list as a single element.
        wordMap.setdefault(key, []).extend(words)
    return wordMap

What we will be doing in our 1st iteration is simply taking out the trailing strings. It should look something like this:

|Key (Trailing Substring)     |Value (Origin Words)     |
|:---|:---|
|কলা   | {চারু,কারু,চারুকারু}  |
|অর্ডার  | {টপ, ডিজ, ডিস, মিডল}  |

Then we will reverse it and join them:

|Key (Origin)    |Value (Trailing subsequence)     | Key + Value (Derivatives)   |
|:---|:---|:---|
|চারু| {কলা,কারুকলা}|{চারুকলা,চারুকারুকলা}|
|চারুকারু|{কলা}|{চারুকারুকলা}|
| ডিজ  | {অর্ডার}  |{ডিজঅর্ডার}|


In [174]:
# This is how we will generate an iteration
iteration1 = RemoveNoiseFromDictionary(fileName)
GenerateIteration(iteration1)
RemoveUnnecessaryEntries(iteration1)
GenerateJSON(iteration1,'firstIteration')
# Each stage is saved under its own name with its own data
reversedIteration1 = GenerateReversedDictionary(iteration1)
GenerateJSON(reversedIteration1,'firstIterationReversed')
iteration1WordMap = GenerateWordMap(reversedIteration1)
RemoveUnnecessaryEntries(iteration1WordMap)
GenerateJSON(iteration1WordMap,'firstIterationWordMap')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=65057.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60062.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60062.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7313.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23741.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23741.0), HTML(value='')))




In [175]:
# Display the word map built in the previous cell (the whole dictionary is shown)
iteration1WordMap

{'ডাই': ['ডাইঅক্সাইড',
  'ডাইনি',
  'ডাইনে',
  'ডাইভার',
  'ডাইমেনশন',
  'ডাইরেক্টর',
  'ডাইসে',
  'ডাইসের'],
 'সম': ['সমঅধিকার',
  'সমকক্ষ',
  'সমকাল',
  'সমকালকে',
  'সমকালীন',
  'সমকালে',
  'সমকালের',
  'সমকোণ',
  'সমকোণে',
  'সমতল',
  'সমতলে',
  'সমতলের',
  'সমতা',
  'সমতাও',
  'সমতার',
  'সমতুল্য',
  'সমপরিমাণ',
  'সমপরিমান',
  'সমপ্রতি',
  'সমপ্রসারণ',
  'সমবেত',
  'সমবেদনা',
  'সমবেদনার',
  'সমভাবে',
  'সমভূমি',
  'সমভূমির',
  'সমমনা',
  'সমমর্যাদা',
  'সমমর্যাদার',
  'সমমান',
  'সমমানের',
  'সমমূল্যের',
  'সময়',
  'সমরে',
  'সমরেশ',
  'সমশের',
  'সমসংখ্যক',
  'সমস্বরে',
  'সমস্যা',
  'সমস্যার',
  'সমস্যারও',
  'সমহারে'],
 'ফলো': ['ফলোঅন', 'ফলোঅনের', 'ফলোআপ', 'ফলোআপে'],
 'ডিজ': ['ডিজঅনার', 'ডিজঅর্ডার', 'ডিজনি'],
 'বক্স': ['বক্সঅফিসে'],
 'মিড': ['মিডঅফে',
  'মিডফিল্ড',
  'মিডফিল্ডার',
  'মিডফিল্ডারের',
  'মিডফিল্ডে',
  'মিডল্যান্ড'],
 'গণ': ['গণঅবস্থান',
  'গণঅভ্যুত্থানে',
  'গণঅভ্যুত্থানের',
  'গণআন্দোলন',
  'গণআন্দোলনের',
  'গণউপাসনালয়',
  'গণকবর',
  'গণকবরে',
  'গণকবরের',
  