In [23]:
# Import all the dependencies
import string
from tqdm.auto import tqdm
import json
import os

In [24]:
# Set file locations to your datasets
path = os.path.dirname(os.getcwd())
fileName = os.path.join(path, 'stemmingbengaliwords','Datasets','smallSample.tsv')

In [25]:
# Noise cancellation
# Remove all the non-bengali characters
def RemoveNoiseFromDictionary(fileName):
    dictionary = {}
    with open(fileName,"r") as rawFile:
        lines = rawFile.readlines()
        for line in tqdm(lines):
            if line[0] != "#"and line[0] not in string.ascii_letters and line.strip() :
                dictionary[line.split()[0] ]= []
    return dictionary


We can verify whether a `StringB` is a valid suffix of `StringA` by implementing a logic like this :
`StringA[len(StringA)-len(StringB):] == StringB`

Which as same as leveraging the below algorithm:

```Python
def GetLCSLength(stringA, stringB, stringALength, stringBlength):
    if (stringALength == 0 or stringBlength == 0):
        return 0
    if (stringA[stringALength-1] == stringB[stringBlength-1]):
        return 1 + GetLCSLength(stringA, stringB, stringALength-1, stringBlength-1)
    else:
        return max(GetLCSLength(stringA, stringB, stringALength, stringBlength-1), GetLCSLength(stringA, stringB, stringALength-1, stringBlength))

def IsSuffix(stringA, stringB,stringALength, stringBlength):
    return True if (GetLCSLength(stringA, stringB, stringALength, stringBlength)== stringBlength) else False
```


In [26]:
# A function to generate iterations
## Using a dictionary to avoid duplication entry and for O(c) access
def GenerateIteration(dictionary):
    iteration = dictionary
    # Append origin words to the trailing strings' list
    # Bruteforce approach is required because we are considering every word as an trailing subsequence
    # This is the 1st iteration
    for stringA in tqdm(iteration.keys()):
        for stringB in iteration.keys():
            if stringA != stringB and stringB[len(stringB)-len(stringA):] == stringA:
                iteration[stringA].append(stringB[:len(stringB)-len(stringA)])
    return iteration


In [28]:
# Removes unnecessary entries
def RemoveUnnecessaryEntries(dictionary):
    ## Don't stem for alphabets
    ## Don't keep empty sets for a key
    for key,value in tqdm(dictionary.copy().items()):
        if len(key) <= 2 or len(value) == 0:
            dictionary.pop(key)

# Generates JSON file for saving the output
## Don't add file extension
def GenerateJSON(dictionary, fileName):
    ## save the output as json
    fileName = os.path.join(path, 'stemmingbengaliwords','Output',fileName+'.json')
    with open(fileName, 'w') as jsonObject:
        json.dump(dictionary, jsonObject, ensure_ascii=False)

# Generates the reverse of a dictionary
def GenerateReversedDictionary(dictionary):
    dictionaryReversed = {}
    for key in tqdm(dictionary.keys()):
        for value in dictionary[key]:
            if value not in dictionaryReversed:
                dictionaryReversed[value] = [key]
            else:
                dictionaryReversed[value].append(key)
    return dictionaryReversed

# Generates the origin word - derived word dictionary
def GenerateWordMap(dictionary):
    wordMap = {}
    for key,value in tqdm(dictionary.items()):
        words = []
        for x in value:
            derivedWord = key + x;
            words.append(derivedWord)
        for word in words:
            wordMap[key].append(word)
    return wordMap

What will be doing in out 1st iteration is simply taking out the trailing strings. It should look something like this :

|Key (Trailing Substring)  |Value (Origin Words)   |
|:---|:---|
|কলা   | {চারু,কারু,চারুকারু}  |
|অর্ডার  | {টপ, ডিজ, ডিস, মিডল}  |


In [29]:

## This dictionary will hold the first iteration subsequences as the keys and the extensions as values
#
cleanData = RemoveNoiseFromDictionary(fileName)
firstIteration = GenerateIteration(cleanData)
firstIteration = RemoveUnnecessaryEntries(firstIteration)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [30]:
cleanData

{'কলা': ['চারু', 'কারু', 'চারুকারু'], 'কারুকলা': ['চারু']}

In [31]:
firstIteration

In [20]:

firstIteration = RemoveUnnecessaryEntries(firstIteration)
GenerateJSON(firstIteration, "firstIteration")
firstIteration

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [21]:
firstIteration

In [22]:
# Now let us construct the word map from first iteration
reversedFirstIteration = GenerateReversedDictionary(firstIteration)
GenerateJSON(reversedFirstIteration,'reversedFirstIteration')
wordsMap = GenerateWordMap(reversedFirstIteration)
wordsMap

AttributeError: 'NoneType' object has no attribute 'keys'