In [28]:
from email import header
import numpy as np
import pandas as pd
import fasttext.util
from difflib import SequenceMatcher
import fasttext
%pip install fasttext

# +++ CONSTANTS +++
# TODO: update these paths if the model or words files are not in the root folder
# path of the fastText model passed to fasttext.load_model
MODEL_PATH = "model.bin"
# path of the input CSV (read without a header row; words come from column 0)
WORDS_PATH = "words.csv"
# path of the CSV the results are written to (no index, no header)
OUTPUT_PATH = "output.csv"
# a suggestion containing any of these characters is rejected by isWordLegit
ILLEGAL_CHARS = '0123456789$,.@-'

# +++ FUNCTIONS +++
def isWordLegit(word, tabu):
    """Tell whether the suggestion `tabu` is acceptable for `word`.

    A suggestion is accepted only when both conditions hold:
      * it contains none of the characters listed in ILLEGAL_CHARS
        (only `tabu` is inspected, `word` is not), and
      * its similarity to `word`, as computed by similar(), is below 0.7,
        i.e. it is not a near-duplicate spelling of the original word.

    Returns True when the suggestion is accepted, False otherwise.
    """
    forbidden = set(ILLEGAL_CHARS)
    # reject as soon as a digit or special character shows up
    for char in tabu:
        if char in forbidden:
            return False
    # otherwise the suggestion is legit only if it differs enough from the word
    return similar(word, tabu) < 0.7


def similar(a, b):
    """Return the difflib SequenceMatcher ratio of `a` and `b`.

    The result is a float between 0.0 (nothing in common) and 1.0
    (identical sequences). No junk heuristic is supplied.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()


# +++ MAIN +++
# load the model
ft = fasttext.load_model(MODEL_PATH)

# load the words: no header row, one word per line in the first column;
# malformed lines are reported as warnings and skipped
csvfile = pd.read_csv(WORDS_PATH, header=None, on_bad_lines='warn')
print("Read {} words from the input file".format(len(csvfile)))

# maps each lower-cased input word to the list of suggestions that were accepted
results = {}
# iterate through the words
for raw_word in csvfile[0]:
    # Skip empty cells. pd.isna() is used instead of `is np.nan` because a
    # missing value is not guaranteed to be the np.nan singleton (a numeric
    # column yields np.float64 NaN), and an identity check would let it through.
    if pd.isna(raw_word):
        continue
    # str() keeps non-text cells (e.g. numbers) from crashing on .lower()
    original_word = str(raw_word).lower()
    results[original_word] = []

    # use the model to get the most similar words (fastText returns up to
    # 10 (score, word) pairs by default)
    suggestions = ft.get_nearest_neighbors(original_word)

    # filter out the words that are not legit; iterate over what was actually
    # returned, since the model may yield fewer than 10 neighbours
    for _score, neighbour in suggestions:
        suggestion = neighbour.lower()
        if isWordLegit(original_word, suggestion):
            results[original_word].append(suggestion)

# organize the results for the dataframe: one row per word, followed by its suggestions
output_array = [[word, *accepted] for word, accepted in results.items()]
# sort the results by row length, longest first (words with no suggestions are at the end)
output_array = sorted(output_array, key=len, reverse=True)

# create the output file
output = pd.DataFrame(output_array)
output.to_csv(OUTPUT_PATH, index=False, header=False)
print("Done, the output is saved in {}".format(OUTPUT_PATH))


Note: you may need to restart the kernel to use updated packages.




Read 10 words from the input file
Done, the output is saved in output.csv
