### Importing Libraries

In [8]:
import itertools 
import json
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import scipy
import seaborn as sns
import warnings
from scipy import stats
from datetime import timedelta
import nltk
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
import string
import re
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rochitranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rochitranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Pre-processing

In [9]:
# Load the product listing from the 'Sheet1' worksheet of the workbook.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
kristal = pd.read_excel(
    'C:/Users/rochitranjan/Documents/kristal/Kristal.xlsx',
    sheet_name='Sheet1',
)

def find_pattern(patterns, text):
    """Return every match of the regular expression `patterns` found in `text`.

    Parameters
    ----------
    patterns : str or compiled pattern
        Regular expression handed to ``re.findall``.
    text : str
        Text to scan.

    Returns
    -------
    list or str
        The list produced by ``re.findall`` when there is at least one match,
        otherwise the sentinel string ``'Not Found!'``. Callers must check the
        type before indexing, because indexing the sentinel yields a character.
    """
    # Scan once and reuse the result instead of evaluating the regex twice.
    matches = re.findall(patterns, text)
    return matches if matches else 'Not Found!'

# English stop words. They are read through `nltk.corpus` rather than through the
# imported `stopwords` name so that this cell can be re-run: after the first run
# `stopwords` is rebound to a plain list, which has no `.words` method.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)  # constant-time membership tests while filtering tokens

# ASSUMPTION: when a product lists several prices in the 'product_price' column, the first one is the right price.
# This can be enhanced further with IOB labelling to identify the selling price and the list price.
PRICE_PATTERN = r"\$([0-9,]*\.[0-9]*)"


def _first_price(raw_price):
    """Return the first dollar amount in `raw_price` without thousands separators, or None."""
    matches = find_pattern(PRICE_PATTERN, str(raw_price))
    if isinstance(matches, list):
        return matches[0].replace(',', '')
    # find_pattern returned its 'Not Found!' sentinel; indexing it would yield 'N'.
    return None


def _title_to_tokens(title):
    """Turn a lower-cased product title into its list of cleaned tokens."""
    tokens = word_tokenize(str(title))
    # Drop tokens that are a substring of string.punctuation, then stop words.
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token not in stopword_set]
    # Strip punctuation left inside tokens, then digits, re-tokenizing after each pass.
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', ' '.join(tokens)))
    tokens = word_tokenize(re.sub(r'[0-9]+', '', ' '.join(tokens)))
    # Single-character leftovers carry no search value.
    return [token for token in tokens if len(token) > 1]


# Prices are stored as floats so that sorting by price is numeric rather than
# lexicographic; rows without a parsable price become NaN.
kristal['price'] = pd.to_numeric(kristal['product_price'].apply(_first_price), errors='coerce')
kristal = kristal.drop(columns=['product_price'])

kristal['product_title'] = kristal['product_title'].apply(lambda x: str(x).lower())
kristal['tokens'] = kristal['product_title'].apply(_title_to_tokens)

# Flatten the per-product token lists in linear time (sum(lists, []) is quadratic).
vocab = list(itertools.chain.from_iterable(kristal['tokens']))

# The context manager closes the file; no explicit close() is needed afterwards.
with open('C:/Users/rochitranjan/Documents/kristal/vocab.txt', 'w', encoding="utf-8") as fp:
    fp.write(str(vocab))

In [10]:
# Plain-dict view of how many times each token occurs in `vocab`.
vocab_dct = {token: occurrences for token, occurrences in Counter(vocab).items()}

### Find Products in the Dataframe

`Run the below function to find products in the Dataframe if the search token is present in the vocab.txt file`

In [34]:
def find_products(search_item, df_inp):
    """Return the best-matching product row for a single search token.

    A product matches when the lower-cased `search_item` occurs in its
    'tokens' list. Matches are ranked by the number of occurrences (most
    first) and then by price (cheapest first).

    Parameters
    ----------
    search_item : str
        Search token; compared case-insensitively against the tokens.
    df_inp : pandas.DataFrame
        Frame with a 'tokens' column (a list of str per row) and a 'price'
        column. It is not modified.

    Returns
    -------
    pandas.Series or None
        The top-ranked row, with an extra 'count' entry holding the number of
        occurrences, or None when no row contains the token.
    """
    search_item = search_item.lower()
    # Counting directly in the frame makes a separate vocabulary lookup
    # unnecessary: a token absent from every row simply yields all-zero counts.
    counted = df_inp.assign(
        count=df_inp['tokens'].apply(lambda tokens: tokens.count(search_item))
    )
    matches = counted[counted['count'] >= 1]
    if matches.empty:
        return None

    # Rank on a numeric view of the price: prices stored as text would
    # otherwise sort lexicographically (e.g. '14.99' before '3.61').
    price_rank = pd.to_numeric(
        matches['price'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )
    ranked = matches.assign(_price_rank=price_rank).sort_values(
        by=['count', '_price_rank'], ascending=[False, True]
    )
    # The helper column is dropped so the returned row keeps its original shape.
    return ranked.drop(columns='_price_rank').iloc[0]

`Run the below function to clean search token if the search token is NOT present in the vocab.txt file`

In [32]:
def clean_search_item(search_item):
    """Return `search_item` with every punctuation character and digit removed.

    Parameters
    ----------
    search_item : str
        Raw search token.

    Returns
    -------
    str
        The token without any character from ``string.punctuation`` or
        ``string.digits``.
    """
    # Digits must be compared as characters: the integers 0-9 never equal the
    # characters '0'-'9', so an int-based filter leaves every digit in place.
    unwanted = string.punctuation + string.digits
    return search_item.translate(str.maketrans('', '', unwanted))

`Run the functions below to apply fuzzy matching when the search token is NOT present in the vocab.txt file. The logic suggests a corrected word by generating candidates that are one or two edits away from the original search token and choosing among them by their probability of occurrence in the vocab.txt file.`

In [51]:
def edits_one(word):
    "Build the set of every string exactly one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the very end.
        for letter in letters:
            candidates.add(head + letter + tail)
        if not tail:
            continue
        # Deletion of the character right after the cut.
        candidates.add(head + tail[1:])
        # Replacement of that character by each letter.
        for letter in letters:
            candidates.add(head + letter + tail[1:])
        # Transposition of the two characters right after the cut.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

def edits_two(word):
    "Lazily yield every string reachable from `word` by two successive edits."
    first_pass = edits_one(word)
    # Duplicates are possible; callers that need uniqueness collect into a set.
    return (twice for once in first_pass for twice in edits_one(once))

def known(words):
    "Keep only the entries of `words` that are present in `all_words`."
    return {candidate for candidate in words if candidate in all_words}

def possible_corrections(word):
    "Return candidate spellings for `word`, preferring the smallest edit distance."
    # Each tier is computed only when every closer tier came back empty.
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits_one(word))
    if one_edit:
        return one_edit
    two_edits = known(edits_two(word))
    if two_edits:
        return two_edits
    # Nothing known within two edits: fall back to the word itself.
    return [word]

def prob(word, N=None):
    """Probability of `word`: its count in `all_words` divided by the total count.

    Parameters
    ----------
    word : str
        Word to look up in `all_words`.
    N : int, optional
        Total number of tokens. When omitted it is computed from `all_words`
        at call time.

    Returns
    -------
    float
        ``all_words[word] / N``, or 0.0 when the total is zero.
    """
    # A default of sum(all_words.values()) would be evaluated once, when the
    # function is defined: it fails if `all_words` does not exist yet and goes
    # stale if `all_words` is rebuilt afterwards.
    if N is None:
        N = sum(all_words.values())
    if not N:
        return 0.0
    return all_words[word] / N


def spell_check(word):
    "Return the candidate from `possible_corrections(word)` with the highest `prob`."
    return max(possible_corrections(word), key=prob)

### Test Case 1: When search token does not have punctuation and it is NOT in the VOCAB.TXT file

In [16]:
# Word-frequency table consulted by known() and prob().
all_words = Counter(vocab)

# Misspelled query; the cell displays its candidate corrections.
search_item = 'plstation'.lower()
possible_corrections(search_item)

{'paystation', 'plantation', 'playstation', 'station'}

In [69]:
# Spell-correct the query, then look up the best-matching product.
correct_word = spell_check('plstation')
# spell_check falls back to the input word when nothing better is known, so the
# lookup itself can still come back empty; guard on its result as well.
result = find_products(correct_word, kristal) if correct_word is not None else None
if result is not None:
    print(result['product_title'])
    print(result['price'])
else:
    print('No match Found')

the unofficial playstation handbook : a guide to using playstation 4, playstation tv, and playstation 3 (paperback)
14.99


### Test Case 2: When the search token contains punctuation and is NOT in the VOCAB.TXT file

In [68]:
# Strip punctuation from the query before spell-correcting it, then look up the
# best-matching product.
correct_word = spell_check(clean_search_item('anti-dark'))
result = find_products(correct_word, kristal) if correct_word is not None else None
# find_products returns None when no product contains the token.
if result is not None:
    print(result['product_title'])
    print(result['price'])
else:
    print('No match Found')

24k gold ballpoint eye cream antidark circle aging wrinkle collagen essence care
3.61


FUTURE WORK: 
1. The search can be enhanced to support multi-word queries and to incorporate the ordering of the words in the search token.
2. A similarity ratio (e.g. Levenshtein similarity ratio, Jaccard similarity) can also be used to enhance the spell-correction feature of the proposed solution.
3. NER techniques can be utilised to enhance the price extraction feature of the proposed solution.