### Normalize Product Names and Compute Their TF-IDF Weights

In [1]:
import json
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
import os
import json
import pickle
import re
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text  import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

def get_tfidf(product_details):
    """Clean product names and return the TF-IDF weights of the first one.

    Each name is lower-cased and lemmatised with the module-level spaCy
    pipeline ``nlp``.  Non-word characters inside a lemma are replaced by
    spaces, and a lemma is dropped when it ends in "ml" or "ms", is purely
    numeric, is in ``stop_words``, or is shorter than three characters
    ("uv" is the only two-letter lemma that is kept).  Names that end up
    empty after cleaning are discarded.

    A ``TfidfVectorizer`` is fitted on all cleaned names, but only the row
    belonging to the first cleaned name is returned.
    NOTE(review): returning just the first document's weights looks
    unintended (terms that only occur in later names get weight 0) --
    confirm before relying on it.

    Args:
        product_details: Iterable of product-name strings (for example a
            pandas Series).  Non-string entries such as NaN are skipped.

    Returns:
        A DataFrame with the columns "index" (vocabulary term) and "tfidf"
        (weight in the first cleaned name), sorted by descending weight.
        The frame is empty when no name survives cleaning.
    """
    clean_product = []
    for name in product_details:
        # Missing values arrive as float NaN and have no .lower().
        if not isinstance(name, str):
            continue

        kept = []
        for token in nlp(name.lower()):
            lemma = re.sub(r'\W', ' ', token.lemma_).strip()
            # Drop unit-like suffixes, bare numbers and stop words.
            if lemma.endswith(("ml", "ms")) or lemma.isdigit() or lemma in stop_words:
                continue
            if len(lemma) > 2 or lemma == 'uv':
                kept.append(lemma.lower())

        if kept:
            clean_product.append(" ".join(kept))

    # TfidfVectorizer raises ValueError on an empty corpus; hand back an
    # empty frame with the usual columns instead.
    if not clean_product:
        return pd.DataFrame({
            "index": pd.Series(dtype=object),
            "tfidf": pd.Series(dtype=float),
        })

    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(clean_product)
    first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on versions that predate get_feature_names_out().
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = tfidf_vectorizer.get_feature_names_out()
    else:
        terms = tfidf_vectorizer.get_feature_names()

    df = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=terms,
                      columns=["tfidf"])
    # reset_index() turns the term index into the "index" column.
    df = df.sort_values(by=["tfidf"], ascending=False).reset_index()

    return df

### Network Theory

In [None]:
# Build an undirected graph with the layers
#   main category -- sub category 1 -- sub category 2 -- product-name term
# Category edges have weight 1.0; term edges carry the term's TF-IDF weight.
G = nx.Graph()
entries = os.listdir('dataset/updated/') ## Please Change Directory
main_category_list = []
sub_category_list = []
sub_category_list_2 = []
product_name_list = []

# NOTE(review): `category` is not defined anywhere in the visible cells; it
# presumably comes from an earlier cell or a removed loop over `entries` --
# confirm before running.
csv_path = 'dataset/{}.csv'.format(category)  ## Please Change Directory
try:
    # pandas >= 1.3: replacement for the removed error_bad_lines=False
    # (malformed rows are skipped with a warning).
    dataset = pd.read_csv(csv_path, on_bad_lines='warn')
except TypeError:
    # Older pandas does not know on_bad_lines.
    dataset = pd.read_csv(csv_path, error_bad_lines=False)
dataset['Main Category'] = dataset['Main Category'].str.lower()
dataset['Sub Category 1'] = dataset['Sub Category 1'].str.lower()
dataset['Sub Category 2'] = dataset['Sub Category 2'].str.lower()

main_category = dataset['Main Category'].unique()
for _main_category in main_category:
    # unique() also yields NaN for missing categories; skip those.
    if not isinstance(_main_category, str):
        continue

    print(_main_category)
    main_category_data = dataset[dataset['Main Category'] == _main_category]
    G.add_node(_main_category)
    main_category_list.append(_main_category)

    for row in main_category_data['Sub Category 1'].unique():
        if not isinstance(row, str):
            continue

        # Use the stripped label everywhere so the node added here is the
        # same node the sub-category-2 edges attach to below.
        sub_category_1 = row.strip()
        sub_category_list.append(sub_category_1)
        G.add_edge(sub_category_1, _main_category, weight=1.0)

        # Filtering still uses the raw value, which is what the column holds.
        sub_category_2_values = main_category_data.loc[
            main_category_data['Sub Category 1'] == row, 'Sub Category 2'
        ].unique()
        for row2 in sub_category_2_values:
            if not isinstance(row2, str):
                continue

            sub_category_2 = row2.strip()
            G.add_edge(sub_category_2, sub_category_1, weight=1.0)
            sub_category_list_2.append(sub_category_2)

            tfidf_result = get_tfidf(
                main_category_data.loc[
                    main_category_data['Sub Category 2'] == row2, 'Product Name'
                ]
            )

            # Link every term with a positive weight to its sub category 2,
            # except terms that collide with a main-category name.
            for term, weight in zip(tfidf_result['index'], tfidf_result['tfidf']):
                if float(weight) > 0.0 and term not in main_category_list:
                    product_name_list.append(term.lower())
                    G.add_edge(term.lower(), sub_category_2, weight=weight)

### Save the Network Graph to a Pickle File

In [None]:
import pickle

# Serialize the graph G so later sessions can reload it without rebuilding.
with open('network_theory.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)

In [None]:
# Betweenness centrality of every node in G, as a node -> score mapping.
BETWEENNESS_CENTRALITY = nx.betweenness_centrality(G)

# Persist the centrality mapping itself; the graph G is saved separately.
# The file name (including its spelling) is kept unchanged so existing
# readers of the file still find it.
with open('betweeness_centrality.pickle','wb') as fe_data_file:
     pickle.dump(BETWEENNESS_CENTRALITY, fe_data_file)

### Read Pickle File

### Get Neighbors and Nodes

In [None]:
# Map each wishlist word that is a node of G to one neighbouring
# "Sub Category 2" node, then collect the products of the chosen categories.
# Relies on names defined outside this cell: given_wishlist,
# between_centrality_json, overall_data, G and sub_category_list_2.
nlp = spacy.load("en_core_web_sm")
doc = nlp(given_wishlist.strip())

result_categories = []
# Set copy for constant-time membership tests inside the loop.
sub_category_2_nodes = set(sub_category_list_2)

for token in reversed(doc):
    if token.text not in G:
        continue

    print(token.lemma_)
    closeness_centrality_list = []
    betweness_centrality_list = []
    degree_list = []
    neighbor_list = []
    shortest_path_list = []
    length_list = []

    # Paths are measured to the lemma, but only token.text is known to be a
    # node; fall back to the text so a missing lemma does not raise
    # NodeNotFound.
    path_target = token.lemma_ if token.lemma_ in G else token.text

    for _neighbors in G.neighbors(token.text):
        if _neighbors not in sub_category_2_nodes:
            continue
        neighbor_list.append(_neighbors)
        betweness_centrality_list.append(between_centrality_json[_neighbors])
        shortest_path = nx.shortest_path(G, source=_neighbors, target=path_target)
        # Path length counted in nodes, not edges.
        shortest_path_list.append(len(shortest_path))
        length_list.append(overall_data.loc[overall_data['Sub Category 2'] == _neighbors].shape[0])

    network_result = pd.DataFrame(neighbor_list, columns=['neighbor'])
    network_result['betweeness_centrality'] = betweness_centrality_list
    network_result['shortest_path'] = shortest_path_list

    if not neighbor_list:
        continue

    # Prefer the unique closest neighbour; when several share the minimum
    # distance, take the neighbour with the lowest betweenness centrality.
    closest = network_result.loc[
        network_result['shortest_path'] == min(shortest_path_list), 'neighbor'
    ]
    if closest.shape[0] < 2:
        chosen = closest.iloc[0]
    else:
        lowest_centrality = network_result.loc[
            network_result['betweeness_centrality'] == min(betweness_centrality_list),
            'neighbor',
        ]
        chosen = lowest_centrality.iloc[0]

    if chosen not in result_categories:
        result_categories.append(chosen)

# Graph labels are lower-cased; .title() is applied to match the casing used
# when filtering overall_data (presumably title-cased -- TODO confirm).
merge_products = [
    overall_data.loc[overall_data['Sub Category 2'] == _result_categories.title()]
    for _result_categories in result_categories
]

if merge_products:
    selected_category = pd.concat(merge_products).reset_index()
else:
    # pd.concat([]) raises ValueError; use an empty frame with the same
    # columns when no wishlist word matched a category.
    selected_category = overall_data.iloc[0:0].reset_index()

### Getting Relevance Per Products

In [None]:
from sklearn.neighbors import NearestNeighbors

# Rank the products of the selected categories by TF-IDF distance to the
# wishlist text, nearest first.
sample_wishlist = ['jogger pants']

vectorize = TfidfVectorizer(stop_words='english')
tfidf_response = vectorize.fit_transform(selected_category['Product Name'])

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on versions that predate get_feature_names_out().
if hasattr(vectorize, "get_feature_names_out"):
    feature_names = vectorize.get_feature_names_out()
else:
    feature_names = vectorize.get_feature_names()

# toarray() gives a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn estimators reject.
dtm = pd.DataFrame(tfidf_response.toarray(), columns=feature_names)

# Ask for every product so the result is a full ranking.
nn = NearestNeighbors(n_neighbors=selected_category.shape[0])
nn.fit(dtm)

new = vectorize.transform(sample_wishlist)
# Same column labels as the training frame, so feature names line up.
query = pd.DataFrame(new.toarray(), columns=feature_names)
knn_model_result = nn.kneighbors(query)

# kneighbors returns (distances, positions), one row per query.
distances = knn_model_result[0][0]
neighbor_positions = knn_model_result[1][0]

knn_result = pd.DataFrame(distances, columns=['Distance'])
# Assign by position and strip the index: assigning the label-indexed Series
# would realign on index labels and pair each distance with the wrong
# product name.
knn_result["Product Name"] = selected_category['Product Name'].iloc[neighbor_positions].values

merged_result = pd.merge(selected_category, knn_result, on='Product Name', how='inner')
merged_result = merged_result.drop_duplicates(subset='Product Name', keep="first")