<a href="https://colab.research.google.com/github/jagadish-samarla/Alternate-Groups/blob/main/product_alternatives.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Optional code for mounting Drive and changing working directory

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#import os
#os.chdir('/content/drive/MyDrive/Colab Notebooks/predicting_height_of_children')

### Run this code to install all Prerequisites

In [None]:
#!pip install -r requirements.txt

# Import Dependencies

In [None]:
import imageio
from io import BytesIO
import requests
from matplotlib import pyplot as plt
from urllib.request import urlopen
import json
import pandas as pd
import numpy as np
import urllib.request
import cv2
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.cluster import KMeans
import seaborn as sns
from sklearn.metrics import silhouette_score
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
import re
from math import ceil
import ast
from sklearn.metrics import calinski_harabasz_score, silhouette_score

In [None]:
def get_json_data_from_url(url, max_pages=19):
  '''Page through a store's product listing and collect the JSON of every page.

  Requests ``<url>/collections/all/products.json?page=N`` for N = 1..max_pages
  and stops early at the first page holding an empty value (no more products).

  Args:
    url: store domain URL; the listing path is appended to it verbatim.
    max_pages: upper bound on the number of pages requested, to keep the crawl
      short. Defaults to 19, the limit previously hard-coded here.

  Returns:
    dict mapping page number -> decoded JSON document of that page.
  '''
  data = {}
  print('Paginating through domain url')
  for page in tqdm(range(1, max_pages + 1)):
    page_url = url + '/collections/all/products.json?page={}'.format(page)
    # Release the HTTP connection as soon as the body has been read.
    with urlopen(page_url) as response:
      data_json = json.loads(response.read())
    for items in data_json.values():
      # An empty value means we have run past the last page of products.
      if len(items) == 0:
        return data
      data[page] = data_json
  return data

In [None]:
def get_json_prod_wise(json_data):
  '''Flatten the paginated JSON into one dict of products.

  Walks every page, every value inside a page, and every item inside that
  value, numbering the items consecutively from 0 in encounter order.
  '''
  all_products = (
      product
      for page in json_data.values()
      for product_list in page.values()
      for product in product_list
  )
  return dict(enumerate(all_products))

In [None]:
def get_image_url_from_json(json_data):
  '''Return the 'src' of the product's first image, or None when the
  product's 'images' list is empty.
  '''
  images = json_data['images']
  return None if images == [] else images[0]['src']

In [None]:
def get_images_from_url(url):
  '''Download the image at `url` and decode it into a numpy ndarray.

  Args:
    url: address of the image file.

  Returns:
    Whatever cv2.imdecode produces for the downloaded bytes with the
    cv2.IMREAD_COLOR flag.
    NOTE(review): imdecode is expected to return None for undecodable data;
    callers should check for that - confirm against the OpenCV docs.
  '''
  # Close the HTTP connection once the payload has been read.
  with urllib.request.urlopen(url) as res:
    raw_bytes = res.read()
  image_data = np.asarray(bytearray(raw_bytes), dtype=np.uint8)
  image_array = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
  return image_array

In [None]:
def get_df_vectorized(text_df):
  '''Return the TF-IDF representation of the product text columns.

  The 'format_title' and 'tags' columns of `text_df` are each vectorized with
  the same TF-IDF settings through a ColumnTransformer, and the result of
  fit_transform on the whole frame is returned.

  Args:
    text_df: pandas DataFrame containing 'format_title' and 'tags' columns.

  Returns:
    The matrix produced by ColumnTransformer.fit_transform.
  '''
  vectorizer_ind = TfidfVectorizer(
    max_df = 0.95,
    min_df = 0.05,
    stop_words="english",
  )
  # Built once: the transformer does not depend on which columns the frame
  # happens to have, so there is nothing to loop over.
  vectorizer_whole = ColumnTransformer([('title', vectorizer_ind, 'format_title'),
                              ('tags', vectorizer_ind, 'tags')])
  vectorized_df = vectorizer_whole.fit_transform(text_df)
  return vectorized_df

In [None]:
def format_title(title):
  '''Normalize a product title: every character that is not a letter, digit
  or space becomes a space, and the result is lower-cased.
  '''
  return re.sub('[^a-zA-Z0-9 ]', ' ', title).lower()

In [None]:
def get_tags_as_plain_text(p_json):
  '''Return a product's tags as one lower-case, space-separated string.

  Args:
    p_json: product dict with a 'tags' entry. The entry may be a list of
      tags, a plain string (treated as comma-separated tags), or None.

  Returns:
    The tags joined by single spaces, with every character other than
    letters, digits and spaces removed, in lower case. '' when there are
    no tags.
  '''
  tags = p_json['tags']
  if tags is None:
    return ''
  if isinstance(tags, str):
    # Split the string form directly; evaluating it as a Python literal
    # fails on ordinary text such as "red, size: l".
    tags = [tag.strip() for tag in tags.split(',')]
  plain_tags = ' '.join(str(elem) for elem in tags)
  plain_tags = re.sub(r'[^a-zA-Z0-9 ]', '', plain_tags)
  return plain_tags.lower()

In [None]:
def get_best_of_k(min_k, max_k, v_df, min_wcss_drop=2):
  '''Choose k for k-means from the within-cluster sum of squares (WCSS) curve.

  Fits KMeans for every k in range(min_k, max_k) and records its inertia.
  Among the k values where adding one more cluster still lowers WCSS by more
  than `min_wcss_drop`, the one with the smallest WCSS is returned.

  Args:
    min_k: smallest number of clusters to try (inclusive).
    max_k: upper bound of the search (exclusive).
    v_df: feature matrix passed to KMeans.fit.
    min_wcss_drop: minimum WCSS reduction from k to k+1 for k to count as
      "still improving". Defaults to 2, the threshold previously hard-coded.

  Returns:
    int, the selected number of clusters. Falls back to `min_k` when no k
    qualifies (including when the search range is empty).
  '''
  print('Finding best value of k')
  candidate_ks = range(min_k, max_k)
  wcss = []
  for k in tqdm(candidate_ks):
    km = KMeans(n_clusters = k, init="k-means++", n_init = 'auto', random_state = 1234, max_iter=50)
    km.fit(v_df)
    wcss.append(km.inertia_)
  wcss_series = pd.Series(wcss, index = candidate_ks, dtype = float)
  # diff(periods=-1) is wcss[k] - wcss[k+1], i.e. the gain from one more cluster.
  still_improving = wcss_series[wcss_series.diff(periods=-1) > min_wcss_drop]
  if still_improving.empty:
    best_k = min_k
  else:
    # Keep the k labels on the index: idxmin must return a k value, not the
    # position of the row inside the filtered series.
    best_k = int(still_improving.idxmin())
  print('Found best value of k at {}'.format(best_k))
  return best_k

In [None]:
def mapping_title_to_url(url, json_handle):
  '''Build the product page URL from the store URL and the product handle.
  '''
  product_path = '/products/' + json_handle
  return url + product_path

In [None]:
def get_prod_dataframe(p_df, dom_url):
  '''Build the per-product frame of fields needed for k-means clustering.

  Derives 'title', 'format_title', 'handle', 'tags' and 'product_url' from
  the 'prod_json' column, drops rows with any missing value, keeps the last
  row of each duplicated handle, and renumbers the index from 0.

  Args:
    p_df: DataFrame with a 'prod_json' column of product dicts.
    dom_url: store domain URL used to build product URLs.

  Returns:
    A new DataFrame; the frame passed in is left unmodified.
  '''
  # Work on a copy so the caller's frame does not silently gain columns.
  p_df = p_df.copy()
  p_df['title'] = p_df['prod_json'].apply(lambda x: x['title'] if x else None)
  # A missing title/handle must stay missing (and be dropped below) instead
  # of raising inside the helper functions.
  p_df['format_title'] = p_df['title'].apply(lambda x: format_title(x) if isinstance(x, str) else None)
  p_df['handle'] = p_df['prod_json'].apply(lambda x: x['handle'] if x else None)
  p_df['tags'] = p_df['prod_json'].apply(lambda x: get_tags_as_plain_text(x) if x else None)
  p_df['product_url'] = p_df['handle'].apply(lambda x: mapping_title_to_url(dom_url, x) if isinstance(x, str) else None)
  p_df = p_df.dropna(axis=0)
  p_df = p_df.drop_duplicates(subset=['handle'], keep='last')
  # Renumber rows so later index-aligned assignments line up with row order.
  return p_df.reset_index(drop=True)

In [None]:
def get_clustering_dict(df):
  '''Group product URLs by cluster and return them as a dict.

  Keys are 'product alternates N' with N = cluster_id + 1; each value is the
  list of product URLs assigned to that cluster.
  '''
  urls_by_cluster = df.groupby('cluster_id', sort=True)['product_url'].apply(list)
  return {
      'product alternates {}'.format(int(cluster_id) + 1): urls
      for cluster_id, urls in urls_by_cluster.to_dict().items()
  }

In [None]:
def FindAlternateGroups(store_domain_url):
  '''Entry point: fetch a store's catalogue as a DataFrame.

  Downloads the paginated product JSON for `store_domain_url`, flattens it to
  one entry per product, and returns a tuple of
  (DataFrame with a single 'prod_json' column, the store URL passed in).
  '''
  pages = get_json_data_from_url(store_domain_url)
  products = get_json_prod_wise(pages)
  catalogue_df = pd.DataFrame(pd.Series(list(products.values())), columns=['prod_json'])
  return catalogue_df, store_domain_url

# Replace Domain URL here
##### example format : https://www.woolsboutiqueuomo.com



In [None]:
# Fetch the catalogue and derive the text fields used for clustering.
products_df, store_url = FindAlternateGroups('https://berkehome.pl')
#products_df, store_url = products_df, 'https://sartale2022.myshopify.com'
products_df = get_prod_dataframe(products_df, store_url)
vect_df = get_df_vectorized(products_df)

# Search k between 2 and roughly one cluster per three products.
min_k = 2
max_k = ceil(len(products_df)/3)
k_value = get_best_of_k(min_k, max_k, vect_df)

kmeans = KMeans(
          n_clusters=k_value,
          max_iter=100,
          n_init='auto',
          random_state=1234,
      ).fit(vect_df)

# labels_ follows the row order of vect_df / products_df, so attach it
# positionally. Wrapping it in pd.Series would align on the index instead,
# and rows removed by dropna/drop_duplicates leave gaps in that index.
products_df = products_df.assign(cluster_id=kmeans.labels_)

alternate_dict = get_clustering_dict(products_df)

print(alternate_dict)

In [None]:
# Show the grouped product URLs as this cell's output.
alternate_dict

### (Optional code to export clustered json as txt file)

In [None]:
#optional code to export json


# Write one "group name:list of urls" line per cluster to the text file.
with open("/content/drive/MyDrive/Colab Notebooks/alternate groups/berkehome.txt", 'w') as out_file:
    out_file.writelines(
        '%s:%s\n' % (group_name, group_urls)
        for group_name, group_urls in alternate_dict.items()
    )

In [None]:
#!pip freeze '/content/drive/MyDrive/Colab Notebooks/Alternate Groups/requirements.txt'