In [1]:
# Base URL of the store to analyse; it is passed to FindAllGroups below.
# The commented-out lines are alternative stores: keep exactly one
# assignment active to choose which store is processed.
# store_domain = "https://www.boysnextdoor-apparel.co"
# store_domain = "https://www.woolsboutiqueuomo.com"
store_domain = "https://sartale2022.myshopify.com"
# store_domain = "https://berkehome.pl"
# store_domain = "https://glamaroustitijewels.com"
# store_domain = "https://lampsdepot.com"
# store_domain = "https://kitchenoasis.com"


In [2]:
def FindAllGroups(store_domain, max_df=20, min_df=0.0, eps=0.01, min_samples=1, request_timeout=30):
    """Group a store's products into likely variations by clustering their tags.

    All pages of ``{store_domain}/collections/all/products.json`` are
    downloaded, each product's tags are turned into a TF-IDF vector, and the
    vectors are clustered with DBSCAN. Clustering metrics and cluster-size
    statistics are printed as a side effect.

    Parameters
    ----------
    store_domain : str
        Base URL of the store (a trailing slash is ignored).
    max_df, min_df : int or float
        Forwarded unchanged to ``TfidfVectorizer``; they control which tags
        are kept as features.
    eps, min_samples : float, int
        Forwarded unchanged to ``DBSCAN``.
    request_timeout : float
        Timeout in seconds for each HTTP request, so that a stalled
        connection cannot hang the call indefinitely.

    Returns
    -------
    str
        A JSON array with one object per cluster, excluding the largest
        cluster. Each object has a single key ``"product variations"``
        holding the product URLs of that cluster. The array is empty when
        only one cluster is found.

    Raises
    ------
    ValueError
        If no products could be fetched from the store.
    """
    # Imports stay function-local so the definition is self-contained.
    import json
    from collections import Counter

    import pandas as pd
    import requests
    from sklearn.cluster import DBSCAN
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

    base_url = store_domain.rstrip("/")
    products_url = f"{base_url}/collections/all/products.json"

    # fetch all products from the store, one page at a time
    all_products = []
    page_number = 1
    while True:
        response = requests.get(f"{products_url}?page={page_number}", timeout=request_timeout)
        if response.status_code != 200:
            break
        # A page past the end still carries a (non-empty) JSON object whose
        # "products" list is empty, so the list itself must be tested.
        page_products = response.json().get('products') or []
        if not page_products:
            break
        all_products.extend(page_products)
        page_number += 1

    if not all_products:
        raise ValueError(f"No products could be fetched from {products_url}")

    # storing the data in a dataframe (built once from records)
    columns = ['title', 'handle', 'vendor', 'product_type', 'tags']
    data = pd.DataFrame(
        [{column: product[column] for column in columns} for product in all_products],
        columns=columns,
    )

    # tags present in the given data are used to obtain features;
    # products without tags explode to NaN, which must not be counted as a tag
    all_tags = data['tags'].explode().dropna().unique()

    # tf-idf vectorizer gives control over which tags become features, which
    # is what makes max_df / min_df hyperparameters of this function
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, token_pattern=r'[^,]+')

    data['tags_str'] = [','.join(product_tags).lower() for product_tags in data['tags']]
    X = vectorizer.fit_transform(data['tags_str'])
    print(f"all tags\t=\t{len(all_tags)}\nselected tags\t=\t{len(vectorizer.get_feature_names_out())}")

    # DBSCAN is used instead of k-means because the number of clusters in the
    # data is not known in advance
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X).labels_
    data['labels'] = labels

    # (label, size) pairs, largest cluster first; ties keep first-seen order
    clusters_by_size = Counter(labels.tolist()).most_common()
    cluster_sizes = [size for _, size in clusters_by_size]
    n_samples = len(labels)
    n_clusters = len(clusters_by_size)

    # The three scores are only defined for 2 <= n_clusters <= n_samples - 1;
    # outside that range scikit-learn raises ValueError, so report "n/a".
    if 2 <= n_clusters <= n_samples - 1:
        X_dense = X.toarray()
        silhouette = silhouette_score(X_dense, labels)
        calinski_harabasz = calinski_harabasz_score(X_dense, labels)
        davies_bouldin = davies_bouldin_score(X_dense, labels)
    else:
        silhouette = calinski_harabasz = davies_bouldin = "n/a"
    second_largest = cluster_sizes[1] if n_clusters > 1 else "n/a"

    print(f"""Metric
    \nsilhouette score\t=\t{silhouette}
    \ncalinski harabasz score\t=\t{calinski_harabasz}
    \ndavies bouldin score\t=\t{davies_bouldin}
    \nNumber of elements\t=\t{n_samples}
    \nNumber of clusters\t=\t{n_clusters}
    \nLargest cluster\t\t=\t{cluster_sizes[0]}
    \nSecond largest cluster\t=\t{second_largest}
    \nSmallest cluster\t=\t{cluster_sizes[-1]}
    """)

    # preparation for the json output: one entry per cluster, in descending
    # size order; the largest cluster is excluded as it is often too large
    handles_by_label = data.groupby('labels', sort=False)['handle'].agg(list).to_dict()
    output_list = [
        {"product variations": [f"{base_url}/products/{handle}" for handle in handles_by_label[label]]}
        for label, _ in clusters_by_size[1:]
    ]
    # Serialised once, after the loop, so the result is defined even when
    # there is no cluster besides the largest one.
    return json.dumps(output_list)
    

In [3]:
# Run the grouping with the default hyperparameters. As the last expression of
# the cell, the returned JSON string is displayed as the cell output, after the
# metrics printed by the function itself.
FindAllGroups(store_domain)

all tags	=	564
selected tags	=	404
Metric
    
silhouette score	=	0.8421052629316981
    
calinski harabasz score	=	2.3635720312919368e+34
    
davies bouldin score	=	1.5916150435670483e-08
    
Number of elements	=	2223
    
Number of clusters	=	523
    
Largest cluster		=	1123
    
Second largest cluster	=	16
    
Smallest cluster	=	1
    


'[{"product variations": ["https://sartale2022.myshopify.com/products/all-day-long-collection-striped-cotton-light-blue-shirt-with-cutaway-collar", "https://sartale2022.myshopify.com/products/all-day-long-collection-striped-cotton-light-blue-shirt-with-shark-collar", "https://sartale2022.myshopify.com/products/checked-cotton-light-blue-shirt-with-cutaway-collar", "https://sartale2022.myshopify.com/products/cotton-striped-shirt-with-round-french-cuff", "https://sartale2022.myshopify.com/products/cotton-striped-shirt-with-round-french-cuff-2", "https://sartale2022.myshopify.com/products/linen-and-cotton-blend-striped-oxford-shirt", "https://sartale2022.myshopify.com/products/micro-checked-cotton-shirt-in-light-blue-1", "https://sartale2022.myshopify.com/products/multi-striped-cotton-shirt-in-light-blue", "https://sartale2022.myshopify.com/products/striped-alums-cotton-shirt-in-light-blue", "https://sartale2022.myshopify.com/products/striped-cotton-shirt-1", "https://sartale2022.myshopify