# Product Grouping Analysis
## Summary
## 1. Initialize Notebook
This section initializes the necessary packages and defines auxiliary functions.

### 1.1 Notebook Settings

In [1]:

## set transformer model 
#py_model_name = 'all-MiniLM-L6-v2' # primarily English
#py_model_name = 'paraphrase-multilingual-MiniLM-L12-v2' # multilingual
py_model_name = 'LaBSE' 
#py_model_name = 'xlm-r-100langs-bert-base-nli-stsb-mean-tokens' 
py_model_name = 'intfloat/multilingual-e5-base'


# runner ups to try, according to ChatGPT:
# - XLM-RoBERTa
# - mBART-50
# - MT5 (Multilingual T5)
# - ByT5 (handles raw text at the byte level, ideal for multilingual similarity tasks).
# - mLUKE (multilingual version of LUKE, good for named entity recognition and contextual understanding).
# - mBERT (baseline multilingual BERT, effective but less performant than XLM-R).

n_cores = 8


### 1.2 Load Packages

In [None]:

import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["SIM_EMBEDDINGS_CORES"] = f"{n_cores}"

import pandas as pd
import polars as pl

import random
import pickle
import numpy as np
from tqdm import tqdm
import sentence_transformers as st

import nltk
from nltk.tokenize import word_tokenize

import gensim
from gensim.models import Word2Vec

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import multiprocessing
multiprocessing.set_start_method("forkserver", force=True)

import torch
torch.set_num_threads(n_cores)

from plotnine import *

from collections import defaultdict

import json
from scipy.cluster.hierarchy import linkage, to_tree



  from .autonotebook import tqdm as notebook_tqdm


### 1.3 Globally Used Code

In [3]:
# Enable the underscore notation for polars columns, similar to ibis and dplyr (i.e., `pl.col("column_name")` -> `_.column_name`)
class PolarsColumnNamespace:
    def __getattr__(self, name):
        return pl.col(name)

# Enable _ as a shorthand for that class
_ = PolarsColumnNamespace()

## 2. Prepare Transactional Data
### 2.1 Read Transactional Data
mention that these are two datasets from UCI

In [4]:
df1 = pl.read_excel('./data/online+retail/Online Retail.xlsx')
df2 = pl.read_excel('./data/online+retail+ii/online_retail_II.xlsx')
df2 = df2.rename({"Invoice": "InvoiceNo","Price": "UnitPrice", "Customer ID": "CustomerID"})
df = df1.vstack(df2)

### 2.2 Clean Up Dataset
#### 2.2.1 Missing Invoice IDs
- A number of records do not have an invoice id. We'll exclude them, because we cannot use them to determine the basket co-occurrence frequencies for the respective entries.
- This applies to $19500$ rows, i.e., $1.8\%$ of the records

In [5]:
nrows_null = df.filter( _.InvoiceNo.is_null() ).shape[0]
nrows_total = df.shape[0]
(nrows_null, nrows_null / nrows_total)

(19500, 0.018269203743781444)

In [6]:
# drop rows with null InvoiceNo
df = df.filter( ~_.InvoiceNo.is_null() )

### 2.2.2 Missing or Incorrect Product Descriptions
- Some product descriptions indicate adjustments or comments rather than product names. In some cases, additional information is provided, and in others, they have been initially mislabeled.
- We'll exclude the corresponding transactions from the data.
- Exploratory analysis showed that nearly all proper product descriptions are uppercase, and those that are not fully uppercase are in title case.
    - Thus, one heuristic to identify anomalous descriptions is calculating the relative number of uppercase letters at the start of a word.

In [7]:
df = ( df
    .with_columns([
        # Extract the count of lowercase letters starting a word and store it in a new column
        _.Description.str.extract_all(r"[^A-Za-z][a-z]").list.len().alias("lowercase_start_count"),
        # Extract the count of uppercase letters starting a word and store it in a new column
        _.Description.str.extract_all(r"[^A-Za-z][A-Z]").list.len().alias("uppercase_start_count")
    ])
    .with_columns([
        # Create a boolean column indicating if there are no letters starting a word
        ( (_.lowercase_start_count == 0) & (_.uppercase_start_count == 0) ).alias("no_letter_start"),
        # Create a boolean column indicating if there are more lowercase letters starting words than uppercase
        (  _.lowercase_start_count > _.uppercase_start_count ).alias("largely_lowercase_start")
    ])
    .with_columns([
        # Create a final boolean column indicating if the description is anomalous
        ( _.largely_lowercase_start | _.no_letter_start | _.Description.is_null() ).alias("is_anomalous_description")
    ])
    # Drop the intermediate columns used for calculations
    .drop(["lowercase_start_count", "uppercase_start_count", "no_letter_start", "largely_lowercase_start"])
)

In [8]:
#  Verify that all descriptions classified as anomalous are so indeed
print(df.filter(_.is_anomalous_description)["Description"].unique().to_list())

['MIA', 'Wet pallet-thrown away', 'samples', 'wrongly coded 20713', 'rusty throw away', 'sold as 22467', 'reverse mistake', '22719', 'wet/rusty', 'Ebay sales by the box.', 'samples/damages', 'Damp and rusty', 'stock creditted wrongly', 'thrown away', 'sold with wrong barcode', 'crushed boxes', '? sold as sets?', '?sold as sets?', "thrown away-can't sell.", 'Found by jackie', 'wrongly marked. 23343 in box', 'smashed', 'cracked', 'damages/credits from ASOS.', 'Adjustment by john on 26/01/2010 16', 'lost in space', 'dirty', '85123a mixed', 'damages', 'gone', 'wrong barcode', 'taig adjust', 'counted', 'Not rcvd in 10/11/2010 delivery', 'sold in wrong qnty', 'sold as 1', 'mystery! Only ever imported 1800', 'sold as gold', 'mailout', 'water damage', 'label mix up', 'mailout addition', 'Mixed with blue', '?? missing', '????damages????', 'wedding co returns?', 'sold as set on dotcom', 'Adjustment', 'lost??', 'Missing', 'checked', 'wrongly coded-23343', 'crushed', 'dotcom sales', 'Sale error', 

In [9]:
# Check all the remaining product descriptions
print(df.filter(~_.is_anomalous_description)["Description"].unique().to_list())

['FUSCHIA RETRO BAR STOOL', 'KITTENS DESIGN FLANNEL', 'TEA TIME PARTY BUNTING', 'HANGING METAL BIRD BATH', 'BUNNY EGG BOX', 'MAGNETS PACK OF 4 HOME SWEET HOME', 'FRENCH BLUE METAL DOOR SIGN 2', 'DECORATIVE ROSE BATHROOM BOTTLE', 'DOILEY BISCUIT TIN', 'ZINC T-LIGHT HOLDER STARS SMALL', 'SILVER HEART COMPACT MIRROR', 'PHOTO FRAME 3 CLASSIC HANGING', 'BELL HEART ANTIQUE GOLD', 'PINK POLKADOT GARDEN PARASOL', 'CHRISTMAS METAL POSTCARD WITH BELLS', 'STARFISH SOAP DISH', 'NUMBER TILE VINTAGE FONT 5', 'LARGE CAKE STAND HANGING HEARTS', 'WHITE ROHMBIC BLOCK TABLE LAMP', 'ANGEL DECORATION 3 BUTTONS ', 'CHILDRENS GARDEN GLOVES PINK', 'TRADITIONAL WOODEN CATCH CUP GAME ', 'WRAP COWBOYS  ', 'RETRO SPOT TEA SET CERAMIC 11 PC ', 'HOLIDAY FUN LUDO', 'EASTER BUNNIES ON A STICK', 'BLACK SILOUETTE CANDLE PLATE', 'MAGIC GARDEN MOUNT FUJI', 'HOME SWEET HOME BLACKBOARD', 'BUNDLE OF 3 SCHOOL EXERCISE BOOKS  ', 'RED POLKADOT COFFEE  MUG', 'RETROSPOT CANDLE  SMALL', 'MOODY BOY  DOOR HANGER ', '3 TRADITIONAl B

- This applies to 4193 rows, i.e., $0.4\%$ of the records

In [10]:
nrows_anomalous_description = df.filter(_.is_anomalous_description).shape[0]
nrows_total = df.shape[0]
(nrows_anomalous_description, nrows_anomalous_description / nrows_total)

(8575, 0.0081832670083121)

In [11]:
# Extract the transaction IDs associated with anomalous descriptions
anomalous_transactions = df.filter(_.is_anomalous_description).select(_.InvoiceNo).with_columns([pl.lit(True).alias("is_anomalous_invoice")])

df = (df
    # Join the main dataframe with the anomalous transactions to mark them
    .join(anomalous_transactions, on="InvoiceNo", how="left")
    # Filter out the rows where the invoice is marked as anomalous
    .filter(_.is_anomalous_invoice.is_null())
    # Drop the columns used for marking and filtering
    .drop(["is_anomalous_description", "is_anomalous_invoice"])
)

### 2.3 Separate Transaction and Product Information
- Here, we separate the transaction and product information

In [12]:
transactions_df = df.select(_.InvoiceNo, _.StockCode)
products_df = df.select(_.StockCode, _.Description).unique()

## 3. Product Description Embeddings
### 3.1 Create Unique Product Descriptions

In [13]:
# Group by StockCode and count the occurrences, then sort by count in descending order and get the list of StockCodes
stock_codes = (products_df
    .group_by(_.StockCode)
    .count()
    .sort(_.count, descending=True)
)["StockCode"].to_list()

# Create a pivot table with StockCode as the index and Description as the values
unique_product_descriptions = (products_df
    # Add a row index for each StockCode
    .with_columns( pl.arange(0, pl.count()).over("StockCode").alias("row_idx"))
    .pivot( values="Description", index="row_idx", on="StockCode", aggregate_function="first")
)[stock_codes]



In [14]:
# Display the pivot table
unique_product_descriptions

22346,21955,22345,22384,22344,20685,23236,23196,22197,22845,23209,23535,22200,22343,20750,22853,22383,21524,84997D,23366,22416,21523,23231,84997b,22937,22356,22776,22950,23244,84997d,84997c,22785,23370,84509c,22740,22333,23240,…,21695,22519,23692,35822B,84723,90089,85119,20848,72779,21638,21374,21356,85064,84818,47021G,35637A,21253,82494L,84799,84800M,85049A,21676,22694,90059D,84012,90198A,21900,20897,21801,47598,84539,35597A,84614C,21689,84743C,90077,84821
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""PARTY PIZZA DISH GREEN RETROSP…",""" DOORMAT UNION JACK GUNS AND …","""PARTY PIZZA DISH BLUE RETROSPO…","""LUNCHBAG PINK RETROSPOT""","""PARTY PIZZA DISH PINK+WHITE SP…","""DOORMAT RED RETROSPOT""","""STORAGE TIN VINTAGE DOILEY ""","""RETRO LEAVES MAGNETIC NOTEPAD""","""POPCORN HOLDER , SMALL ""","""VINTAGE CREAM CAT FOOD CONTAIN…","""LUNCH BAG VINTAGE DOILY ""","""WALL ART BICYCLE SAFETY""","""FRYING PAN PINK POLKADOT""","""PARTY PIZZA DISH RED WHITE SPO…",""" RED/WHITE DOT MINI CASES""","""ENAMEL CAT BOWL CREAM""","""LUNCHBAG SUKI DESIGN ""","""SPOTTY HOME SWEET HOME DOORMA…","""PINK 3 PIECE POLKADOT CUTLERY …","""SET 12 COLOURING PENCILS DOILY""","""SET OF 36 SPACEBOY PAPER DOILI…","""DOOR MAT FANCY FONT HOME SWEET…","""WRAP VINTAGE DOILEY ""","""CHILDRENS CUTLERY RETROSPOT RE…","""BAKING MOULD CHOCOLATE CUP CAK…","""CHARLOTTE BAG PINK POLKADOT""","""CAKESTAND, 3 TIER, LOVEHEART""","""SET OF 36 DOILIES VINTAGE CHRI…","""ROUND STORAGE TIN VINTAGE LEAF""","""CHILDRENS CUTLERY POLKADOT PIN…","""BLUE 3 PIECE POLKADOT CUTLERY …","""CUSHION COVER PINK UNION FLAG""","""SET 36 COLOURING PENCILS DOILY""","""DOTTY PLACEMATS ""","""POLKA DOT PEN""","""RETROSPOT PARTY BAG + STICKER …","""SET OF 4 KNICK KNACK TINS DOI…",…,"""SMALL SILVER FLOWER CANDLE POT""","""CHILDS GARDEN BRUSH PINK""","""WRAP A PRETTY THANK YOU""","""ACRYLIC BEAD CHAIN, BLUE""","""CHERRY BLOSSOM CANVAS ART PICT…","""PINK CRYSTAL SKULL PHONE CHARM""","""WATERING CAN SINGLE HOOK PISTA…","""ZINC HEART LATTICE CHARGER SMA…","""SET/6 BLACK ROSE T-LIGHT CANDL…","""ASSORTED TUTTI FRUTTI NOTEBOOK""","""MIRRORED WALL ART SKULLS""","""TOAST ITS - FAIRY FLOWER""","""CREAM SWEETHEART LETTER RACK""","""DANISH ROSE PHOTO FRAME""","""SET/6 BEAD COASTERS GAUZE BAG …","""IVORY STRING CURTAIN WITH POLE…","""SET OF PICTURE FRAME STICKERS""","""WOODEN FRAME ANTIQUE WHITE ""","""SPRIG LAVENDER ARTIFICIAL FLOW…","""MEDIUM WHITE/PINK ROSE ART FLO…","""TRADITIONAL CHRISTMAS RIBBONS""","""FLOWERS STICKERS""","""WICKER STAR ""","""DIAMANTE HAIR GRIP PACK/2 PERI…","""MAGIC SHEEP WOOL GROWING FROM …","""VINTAGE ROSE BEAD BRACELET RAS…","""KEY FOB , SHED""","""VINTAGE NOTEBOOK PARIS DAYS""","""CHRISTMAS TREE DECORATION WITH…","""NEW ENGLAND EGG WARMER""","""KNITTED RABBIT DOLL ""","""DUSTY PINK CHRISTMAS TREE 30CM""","""BLUE BAROQUE FLOCK CANDLE HOLD…","""SILVER VANILLA FLOWER CANDLE …","""ORANGE FELT VASE + FLOWERS""","""BLACK DIAMOND CLUSTER EARRINGS""","""DANISH ROSE DELUXE COASTER"""
"""PARTY PIZZA DISH GREEN+WHITE S…","""DOORMAT UNION JACK GUNS AND RO…","""PARTY PIZZA DISH BLUE POLKADOT""","""LUNCH BAG PINK RETROSPOT""","""PARTY PIZZA DISH PINK POLKADOT""","""RED SPOTTY COIR DOORMAT""","""DOILEY STORAGE TIN""","""RETO LEAVES MAGNETIC SHOPPING …","""POPCORN HOLDER""","""VINTAGE CAT FOOD CONTAINER""","""LUNCH BAG DOILEY PATTERN ""","""WALL ART BICYCLE SAFTEY ""","""FRYING PAN PINK POLKADOT ""","""PARTY PIZZA DISH RED RETROSPOT""","""RED RETROSPOT MINI CASES""","""CAT BOWL, ENAMEL , CREAM COLOU…","""LUNCH BAG SUKI DESIGN ""","""DOOR MAT SPOTTY HOME SWEET HOM…","""PINK 3 PIECE MINI DOTS CUTLERY…","""SET 12 COLOUR PENCILS DOILEY""","""SET OF 36 DOILIES SPACEBOY DES…","""FANCY FONT HOME SWEET HOME DOO…","""WRAP DOILEY DESIGN""","""RED 3 PIECE MINI DOTS CUTLERY …","""BAKING MOULD CUPCAKE CHOCOLATE""","""CHARLOTTE BAG , PINK/WHITE SPO…","""SWEETHEART CAKESTAND 3 TIER""","""SET OF 36 VINTAGE CHRISTMAS DO…","""CANNISTER VINTAGE LEAF DESIGN""","""PINK 3 PIECE POLKADOT CUTLERY …","""BLUE 3 PIECE MINI DOTS CUTLERY…","""SQUARECUSHION COVER PINK UNION…","""SET 36 COLOUR PENCILS DOILEY""","""SET OF 4 POLKADOT PLACEMATS ""","""POLKADOT PENS""","""RETRO SPOT PARTY BAG + STICKER…","""SET OF 4 KNICK KNACK TINS DOIL…",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""PARTY PIZZA DISH GREEN POLKADO…","""UNION JACK GUNS & ROSES DOORM…","""PARTY PIZZA DISH BLUE+WHITE SP…","""LUNCH BAG PINK POLKADOTS""","""PARTY PIZZA DISH PINK WHITE SP…","""DOOR MAT RED SPOT""","""DOILEY BISCUIT TIN""","""VINTAGE LEAF MAGNETIC NOTEPAD""","""SMALL POPCORN HOLDER""","""CAT FOOD CONTAINER , VINTAGE""","""LUNCH BAG VINTAGE DOILEY ""","""BICYCLE SAFTEY WALL ART""","""FRYING PAN PINK RETROSPOT""","""PARTY PIZZA DISH RED+WHITE SPO…","""RED/WHITE DOT MINI CASES""","""CAT BOWL VINTAGE CREAM""","""LUNCH BAG SUKI DESIGN ""","""DOORMAT SPOTTY HOME SWEET HOME""","""CHILDRENS CUTLERY POLKADOT PIN…","""SET 12 COLOURING PENCILS DOILE…","""36 DOILIES SPACEBOY DESIGN ""","""DOORMAT FANCY FONT HOME SWEET …","""WRAP VINTAGE DOILY ""","""RED 3 PIECE RETROSPOT CUTLERY …","""BAKING MOULD CHOCOLATE CUPCAKE…","""CHARLOTTE BAG PINK WITH WHITE …","""SWEETHEART 3 TIER CAKE STAND ""","""36 DOILIES VINTAGE CHRISTMAS""","""STORAGE TIN VINTAGE LEAF""","""PINK 3 PIECE MINI DOTS CUTLERY…","""CHILDRENS CUTLERY POLKADOT BLU…","""SQUARECUSHION COVER PINK UNION…","""SET 36 COLOURING PENCILS DOILE…","""SET OF 4 DOTTY PLACEMATS ""","""POLKADOT PEN""","""RETRO SPORT PARTY BAG + STICKE…","""SET OF 4 KNICK KNACK TINS DOIL…",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""PARTY PIZZA DISH GREEN WHITE S…","""DOOR MAT UNION JACK GUNS AND R…","""PARTY PIZZA DISH BLUE WHITE SP…","""LUNCH BAG PINK POLKADOT""","""PARTY PIZZA DISH PINK RETROSPO…","""DOORMAT RED SPOT""","""STORAGE TIN VINTAGE DOILY ""","""LEAVES MAGNETIC SHOPPING LIST""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
product_descriptions = products_df.group_by("StockCode").agg(
    pl.col("Description").str.concat(" / ").alias("Description")
)

In [16]:
product_descriptions

StockCode,Description
str,str
"""85230D""","""VANILLA SCENTED VOTIVE CANDLE"""
"""85127""","""SMALL SQUARE CUT GLASS CANDLES…"
"""22809""","""SET OF 6 T-LIGHTS SANTA"""
"""84437A""","""S/6 PINK EASTER DECS IN BOX"""
"""23522""","""DOG AND BALL WALL ART / WALL A…"
…,…
"""85176""","""SEWING SUSAN 21 NEEDLE SET"""
"""85049a""","""TRADITIONAL CHRISTMAS RIBBONS"""
"""84820""","""DANISH ROSE TRINKET TRAYS"""
"""84611B""","""BLACK NEW BAROQUE FLOCK BOX"""


### 3.2 Create Description Embeddings


In [17]:
def encode_semantic_embeddings(description, prompt = None, fname_cache = None):
  """ """
  if (not fname_cache is None) and os.path.exists(fname_cache):
      embeddings = None
      with open(fname_cache, "rb") as f:
        embeddings = pickle.load(f)
      return embeddings

  py_model = st.SentenceTransformer(py_model_name)
  
  prompts = [f"passage: Product: '{t}'" for t in description]
  
  embeddings = py_model.encode(
      sentences = prompts, 
      prompt = prompt,
      show_progress_bar = True,
      use_multiprocessing=True,
      normalize_embeddings = False # note: apparently dot products can be used instead of cosine sim, if normalized
  )

  if (not fname_cache is None):
      with open(fname_cache, "wb") as f:
          pickle.dump(embeddings, f)

  return embeddings

In [18]:
fname_dkt_embeddings = "./analysis1_embeddings_semantic.pkl"

product_ids = product_descriptions["StockCode"].to_list()
description = product_descriptions["Description"].to_list()

#description = [
#    "buy", "purchase",
#    "cold", "chilly",
#    "like", "enjoy",
#    "finish", "complete",
#    "meeting", "appointment"
#]
#product_ids = [str(product_id) for product_id in range(0, 10)]

embeddings_semantic = encode_semantic_embeddings(description, fname_cache = fname_dkt_embeddings)

In [19]:
## 4. Product Co-Occurrence Embeddings

- similar (almost the same?) as product2vec: https://github.com/TheoVall-DS/product2vec/ 

In [20]:
transactions = transactions_df.group_by("InvoiceNo").agg(
    pl.col("StockCode").cast(pl.Utf8).alias("products")
)["products"].to_list()

In [21]:
def resample_transactions(transactions):
    """Resample and shuffle transactions to create a new dataset."""
    
    # Define a lambda function to shuffle a list
    shuffle = lambda lst: random.sample(lst, k=len(lst))

    # Resample the transactions to create a new dataset
    resampled_transactions = shuffle(transactions)  # Resample with replacement
    shuffled_transactions = [shuffle(transaction) for transaction in resampled_transactions]  # Shuffle the sentences

    return shuffled_transactions

In [22]:
def encode_transactional_embeddings(product_ids, transactions, embedding_size = 100, window = 5, min_count = 1, n_bootstrap = 50, fname_cache = None):
    """ """

    if (not fname_cache is None) and os.path.exists(fname_cache):
        embeddings = None
        with open(fname_cache, "rb") as f:
            bootstrap_embeddings, embeddings = pickle.load(f)
        return np.array( embeddings )

    # Number of resamples
    bootstrap_embeddings = []

    for _ in tqdm( range(n_bootstrap) ):
        resampled_transactions = resample_transactions(transactions)  # Resample with replacement and re-shuffle the transactions
        model = Word2Vec(resampled_transactions, vector_size=embedding_size, window=window, min_count=min_count, sg=0, workers=4)
        bootstrap_embeddings.append({word: model.wv[word] for word in model.wv.index_to_key})
        
    if (not fname_cache is None):
        with open(fname_cache, "wb") as f:
            pickle.dump(bootstrap_embeddings, f)

    bootstrap_embeddings = [ defaultdict(lambda: np.zeros(embedding_size), d) for d in bootstrap_embeddings ]

    embeddings = []
    for product_id in tqdm( product_ids ):
        product_embeddings = [embedding[product_id] for embedding in bootstrap_embeddings]
        aggregated_embedding = np.mean(product_embeddings, axis=0)
        embeddings.append( aggregated_embedding )

    bootstrap_embeddings = [ dict(d) for d in bootstrap_embeddings ]
    if (not fname_cache is None):
        with open(fname_cache, "wb") as f:
            pickle.dump((bootstrap_embeddings, embeddings), f)

    return np.array( embeddings )

In [23]:
%%time
transactional_embedding_size = 100
transactional_window = 5
transactional_min_count = 1
transactional_n_bootstrap = 2

fname_dkt_embeddings = "./analysis1_embeddings_transactions.pkl"
embeddings_transactional = encode_transactional_embeddings( product_ids, transactions, embedding_size = transactional_embedding_size, 
                                                            window = transactional_window, min_count = transactional_min_count, 
                                                            n_bootstrap = transactional_n_bootstrap, fname_cache = fname_dkt_embeddings)

CPU times: user 135 ms, sys: 7.87 ms, total: 143 ms
Wall time: 140 ms


## 5. Fused Embeddings


In [24]:
def normalize_embeddings(embeddings):
  """ """
  return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

In [25]:
norm_embeddings_semantic = normalize_embeddings(embeddings_semantic)
norm_embeddings_transactional = normalize_embeddings(embeddings_transactional)
embeddings = np.column_stack((norm_embeddings_semantic, norm_embeddings_transactional))
embeddings = norm_embeddings_semantic
embeddings.shape

(4896, 768)

## 6. Similarity Quantification


In [32]:
sim = cosine_similarity(embeddings, embeddings)
#sim = (sim - np.min(sim) ) / ( 1 - np.min(sim) )
#sim = np.clip(sim, a_min=0, a_max=1)

distance = 1 - sim

from scipy.spatial.distance import squareform
condensed_distance = squareform(distance, checks=False)

from scipy.cluster.hierarchy import linkage, dendrogram

Z = linkage(condensed_distance, method='average')  # or 'average', 'complete', etc.
Z = np.clip(Z, a_min=0, a_max=None)

In [None]:

def compute_homogeneity(indices, similarity_matrix):
    similarity_submatrix = similarity_matrix[np.ix_(indices, indices)]
    homogeneity = np.min( similarity_submatrix )
    return float(homogeneity)

def collect_leaf_indices(node):
    if node.is_leaf():
        return [node.id]
    else:
        return collect_leaf_indices(node.left) + collect_leaf_indices(node.right)

def build_tree(node, labels, similarity_matrix):
    """ 
    """

    if node.is_leaf():
        return [node.id], { # , 
            "name": f"{labels[node.id]}",
            "homogeneity": 0.0  # Leaves have no homogeneity
        }

    else:
        left_leaves, left_branch = build_tree(node.left, labels, similarity_matrix)
        right_leaves, right_branch = build_tree(node.right, labels, similarity_matrix)
        leaves = left_leaves + right_leaves
        
        homogeneity = compute_homogeneity(leaves, similarity_matrix)
        if not left_branch["homogeneity"]:
            left_branch["homogeneity"] = homogeneity

        if not right_branch["homogeneity"]:
            right_branch["homogeneity"] = homogeneity

        return leaves, { 
            "name": f"[{round(100*homogeneity)}]", # "", # or str(node.id)
            "homogeneity": homogeneity,  # Proxy: deeper == more homogeneous
            "children": [ left_branch, right_branch ]
        }


scaled_sim = ( sim - np.min(sim) ) / ( 1 - np.min(np.min(sim)) )

root_node = to_tree(Z)
#x = root_node.right.right.right.left.right
leaves, tree_json = build_tree(root_node, description, sim)
#print("---")
#print(leaves)
#print(tree_json)

with open('tree.json', 'w') as f:
    json.dump(tree_json, f)

In [None]:
import http.server
import socketserver
import threading
from IPython.display import IFrame, display

PORT = 8005
DIRECTORY = '.'  # или укажи подкаталог

# Поднимаем HTTP сервер в отдельном потоке
Handler = http.server.SimpleHTTPRequestHandler
httpd = socketserver.TCPServer(("", PORT), Handler)

def serve():
    import os
    os.chdir(DIRECTORY)
    httpd.serve_forever()

thread = threading.Thread(target=serve)
thread.daemon = True
thread.start()


192.168.0.233 - - [23/Mar/2025 13:00:37] "GET / HTTP/1.1" 200 -
192.168.0.233 - - [23/Mar/2025 13:00:37] "GET /view_treesx.html HTTP/1.1" 200 -
192.168.0.233 - - [23/Mar/2025 13:00:38] code 404, message File not found
192.168.0.233 - - [23/Mar/2025 13:00:38] "GET /favicon.ico HTTP/1.1" 404 -
192.168.0.233 - - [23/Mar/2025 13:00:42] "GET /view_treesx.html HTTP/1.1" 200 -


In [51]:
import webbrowser
webbrowser.open(f'http://192.168.0.101:{PORT}/view_treesx.html')


True

In [None]:
tree_json_x = tree_json