In [139]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 500)

types = {
    "Invoice/Item Number": object,
    "Date": object,
    "Store Number": int,
    "Store Name": object,
    "Address": object,
    "City": object,
    "Zip Code": object,
    "Store Location": object,
    "County Number": object,
    "County": object,
    "Category": float,
    "Category Name": object,
    "Vendor Number": float,
    "Vendor Name": object,
    "Item Number": object,
    "Item Description": object,
    "Pack": int,
    "Bottle Volume (ml)": int,
    "State Bottle Cost": float,
    "State Bottle Retail": float,
    "Bottles Sold": int,
    "Sale (Dollars)": float,
    "Volume Sold (Liters)": float,
    "Volume Sold (Gallons)": float,
}

df = pd.read_csv("Iowa_Liquor_Sales.csv", dtype=types, parse_dates=["Date"])
df_names = df[
    ["Item Number", "Item Description", "Pack", "Bottle Volume (ml)", "Category Name"]
].drop_duplicates(keep="first", subset=["Item Number"])

In [146]:
df_names = df_names.fillna("-")
df_names["Item Number"] = pd.to_numeric(
    df_names["Item Number"], errors="coerce", downcast="integer"
)

In [216]:
print(df_names)

          Item Number                            Item Description  Pack  Bottle Volume (ml)                 Category Name
0              5326.0                        JOHNNIE WALKER BLACK    12                 750               SCOTCH WHISKIES
1             36447.0                                    INGENIOZ    12                 750               AMERICAN VODKAS
2             43128.0                        BACARDI SUPERIOR RUM     6                1750                     WHITE RUM
3             52594.0                             E & J VS BRANDY    24                 375             AMERICAN BRANDIES
4             36308.0                               HAWKEYE VODKA     6                1750               AMERICAN VODKAS
...               ...                                         ...   ...                 ...                           ...
25924217     965424.0                   LICOR 43 ORIGINAL LIQUEUR    12                 375  IMPORTED CORDIALS & LIQUEURS
25925624     901993.0  H

In [3]:
df["Item Number"] = pd.to_numeric(
    df["Item Number"], errors="coerce").dropna()

In [273]:
pivotDf = (
    pd.pivot_table(
        df,
        index="Store Number",
        columns="Item Number",
        values="Bottles Sold",
        aggfunc=lambda x: len(x.unique()),
    )
    .reset_index()
    .fillna(0)
    .drop("Store Number", axis=1)
)

print(pivotDf)


Item Number  100001  100005  100006  100015  100017  100018  100022  100024  100025  100026  ...  999935  999936  999937  999938  999991  999992  999993  999994  999995  x904631
0               0.0     0.0     0.0     1.0     0.0     2.0     0.0     1.0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0      0.0
1               0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0      0.0
2               0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0      0.0
3               0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0      0.0
4               0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0  ...     0.0     0

In [111]:
from numpy.linalg import svd

matrix = pivotDf.values
u, s, vh = svd(matrix, full_matrices=False)
ELASTIC_SEARCH_MAX_DIM = 1024
sliced = vh.T[:, :ELASTIC_SEARCH_MAX_DIM]

In [275]:
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
print(client.info()['tagline'])

You Know, for Search


In [149]:
# 'Item Number','Item Description','Pack','Bottle Volume (ml)','Category Name'

request_body = {
    "mappings": {
        "properties": {
            "productCode": {"type": "integer"},
            "embed": {
                "type": "dense_vector",
                "dims": 1024,
                "index": True,
                "similarity": "cosine",
            },
            "name": {"type": "text"},
            "pack": {"type": "integer"},
            "vol_ml": {"type": "integer"},
            "category": {"type": "text"},
        }
    }
}
print("creating index...")
client.indices.create(index="liquoridx", body=request_body)

creating index...


  client.indices.create(index = 'liquoridx', body = request_body)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'liquoridx'})

In [153]:
for embed, prodCode in zip(sliced, pivotDf.columns.to_list()):
    sliceOfMeta = df_names[df_names["Item Number"] == prodCode]

    doc = {
        "productCode": prodCode,
        "embed": embed,
        "name": sliceOfMeta["Item Description"].iloc[0],
        "pack": sliceOfMeta["Pack"].iloc[0],
        "vol_ml": sliceOfMeta["Bottle Volume (ml)"].iloc[0],
        "category": sliceOfMeta["Category Name"].iloc[0],
    }

    try:
        client.index(index="liquoridx", document=doc)
    except Exception as e:
        print(prodCode, e)

938645.0 BadRequestError(400, 'mapper_parsing_exception', 'failed to parse')


In [159]:
client.indices.refresh(index="liquoridx")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [227]:
# Fuzzy search

body = {"match": {"name": {"query": "blueberry", "fuzziness": "AUTO"}}}


find = client.search(
    index="liquoridx", query=body, source=["productCode", "name", "embed"]
)
print(find)

for hit in find.body["hits"]["hits"]:
    print(hit)

{'took': 28, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 49, 'relation': 'eq'}, 'max_score': 6.954382, 'hits': [{'_index': 'liquoridx', '_id': 'ib8-IIkBtDQXcKWm_A9O', '_score': 6.954382, '_source': {'productCode': 935222.0, 'name': 'PINNACLE BLUEBERRY', 'embed': [-0.00029347377741811766, -0.0007943950403832184, -0.0006533305437304715, 0.0008327867540836023, 0.0021420240565729644, 0.0009403576764772977, 9.103999828858531e-05, -0.0031904508375811174, 0.004407414984448586, -0.0017808260803417707, 0.0020034545230277864, 0.0008784904103179208, 0.0016902504166619583, -0.0010726218059695575, 0.0008156191754869137, -0.0035203429867610024, 6.892081100468152e-07, 0.00031242922951757555, -0.001999935253000906, 0.00017540398615952326, 0.003420085770059169, 0.0021707678134135766, -0.005208276016434731, -0.001906936973064642, 0.0036099440611640846, 0.008836115958459831, -0.0031970921175013423, -0.002465885687443117, 0.002296345

In [272]:
# Vector search


# first find the id
searchWord = "Finlandia Rasberry"

body = {"match": {"name": {"query": searchWord, "fuzziness": "AUTO"}}}
find = client.search(index="liquoridx", query=body, source=["productCode", "name"])

topMatchCode = int(find.body["hits"]["hits"][0]["_source"]["productCode"])

# retrieve the embedding for that id

body = {"match": {"productCode": {"query": topMatchCode}}}
find = client.search(
    index="liquoridx", query=body, source=["productCode", "name", "embed"])

embeddingArr = find.body["hits"]["hits"][0]["_source"]["embed"]

# Similarity search


body = {
    "field": "embed",
    "query_vector": embeddingArr,
    "k": 10,
    "num_candidates": 30
}

find = client.search(
    index="liquoridx",
    source=["productCode", "name"],
    knn= body,
)


for hit in find.body["hits"]["hits"]:
    print(hit)

{'_index': 'liquoridx', '_id': '374-IIkBtDQXcKWmdPGP', '_score': 1.0, '_source': {'productCode': 34318.0, 'name': 'FINLANDIA REDBERRY'}}
{'_index': 'liquoridx', '_id': 'RL4-IIkBtDQXcKWmdvJi', '_score': 0.82284683, '_source': {'productCode': 35136.0, 'name': 'FINLANDIA GRAPEFRUIT'}}
{'_index': 'liquoridx', '_id': '4L4-IIkBtDQXcKWmdPGS', '_score': 0.7733009, '_source': {'productCode': 34326.0, 'name': 'FINLANDIA IMP VODKA 80PRF'}}
{'_index': 'liquoridx', '_id': 'y78_IIkBtDQXcKWmExTx', '_score': 0.63860065, '_source': {'productCode': 977976.0, 'name': 'TATTERSALL AQUAVIT'}}
{'_index': 'liquoridx', '_id': '774-IIkBtDQXcKWmmfnM', '_score': 0.6330812, '_source': {'productCode': 64343.0, 'name': 'BADEL LAVOV'}}
{'_index': 'liquoridx', '_id': '4r8-IIkBtDQXcKWmzARe', '_score': 0.61752665, '_source': {'productCode': 901297.0, 'name': 'GLENROTHES 1995'}}
{'_index': 'liquoridx', '_id': 'GL8_IIkBtDQXcKWmGRar', '_score': 0.61632067, '_source': {'productCode': 987089.0, 'name': 'CEDAR RIDGE BARREL PR