In [83]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 500)

types = {
    "Invoice/Item Number": object,
    "Date": object,
    "Store Number": int,
    "Store Name": object,
    "Address": object,
    "City": object,
    "Zip Code": object,
    "Store Location": object,
    "County Number": object,
    "County": object,
    "Category": float,
    "Category Name": object,
    "Vendor Number": float,
    "Vendor Name": object,
    "Item Number": object,
    "Item Description": object,
    "Pack": int,
    "Bottle Volume (ml)": int,
    "State Bottle Cost": float,
    "State Bottle Retail": float,
    "Bottles Sold": int,
    "Sale (Dollars)": float,
    "Volume Sold (Liters)": float,
    "Volume Sold (Gallons)": float,
}

df = pd.read_csv("Iowa_Liquor_Sales.csv", dtype=types, parse_dates=["Date"])
df["Item Number"] = pd.to_numeric(
    df["Item Number"], errors="coerce", downcast="integer"
)
df = df.dropna(subset=["Item Number"])

df_itemNames = df[
    ["Item Number", "Item Description", "Pack", "Bottle Volume (ml)", "Category Name"]
].drop_duplicates(keep="first", subset=["Item Number"]).fillna("-")


df_itemNames["Item Number"] = pd.to_numeric(
    df_itemNames["Item Number"], errors="coerce", downcast="integer"
)

df_storeNames = df[["Store Number","Store Name", "Store Location", "City"]].drop_duplicates(keep="first", subset=["Store Number"])
df_storeNames["Store Number"] = pd.to_numeric(
    df_storeNames["Store Number"], errors="coerce", downcast="integer"
)



In [70]:
import warnings
warnings.filterwarnings('ignore')

In [94]:

itemByStorePivot = pd.pivot_table(df, values='Volume Sold (Liters)', index='Item Number', columns='Store Number', aggfunc='sum')

def busiestStore(x):
    
    try:
        return itemByStorePivot[itemByStorePivot.index == x.iloc[0]].idxmax(axis=1).iloc[0].astype(int)
        
    except Exception as e:
        print(x)
        print(e)
        return 0


df['busiestStore'] = df['Item Number']
itemStats = pd.pivot_table(
    df,
    index=['Item Number'],
    aggfunc={'Store Number': lambda x: len(x.unique()),
        'Volume Sold (Liters)':np.sum,
        'State Bottle Retail':np.average,
        'busiestStore': busiestStore,
        }
    ).reset_index()

itemStats.columns = ['item_number', 'price_avg_usd', 'store_count', 'sold_liters', 'busiest_store_id']


itemStats = pd.merge(itemStats, df_storeNames, left_on='busiest_store_id', right_on='Store Number', how='left')\
    .drop(columns=['Store Number', 'Store Location'])\
        .rename(columns={'Store Name':'store_name', 'City':'store_city'})
        
itemStats['store_name'] = itemStats['store_name'].str.split(' / ').str[0]


In [96]:
from DataLink import DataLink


link = DataLink("username123", "password123", "db")


# df.set_index('Item Number').to_sql("liquor", link.engine, if_exists="replace", index=True, index_label=['Item Number'], chunksize=1_000_000)
itemStats.set_index('item_number').to_sql("liquor_store_stats", link.engine, if_exists="replace", index=True, index_label=['item_number'], chunksize=1_000_000)
# link.executeRaw('ALTER TABLE liquor ADD PRIMARY KEY ("Invoice/Item Number")')


598

In [16]:
pivotDf = (
    pd.pivot_table(
        df,
        index="Store Number",
        columns="Item Number",
        values="Bottles Sold",
        aggfunc=lambda x: x.sum(),
    )
    .reset_index()
    .fillna(0)
    .drop("Store Number", axis=1)
)
print(pivotDf.shape)

(2952, 12598)


In [45]:
from numpy.linalg import svd

matrix = pivotDf.values
u, s, vh = svd(matrix, full_matrices=False)
k = 1024
U = u[:, :k]
VT = vh[:, :k]

In [48]:
print(pd.DataFrame(VT).head())
print(pd.DataFrame(VT).shape)
print(VT[0].shape)


           0             1             2             3             4         5         6         7             8             9     ...      1014      1015      1016      1017      1018      1019      1020      1021      1022      1023
0 -8.968078e-07 -3.372753e-06 -5.527936e-06 -1.275140e-06 -9.218630e-07 -0.000004 -0.000039 -0.000006 -7.767270e-07 -1.214606e-06  ... -0.000003 -0.000148 -0.011214 -0.011137 -0.000818 -0.000743 -0.000091 -0.002868 -0.000005 -0.000017
1  6.439515e-06  1.434550e-05  1.426490e-05  2.948490e-06  5.467430e-06  0.000046  0.000218  0.000002  5.197317e-06  8.000281e-06  ...  0.000017  0.000678  0.012070 -0.003141  0.003049  0.000758  0.000218  0.008297  0.000010  0.000015
2 -4.227308e-06 -8.268627e-06 -6.766992e-06 -2.230809e-06 -5.005382e-06 -0.000021 -0.000259  0.000004 -3.067320e-06 -3.023768e-06  ... -0.000016 -0.000287 -0.023393  0.009255 -0.016606 -0.000127 -0.000105 -0.004608 -0.000008 -0.000069
3 -6.828956e-07  2.311354e-07 -9.773650e-06 -2.553841e-06 -5

In [49]:
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
print(client.info()['tagline'])

You Know, for Search


In [None]:
# 'Item Number','Item Description','Pack','Bottle Volume (ml)','Category Name'

request_body = {
    "mappings": {
        "properties": {
            "productCode": {"type": "integer"},
            "embed": {
                "type": "dense_vector",
                "dims": 1024,
                "index": True,
                "similarity": "cosine",
            },
            "name": {"type": "text"},
            "pack": {"type": "integer"},
            "vol_ml": {"type": "integer"},
            "category": {"type": "text"},
        }
    }
}
print("creating index...")
client.indices.create(index="liquoridx", body=request_body)

In [51]:
for embed, prodCode in zip(VT, pivotDf.columns.to_list()):
    sliceOfMeta = df_names[df_names["Item Number"] == prodCode]
    
    # print(sliceOfMeta)
    
    
    doc = {
        "productCode": prodCode,
        "embed": embed,
        "name": sliceOfMeta["Item Description"].iloc[0],
        "pack": sliceOfMeta["Pack"].iloc[0],
        "vol_ml": sliceOfMeta["Bottle Volume (ml)"].iloc[0],
        "category": sliceOfMeta["Category Name"].iloc[0],
    }

    try:
        client.index(index="liquoridx", document=doc)
    except Exception as e:
        print(prodCode, e)

In [52]:
client.indices.refresh(index="liquoridx")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [25]:
# Fuzzy search

body = {"match": {"name": {"query": "blueberry", "fuzziness": "AUTO"}}}


find = client.search(
    index="liquoridx", query=body, source=["productCode", "name", "embed"]
)
print(find)

for hit in find.body["hits"]["hits"]:
    print(hit)

{'took': 82, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 50, 'relation': 'eq'}, 'max_score': 7.012623, 'hits': [{'_index': 'liquoridx', '_id': 'nDSgNIkBiSuT_TsXu5Qe', '_score': 7.012623, '_source': {'productCode': 35222.0, 'name': 'PINNACLE BLUEBERRY', 'embed': [-6.007130881505017e-05, 8.718522965993515e-05, 1.5412704989942977e-05, 4.393876112775978e-05, 5.548790599495699e-05, 0.00017253584036600544, -6.799413189969886e-05, 9.651742310353228e-05, 0.0002477206639189025, 0.00024039873803653603, -0.00023880153024299713, 7.634321733114543e-05, -0.0002435157519926466, -0.00013788961880909523, -6.357483096562918e-05, -1.2195977130114547e-05, -0.00024173674675377946, -0.00037784668111098775, 0.00043949574147313564, 0.0001507316018661626, 0.00015870928099699172, 6.875016991672045e-05, 0.00010579772020723696, 0.00028555014156930866, -0.00021217463642070816, -0.00017828963177561128, -0.00011014662583577074, 0.00010462603110

In [26]:
# Vector search


# first find the id
searchWord = "Finlandia Rasberry"

body = {"match": {"name": {"query": searchWord, "fuzziness": "AUTO"}}}
find = client.search(index="liquoridx", query=body, source=["productCode", "name"])

topMatchCode = int(find.body["hits"]["hits"][0]["_source"]["productCode"])

# retrieve the embedding for that id

body = {"match": {"productCode": {"query": topMatchCode}}}
find = client.search(
    index="liquoridx", query=body, source=["productCode", "name", "embed"])

embeddingArr = find.body["hits"]["hits"][0]["_source"]["embed"]

# Similarity search


body = {
    "field": "embed",
    "query_vector": embeddingArr,
    "k": 10,
    "num_candidates": 30
}

find = client.search(
    index="liquoridx",
    source=["productCode", "name"],
    knn= body,
)


for hit in find.body["hits"]["hits"]:
    print(hit)

{'_index': 'liquoridx', '_id': 'MDSgNIkBiSuT_TsXpJTQ', '_score': 1.0, '_source': {'productCode': 34318.0, 'name': 'FINLANDIA REDBERRY'}}
{'_index': 'liquoridx', '_id': 'lTSgNIkBiSuT_TsXuZSi', '_score': 0.7601434, '_source': {'productCode': 35136.0, 'name': 'FINLANDIA GRAPEFRUIT'}}
{'_index': 'liquoridx', '_id': 'nTShNIkBiSuT_TsXXpcq', '_score': 0.7337059, '_source': {'productCode': 40638.0, 'name': "BURNETT'S SUGAR COOKIE"}}
{'_index': 'liquoridx', '_id': '3DShNIkBiSuT_TsXa5do', '_score': 0.7337059, '_source': {'productCode': 41219.0, 'name': "BURNETT'S PEAR VODKA"}}
{'_index': 'liquoridx', '_id': '3zShNIkBiSuT_TsXa5f-', '_score': 0.7337059, '_source': {'productCode': 41222.0, 'name': "BURNETT'S POMEGRANATE"}}
{'_index': 'liquoridx', '_id': '4DShNIkBiSuT_TsXbJc6', '_score': 0.7337059, '_source': {'productCode': 41225.0, 'name': "BURNETT'S STRAWBERRY"}}
{'_index': 'liquoridx', '_id': '4TShNIkBiSuT_TsXbJds', '_score': 0.7337059, '_source': {'productCode': 41270.0, 'name': "BURNETT'S BLUE