In [436]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz, process
from rank_bm25 import BM25Okapi

In [7]:
excel_file = '../data-dockyard/250820_Indirect spend analysis_v11.xlsx'
xls = pd.ExcelFile(excel_file)
xls.sheet_names

['Index',
 'Savings potential',
 'Category wise spend',
 'AVD_3Y',
 'Single sourcing_1Y',
 'Negotiation potential',
 'Vendors and SW for negotiation',
 'Rate contract check',
 'Tail spend_by SKU',
 'Tail spend_by vendor',
 'Services contract pooling',
 'Global marketing',
 'Marketing_Alternate vendor',
 '1Y data_Suresh',
 'Sheet3',
 'Sheet2',
 'Marketing spend split',
 '3Y data_Raneeth',
 'FY22-23',
 'FY23-24',
 'FY24-25']

In [11]:
df_tailsku = pd.read_excel(excel_file, sheet_name='Tail spend_by SKU', skiprows=11564)

In [15]:
df_tailsku = df_tailsku.drop(columns = ['Unnamed: 0','Unnamed: 5'])

In [47]:
# limit analysis to skus which cover 80% of the spends
df_tailsku['Spend % cum'] = df_tailsku['Spend %'].cumsum()
cutoff = 0.8 
mvp_skus_cutoff = df_tailsku[df_tailsku['Spend % cum'] >= cutoff].index[0]
df_tailsku_mvp = df_tailsku.iloc[:mvp_skus_cutoff, :]

In [51]:
print(f"{round(100*df_tailsku_mvp.shape[0]/df_tailsku.shape[0])}% ({df_tailsku_mvp.shape[0]}) of skus make up {int(100*cutoff)}% of spends!")

5% (573) of skus make up 80% of spends!


In [73]:
# Build universe of searchable SKUs 
SKU_UNIVERSE_LIST = df_tailsku_mvp['Short Text'].values

In [75]:
SKU_UNIVERSE_LIST[:4]

array(['Supply Chain Related Cost', 'Fulphila Copay',
       'BIO-PERTUZ-301- ChO2', 'Distribution & SCM Related'], dtype=object)

----

In [466]:
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]
bm25 = BM25Okapi(tokenized_corpus)

In [468]:
query = "windy London"
tokenized_query = query.split(" ")
bm25.get_top_n(tokenized_query, corpus, n=1)

['It is quite windy in London']

In [476]:
def search_skus(user_query, n=3, search_algo='random', sku_universe=SKU_UNIVERSE_LIST):
    print(f'Searching for {user_query} ..\n')
    search_results = []
    df_sku_universe = pd.DataFrame(sku_universe, columns=['sku_short_text'])
    if search_algo == 'random':    
        search_results = np.random.choice(sku_universe, size=n, replace=False)
    elif search_algo == 'fuzzy':
        search_results = process.extract(user_query, sku_universe, scorer=fuzz.token_sort_ratio, limit=n)
    elif search_algo == 'bm25':
        tokenized_sku_universe = [doc.split(" ") for doc in sku_universe]
        bm25 = BM25Okapi(tokenized_sku_universe)
        tokenized_user_query = user_query.split(" ")
        search_results = bm25.get_top_n(tokenized_user_query, sku_universe, n=n)
    print(f'Finished Searching!')
    for e, r in enumerate(search_results):
        print(e+1,r)
    return search_results

In [478]:
search_results = search_skus('TriSoyAgar Plates', search_algo='fuzzy', n=3)
search_results = search_skus('bio-pertuz-301', search_algo='fuzzy', n=3)

Searching for TriSoyAgar Plates ..

Finished Searching!
1 ('professional fees', 41.17647058823529, 11)
2 ('TSA 3P IRR Neutralizers', 40.0, 35)
3 ('Pegasus Project Charges_ JSA', 40.0, 474)
Searching for bio-pertuz-301 ..

Finished Searching!
1 ('GP-Expenses-423,488.340', 32.432432432432435, 97)
2 ('Tablet', 30.000000000000004, 116)
3 ('BIO-PERTUZ-301- ChO2', 29.411764705882348, 2)


In [486]:
search_results = search_skus('TriSoyAgar Plates', search_algo='bm25', n=3)
search_results = search_skus('bio-pertuz-301', search_algo='bm25', n=3)

Searching for TriSoyAgar Plates ..

Finished Searching!
1 Legal Services Rendered Through January
2 GERS Studies-Pharmacy Sell In/Sell out D
3 Bmab1800 RLD proc at Syngene
Searching for bio-pertuz-301 ..

Finished Searching!
1 Legal Services Rendered Through January
2 GERS Studies-Pharmacy Sell In/Sell out D
3 Bmab1800 RLD proc at Syngene
