In [45]:
! py -m pip install implicit  

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl (748 kB)
   ---------------------------------------- 748.6/748.6 kB 6.2 MB/s eta 0:00:00
Installing collected packages: implicit
Successfully installed implicit-0.7.2



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
# import pandas as pd, numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import KMeans
# import joblib

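# # ----------------------------------------
# # 1.  Build (or refresh) clusters offline
# #     NOTE: this whole cell is commented out on purpose. Un-comment and
# #     run it once to (re)generate the artefacts it writes to disk:
# #     segment_map.csv, segment_kmeans.joblib, segment_scaler.joblib
# #     and segment_takeup.parquet.
# # ----------------------------------------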
# fs   = pd.read_csv("feature_store_enhanced.csv")
# num_cols = [
#     "Age", "Annual_Income", "Total_Spend", "Num_Transactions",
#     "Spend_Grocery", "Spend_Travel", "Spend_Fuel",
#     "Spend_Dining", "Spend_Education", "Salary_to_Spend_Ratio"
# ]
# X = fs[num_cols].fillna(0)

# scaler = StandardScaler()
# X_std  = scaler.fit_transform(X)

# k = 8                                      # 6–10 clusters works well
# km = KMeans(n_clusters=k, random_state=42)
# cluster_labels = km.fit_predict(X_std)

# save artefacts
# fs["segment"] = cluster_labels
# fs[["Customer_ID","segment"]].to_csv("segment_map.csv", index=False)
# joblib.dump(km,     "segment_kmeans.joblib")
# joblib.dump(scaler, "segment_scaler.joblib")


# cust_seg = pd.read_csv("segment_map.csv").set_index("Customer_ID")["segment"]
# cust_prod = pd.read_csv("customer_products.csv",
#                         parse_dates=["Acquisition_Date"])

# takeup = (
#     cust_prod
#     #   .query("Acquisition_Date.between('2024-04-01','2024-06-30')")
#       .merge(cust_seg, on="Customer_ID")
#       .groupby(["segment","Product_ID"]).size()
#       .div(cust_seg.value_counts())        # denominator per cluster
#       .unstack(fill_value=0)               # rows = segment, cols = product
# )

# takeup.to_parquet("segment_takeup.parquet")

In [28]:
import pandas as pd, joblib

# Feature columns fed to the scaler / k-means model.
# Must stay identical (names and order) to the list used during training.
SEG_FEATS = [
    "Age",
    "Annual_Income",
    "Total_Spend",
    "Num_Transactions",
    "Spend_Grocery",
    "Spend_Travel",
    "Spend_Fuel",
    "Spend_Dining",
    "Spend_Education",
    "Salary_to_Spend_Ratio",
]

# Deserialize the fitted artefacts a single time, at start-up.
# joblib files are pickle-based: only load artefacts you produced yourself.
scaler = joblib.load("segment_scaler.joblib")
kmeans = joblib.load("segment_kmeans.joblib")


### Assign a new customer to one of the clusters

In [None]:
from langchain.agents import tool
import json

@tool
def assign_segment(input: str) -> str:
    """
    Input  ► JSON {"customer_id": "CUST9999"}
    Output ► JSON {"customer_id": "CUST9999", "segment": 3}

    • Pulls the customer's feature row
    • Applies saved scaler → k-means.predict
    • Returns the cluster label (0-7)
    • "segment" is null when the customer is unknown; an extra "error"
      field is added when the customer_id is missing or malformed
    """
    # Agents do not always send the documented JSON object; fall back to
    # treating the raw input as the customer id instead of raising.
    try:
        payload = json.loads(input)
    except (TypeError, ValueError):
        payload = input

    cid = payload.get("customer_id") if isinstance(payload, dict) else payload
    if isinstance(cid, str):
        cid = cid.strip()

    # The id is interpolated into the SQL text below, and it comes from
    # agent/LLM-supplied input, so only plain ASCII letters, digits, "_" and
    # "-" are accepted. A driver-level bound parameter would be preferable,
    # but the placeholder syntax depends on the DB driver behind `conn`,
    # which is not visible here.
    cid_sql = "" if cid is None else str(cid)
    id_is_safe = (
        bool(cid_sql)
        and cid_sql.isascii()
        and all(ch.isalnum() or ch in "_-" for ch in cid_sql)
    )
    if not id_is_safe:
        return json.dumps({
            "customer_id": cid,
            "segment": None,
            "error": "missing or malformed customer_id",
        })

    # 1. fetch row from feature_store
    row = pd.read_sql(
        f"SELECT {', '.join(SEG_FEATS)} FROM feature_store "
        f"WHERE Customer_ID='{cid_sql}'", conn
    )

    if row.empty:
        return json.dumps({"customer_id": cid, "segment": None})

    # 2. scale & predict (missing features are imputed with 0; if several
    #    rows match, the first one is used)
    X_std   = scaler.transform(row[SEG_FEATS].fillna(0))
    seg_id  = int(kmeans.predict(X_std)[0])

    # NOTE: the previous version re-read segment_map.csv on every call and
    # updated a local Series that was discarded on return, so nothing was ever
    # recorded (and a missing CSV made a successful prediction fail). That dead
    # code is removed; persist the assignment explicitly if later lookups
    # need it.

    return json.dumps({"customer_id": cid, "segment": seg_id})


## Batch updates: recompute segment take-up rates

In [None]:
# Load the inputs explicitly so this cell works after Restart & Run All
# instead of depending on variables left behind by an earlier session.
customer_products = pd.read_csv("customer_products.csv")
seg_map = pd.read_csv("segment_map.csv").set_index("Customer_ID")["segment"]

# Take-up rate = share of customers in a segment that own a product.
takeup = (
    customer_products
      # count a customer at most once per product, otherwise repeated
      # acquisitions would push the rate above 1
      .drop_duplicates(["Customer_ID", "Product_ID"])
      .merge(seg_map, on="Customer_ID")
      .groupby(["segment", "Product_ID"]).size()
      # broadcast the per-segment customer count across the "segment" level
      # explicitly rather than relying on implicit index-name alignment
      .div(seg_map.value_counts(), level="segment")
      .unstack(fill_value=0)               # rows = segment, cols = product
)
takeup.to_parquet("segment_takeup.parquet")


### Association Rules

In [43]:
# -------------------------------------------------------------
# 0)  Install mlxtend if you haven't yet
#      pip install mlxtend
# -------------------------------------------------------------
import json
import math

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# ------------------------------------------------------------------
# 1)  Load customer–product ownership
#     Expected columns: Customer_ID, Product_ID, Acquisition_Date, …
# ------------------------------------------------------------------
cust_prod = pd.read_csv("customer_products.csv")   # or SELECT from DB

# ------------------------------------------------------------------
# 2)  Customer × product ownership matrix, one row per customer.
#     aggfunc="max" collapses repeated (customer, product) rows to a
#     single flag; the final cast yields True / False values.
# ------------------------------------------------------------------
basket = pd.pivot_table(
    cust_prod.assign(flag=1),
    index="Customer_ID",
    columns="Product_ID",
    values="flag",
    aggfunc="max",
    fill_value=0,
).astype(bool)

# ------------------------------------------------------------------
# 3)  Frequent itemsets (support of at least 3 % of customers)
# ------------------------------------------------------------------
itemsets = apriori(basket, min_support=0.03, use_colnames=True)

# ------------------------------------------------------------------
# 4)  Association rules, filtered on lift (keep only lift ≥ 1.5)
# ------------------------------------------------------------------
rules = association_rules(itemsets, metric="lift", min_threshold=1.5)

# ------------------------------------------------------------------
# 5)  Convert frozensets → list, keep key metrics, and dump to JSON
# ------------------------------------------------------------------
keep_cols = [
    "antecedent support", "consequent support", "support",
    "confidence", "lift", "leverage", "conviction"
]


def _json_number(value):
    """Return `value` as a float, or None when it is NaN / ±inf.

    Conviction is infinite for rules with confidence 1, and json.dump would
    write that as the bare token `Infinity`, which is not valid JSON.
    """
    number = float(value)
    return number if math.isfinite(number) else None


rules_out = [
    {
        # frozenset iteration order is not stable between runs; sort so the
        # exported file is deterministic
        "antecedents": sorted(rule["antecedents"], key=str),
        "consequents": sorted(rule["consequents"], key=str),
        **{col: _json_number(rule[col]) for col in keep_cols},
    }
    for rule in rules.to_dict("records")
]

with open("synergy_rules.json", "w", encoding="utf-8") as f:
    # allow_nan=False makes any non-finite value that slipped through fail
    # loudly instead of silently producing an unparseable file
    json.dump(rules_out, f, indent=2, allow_nan=False)

print(f"✅ Wrote {len(rules_out)} rules to synergy_rules.json")


✅ Wrote 20 rules to synergy_rules.json


## ALS algorithm

In [None]:
# pip install implicit        (GPU optional)
import pandas as pd, numpy as np, scipy.sparse as sp, joblib
from implicit.als import AlternatingLeastSquares

# 1) Assemble the sparse customer × product interaction matrix
cust_prod = pd.read_csv("customer_products.csv")

# Map every distinct ID to a dense integer position (order of first appearance)
unique_customers = cust_prod["Customer_ID"].unique()
unique_products = cust_prod["Product_ID"].unique()
row_idx = dict(zip(unique_customers, range(len(unique_customers))))
col_idx = dict(zip(unique_products, range(len(unique_products))))

rows = cust_prod["Customer_ID"].map(row_idx)
cols = cust_prod["Product_ID"].map(col_idx)
data = np.ones(len(cust_prod))           # each ownership row contributes 1

n_users, n_items = len(row_idx), len(col_idx)
matrix = sp.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# 2) Fit implicit-feedback ALS on the user × item matrix
als_params = dict(
    factors=8,
    regularization=0.1,
    iterations=30,
    random_state=42,
)
als = AlternatingLeastSquares(**als_params)
als.fit(matrix)

# 3) Persist the model together with both ID → position lookups
for artefact, path in (
    (als, "als_model.joblib"),
    (row_idx, "user_index.joblib"),
    (col_idx, "item_index.joblib"),
):
    joblib.dump(artefact, path)
print("✅ ALS model & indices saved.")


  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

✅ ALS model & indices saved.
