In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

In [10]:
# --- Load dataset ---
df = pd.read_csv("Augmented_Stockist_Data.csv")

# --- Compute Engagement Score ---
# Engagement = log(1 + last-period sales) * (feedback score + growth percentage).
# The two factors are named separately so each can be inspected on its own.
log_sales = np.log1p(df["Sales_Value_Last_Period"])
feedback_plus_growth = df["Feedback_Score"] + df["Growth_Percentage"]
df["Engagement_Score"] = log_sales * feedback_plus_growth

In [11]:
# --- Train-test split ---
# 80/20 split with a fixed seed so the partition is reproducible; both halves
# are written to disk (Test_Data.csv is read back by the evaluation cells).
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
for split_frame, split_path in ((train_df, "Train_Data.csv"), (test_df, "Test_Data.csv")):
    split_frame.to_csv(split_path, index=False)

# --- Partner × Scheme matrix ---
# One row per Partner_id, one column per Scheme_Type; each cell is the mean
# Engagement_Score of the training rows for that pair, or 0 where there are none.
item_scheme_matrix = pd.pivot_table(
    train_df,
    values="Engagement_Score",
    index="Partner_id",
    columns="Scheme_Type",
    aggfunc="mean",
    fill_value=0,
)

In [12]:
# --- Compute dynamic threshold (mean of all strictly positive scores) ---
# Cells that are not > 0 become NaN under .where() and are dropped from the mean.
# NOTE(review): the filter is "> 0", so negative scores are excluded as well as
# zeros — confirm this is intended rather than "!= 0".
non_zero_scores = item_scheme_matrix.where(item_scheme_matrix > 0).stack()
threshold = non_zero_scores.mean()

# --- Binarize using dynamic threshold ---
# 1 where the partner's mean score for the scheme reaches the threshold, else 0.
binary_scheme_matrix = item_scheme_matrix.ge(threshold).astype(int)

# --- Transpose to get Scheme × Partner matrix ---
scheme_matrix = binary_scheme_matrix.T

# --- Jaccard similarity between schemes ---
# Each scheme is represented by its 0/1 vector over partners. The diagonal is
# fixed at 1.0; every off-diagonal cell is the Jaccard score of the two vectors.
scheme_vectors = scheme_matrix.to_numpy()
n_schemes = len(scheme_matrix)
similarity_matrix = pd.DataFrame(
    [
        [
            1.0 if row_pos == col_pos
            else jaccard_score(scheme_vectors[row_pos], scheme_vectors[col_pos])
            for col_pos in range(n_schemes)
        ]
        for row_pos in range(n_schemes)
    ],
    index=scheme_matrix.index,
    columns=scheme_matrix.index,
    dtype=float,
)


In [13]:
# --- Generate top-3 scheme recommendations per test pair ---
# Output: `recommendations`, a list with one dict per distinct
# (Partner_id, Product_id, Scheme_Type) row of the test set whose scheme is
# present in `similarity_matrix`. Each dict carries the partner, the product,
# the mean similarity of the recommended schemes, and up to three scheme names
# (padded with "No Scheme").
recommendations = []
test_pairs = test_df[["Partner_id", "Product_id", "Scheme_Type"]].drop_duplicates()

# The ranking depends only on the scheme, never on the partner or product, so
# it is computed once per distinct scheme instead of once per test row.
scheme_recommendations = {}
for current_scheme in test_pairs["Scheme_Type"].unique():
    if current_scheme not in similarity_matrix.index:
        # Reported once per unseen scheme (not once per row) to keep output short.
        print(f"Scheme '{current_scheme}' not found in training data.")
        continue

    # Most similar schemes first, excluding the scheme itself.
    similar_schemes = (
        similarity_matrix.loc[current_scheme]
        .drop(current_scheme)
        .sort_values(ascending=False)
        .head(3)
    )
    sim_list = similar_schemes.index.tolist()

    scheme_recommendations[current_scheme] = {
        "Similarity_Score": round(similar_schemes.mean(), 6),
        "Scheme_1": sim_list[0] if len(sim_list) > 0 else "No Scheme",
        "Scheme_2": sim_list[1] if len(sim_list) > 1 else "No Scheme",
        "Scheme_3": sim_list[2] if len(sim_list) > 2 else "No Scheme",
    }

for _, row in test_pairs.iterrows():
    scheme_fields = scheme_recommendations.get(row["Scheme_Type"])
    if scheme_fields is None:
        # Scheme absent from the training matrix: no recommendation is emitted.
        continue

    recommendations.append({
        "Partner_id": row["Partner_id"],
        "Product_id": row["Product_id"],
        **scheme_fields,
    })


In [14]:
# --- Save recommendations ---
# One row per recommendation dict, written without the index column.
recommendation_df = pd.DataFrame(data=recommendations)
recommendation_df.to_csv(path_or_buf="Scheme_Recommendations.csv", index=False)

# --- Preview output ---
print(recommendation_df.head(n=5))


  Partner_id                   Product_id  Similarity_Score         Scheme_1  \
0      P1067             Modular Switches          0.327193   Seasonal Offer   
1      P1003                          AIS          0.332821   Seasonal Offer   
2      P1003                          ACB          0.032517  Volume Discount   
3      P1003                          VCU          0.342034         Cashback   
4      P1063  Pump Starter and Controller          0.342034         Cashback   

          Scheme_2         Scheme_3  
0  Volume Discount  Loyalty Program  
1         Cashback  Loyalty Program  
2   Seasonal Offer  Loyalty Program  
3  Volume Discount  Loyalty Program  
4  Volume Discount  Loyalty Program  


Evaluation Code

In [15]:
# Import required library
import pandas as pd

In [16]:
# Read both evaluation inputs back from disk:
#   test_df - the held-out rows saved as Test_Data.csv
#   rec_df  - the recommendations saved as Scheme_Recommendations.csv
# Note that this rebinds `test_df` to the copy loaded from the CSV file.
test_df, rec_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Test_Data.csv", "Scheme_Recommendations.csv")
)


In [17]:
# Collect, for each Partner_id, every Scheme_Type value found in the test rows.
# Duplicates are kept in the list; the result has one row per partner with the
# columns Partner_id and Availed_Schemes.
availed_df = (
    test_df.groupby("Partner_id")["Scheme_Type"]
    .agg(list)
    .rename("Availed_Schemes")
    .reset_index()
)

In [18]:
# Pack the three ranked scheme columns into one list per row, preserving the
# Scheme_1 -> Scheme_2 -> Scheme_3 order so that slicing [:k] yields the top k.
scheme_columns = ["Scheme_1", "Scheme_2", "Scheme_3"]
rec_df["Recommended_Schemes"] = rec_df[scheme_columns].to_numpy().tolist()


In [19]:
def _as_list(value):
    """Return ``value`` unchanged when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


# Attach recommendations to each partner's availed schemes. This is a left join
# on Partner_id, so partners without any recommendation row are kept (with a
# missing Recommended_Schemes value), and a partner with several recommendation
# rows appears once per such row.
df_all = availed_df.merge(
    rec_df[["Partner_id", "Recommended_Schemes"]],
    how="left",
    on="Partner_id",
)

# Normalise both columns so every cell is a list; anything else (e.g. the
# missing value left by an unmatched join) becomes an empty list.
for list_column in ("Availed_Schemes", "Recommended_Schemes"):
    df_all[list_column] = df_all[list_column].apply(_as_list)


In [20]:
# Initialize variables
k_list = [1, 2, 3]
results = []

# Build each row's (availed set, ranked recommendations) pair once; it does not
# depend on k. Rows with no availed schemes are skipped because recall would be
# undefined for them.
eval_pairs = []
for availed, recommended in zip(df_all["Availed_Schemes"], df_all["Recommended_Schemes"]):
    actual_set = set(availed)
    if actual_set:
        eval_pairs.append((actual_set, recommended))

# Evaluate precision, recall, F1 for each Top-K level
for k in k_list:
    precision_list = []
    recall_list = []

    for actual_set, recommended in eval_pairs:
        # Hits among the first k recommendations.
        tp = sum(scheme in actual_set for scheme in recommended[:k])
        # Precision always divides by k, so rows with fewer than k
        # recommendations are penalised.
        precision_list.append(tp / k)
        recall_list.append(tp / len(actual_set))

    # Average the metrics across all evaluated rows (0 when there are none).
    mean_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    mean_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    # F1 is the harmonic mean of the averaged precision and recall (not the mean
    # of per-row F1). It uses the unrounded means so that rounding is applied
    # once, at reporting time, instead of compounding.
    pr_total = mean_precision + mean_recall
    f1 = 2 * mean_precision * mean_recall / pr_total if pr_total else 0

    results.append({
        "Top-K": k,
        "Avg Precision": round(mean_precision, 4),
        "Avg Recall": round(mean_recall, 4),
        "Avg F1 Score": round(f1, 4)
    })



In [21]:
# Report the metrics stored in `results`, one indented group per Top-K level.
# Labels are left-justified to 14 characters so the colons line up.
print("==== Per-Scheme Evaluation (WITH Availed Schemes) ====")
for entry in results:
    print(f"\nTop-{entry['Top-K']}")
    for metric_name in ("Avg Precision", "Avg Recall", "Avg F1 Score"):
        print(f"  {metric_name:<14}: {entry[metric_name]}")


==== Per-Scheme Evaluation (WITH Availed Schemes) ====

Top-1
  Avg Precision : 0.9782
  Avg Recall    : 0.2407
  Avg F1 Score  : 0.3863

Top-2
  Avg Precision : 0.9823
  Avg Recall    : 0.484
  Avg F1 Score  : 0.6485

Top-3
  Avg Precision : 0.9875
  Avg Recall    : 0.7312
  Avg F1 Score  : 0.8402
