In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split


In [23]:
# Load the raw stockist dataset
df = pd.read_csv("Augmented_Stockist_Data.csv")

# Engagement Score = log(1 + Sales Value) * (Feedback Score + Growth %).
# np.log1p is used (not a plain log), so a sales value of 0 maps to 0.
df["Engagement_Score"] = np.log1p(df["Sales_Value_Last_Period"]) * (
    df["Feedback_Score"] + df["Growth_Percentage"]
)

# NOTE: Train_Data.csv / Test_Data.csv are deliberately not written in this cell.
# train_df and test_df only exist once train_test_split has run, so saving them
# here raises NameError under Restart Kernel -> Run All. The two files are
# written immediately after the train-test split later in this notebook.

# Display the first few rows
df.head()


Unnamed: 0,Partner_id,Product_id,Geography,Stockist_Type,Scheme_Type,Sales_Value_Last_Period,Sales_Quantity_Last_Period,MRP,Growth_Percentage,Discount_Applied,Bulk_Purchase_Tendency,New_Stockist,Feedback_Score,Engagement_Score
0,P1002,Solar Solutions,East,Wholesaler,Seasonal Offer,195613.18,384,22031.39,47.55,11.16,Low,False,2.36,608.098425
1,P1002,ACB,North,Retailer,Cashback,400519.6,355,10259.87,48.47,1.13,High,False,1.87,649.412201
2,P1001,BBT,East,Retailer,Cashback,229937.17,194,18661.56,12.58,15.99,Medium,True,2.91,191.232813
3,P1002,Modular Switches,West,Wholesaler,Cashback,10760.62,209,1649.93,0.04,13.37,High,False,1.47,14.018449
4,P1001,Pump Starter and Controller,West,Distributor,Cashback,139621.02,56,33020.52,10.19,19.14,Medium,False,1.5,138.487855


In [24]:
# Hold out 20% of the rows as the test partition; the fixed random_state
# makes the shuffle (and therefore the split) repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Report the (rows, columns) of each partition
train_shape, test_shape = train_df.shape, test_df.shape
print("Train shape:", train_shape)
print("Test shape:", test_shape)


Train shape: (16000, 14)
Test shape: (4000, 14)


In [25]:
# Partner x Scheme matrix built from the training rows: each cell is the
# partner's average Engagement Score for that scheme, and combinations with no
# rows are filled with 0. (aggfunc="sum" would give total engagement instead.)
item_scheme_matrix = pd.pivot_table(
    train_df,
    values="Engagement_Score",
    index="Partner_id",
    columns="Scheme_Type",
    aggfunc="mean",
    fill_value=0,
)

# Preview the partner-scheme engagement matrix
item_scheme_matrix.head()


Scheme_Type,Cashback,Loyalty Points,Loyalty Program,Seasonal Offer,Volume Discount
Partner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1000,192.125817,0.0,266.388976,287.005946,183.201968
P1001,281.489728,285.548502,300.320644,284.172889,288.015261
P1002,280.758424,284.525329,240.035187,285.776877,282.624333
P1003,293.677473,267.175928,280.426208,272.237168,293.070777
P1004,269.783625,0.0,217.524748,281.90649,299.326598


In [26]:
# For every (Partner, Product) pair in the training data, collect the schemes
# found on its rows into one list (repeated schemes are kept as-is).
pair_groups = train_df.groupby(["Partner_id", "Product_id"])
schemes_per_pair = pair_groups["Scheme_Type"].apply(list)
partner_product_schemes = schemes_per_pair.reset_index()

# "Entity" is a single string key for the pair: "<Partner_id>_<Product_id>"
entity_ids = partner_product_schemes["Partner_id"] + "_" + partner_product_schemes["Product_id"]
partner_product_schemes["Entity"] = entity_ids

# Preview the grouped data
partner_product_schemes.head()


Unnamed: 0,Partner_id,Product_id,Scheme_Type,Entity
0,P1000,ACB,"[Volume Discount, Volume Discount, Loyalty Pro...",P1000_ACB
1,P1000,AIS,"[Seasonal Offer, Seasonal Offer, Seasonal Offe...",P1000_AIS
2,P1000,BBT,"[Loyalty Program, Seasonal Offer, Seasonal Off...",P1000_BBT
3,P1000,Controller,"[Cashback, Loyalty Program, Loyalty Program, V...",P1000_Controller
4,P1000,E-House,"[Loyalty Program, Cashback, Loyalty Program, L...",P1000_E-House


In [27]:
# One-hot encode the per-entity scheme lists: fit_transform returns one row per
# Partner-Product entity and one column per distinct scheme (1 = used, 0 = not).
mlb = MultiLabelBinarizer()
entity_by_scheme = mlb.fit_transform(partner_product_schemes["Scheme_Type"])

# Attach labels, then flip the table so schemes are rows and entities are columns
scheme_matrix = pd.DataFrame(
    entity_by_scheme,
    columns=mlb.classes_,
    index=partner_product_schemes["Entity"],
).transpose()

# Preview the scheme matrix
scheme_matrix.head()


Entity,P1000_ACB,P1000_AIS,P1000_BBT,P1000_Controller,P1000_E-House,P1000_MCCB,P1000_Modular Switches,P1000_PSS,P1000_Pump Starter and Controller,P1000_RMU,...,P1100_MCCB,P1100_Modular Switches,P1100_PSS,P1100_Pump Starter and Controller,P1100_RMU,P1100_SDF,P1100_Solar Solutions,P1100_Starter,P1100_VCB,P1100_VCU
Cashback,0,1,0,1,1,1,0,1,1,1,...,1,1,1,0,0,1,1,0,1,1
Loyalty Points,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Loyalty Program,1,0,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,0,1,1,0
Seasonal Offer,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,0,0,1,0
Volume Discount,1,1,0,1,1,0,1,1,1,1,...,1,1,1,1,0,0,1,1,1,1


In [28]:
# Square scheme x scheme frame that will receive the pairwise Jaccard similarities
scheme_labels = scheme_matrix.index
similarity_matrix = pd.DataFrame(index=scheme_labels, columns=scheme_labels, dtype=float)

# Fill every cell. The diagonal is set to 1.0 directly; each off-diagonal cell
# compares the binary usage vectors of the two schemes.
n_schemes = len(scheme_matrix)
for row_pos in range(n_schemes):
    row_vector = scheme_matrix.iloc[row_pos]
    for col_pos in range(n_schemes):
        if row_pos == col_pos:
            similarity_matrix.iloc[row_pos, col_pos] = 1.0
        else:
            similarity_matrix.iloc[row_pos, col_pos] = jaccard_score(row_vector, scheme_matrix.iloc[col_pos])

# Preview similarity matrix
similarity_matrix.head()


Unnamed: 0,Cashback,Loyalty Points,Loyalty Program,Seasonal Offer,Volume Discount
Cashback,1.0,0.03937,0.583627,0.597612,0.59887
Loyalty Points,0.03937,1.0,0.028725,0.039753,0.040143
Loyalty Program,0.583627,0.028725,1.0,0.598425,0.565891
Seasonal Offer,0.597612,0.039753,0.598425,1.0,0.582163
Volume Discount,0.59887,0.040143,0.565891,0.582163,1.0


In [29]:
# Generate recommendations based on scheme-to-scheme similarity.
#
# The recommended schemes depend only on the scheme a test row currently uses,
# not on its partner or product. The top-N neighbours are therefore computed
# once per scheme and looked up per row, instead of slicing and sorting the
# similarity matrix again for every test row.
TOP_N = 3
NO_SCHEME = "No Scheme"  # placeholder when fewer than TOP_N other schemes exist

top_similar_by_scheme = {}
for scheme in similarity_matrix.index:
    neighbours = (
        similarity_matrix.loc[scheme]
        .drop(scheme)  # never recommend the scheme itself
        .sort_values(ascending=False)
        .head(TOP_N)
    )
    padded_names = neighbours.index.tolist() + [NO_SCHEME] * (TOP_N - len(neighbours))
    # Store the mean similarity of the kept neighbours alongside their names
    top_similar_by_scheme[scheme] = (round(neighbours.mean(), 6), padded_names)

# Unique (Partner, Product, Scheme) in test set
test_pairs = test_df[["Partner_id", "Product_id", "Scheme_Type"]].drop_duplicates()

# One output record per test combination whose scheme was seen in training
recommendations = []
for partner, product, current_scheme in test_pairs.itertuples(index=False, name=None):
    if current_scheme not in top_similar_by_scheme:
        print(f"Scheme '{current_scheme}' not found in training data.")
        continue

    mean_similarity, scheme_names = top_similar_by_scheme[current_scheme]
    record = {
        "Partner_id": partner,
        "Product_id": product,
        "Similarity_Score": mean_similarity,
    }
    # Adds the keys Scheme_1 ... Scheme_<TOP_N> in rank order
    for rank, scheme_name in enumerate(scheme_names, start=1):
        record[f"Scheme_{rank}"] = scheme_name
    recommendations.append(record)


In [30]:
# Turn the list of recommendation records into a table and write it to disk
# without the index column
recommendation_df = pd.DataFrame(data=recommendations)
recommendation_df.to_csv(path_or_buf="Scheme_Recommendations.csv", index=False)

# Show the first few recommendations
recommendation_df.head()


Unnamed: 0,Partner_id,Product_id,Similarity_Score,Scheme_1,Scheme_2,Scheme_3
0,P1067,Modular Switches,0.59337,Volume Discount,Seasonal Offer,Loyalty Program
1,P1003,AIS,0.582308,Cashback,Seasonal Offer,Loyalty Program
2,P1003,ACB,0.039755,Volume Discount,Seasonal Offer,Cashback
3,P1003,VCU,0.592733,Loyalty Program,Cashback,Volume Discount
4,P1063,Pump Starter and Controller,0.592733,Loyalty Program,Cashback,Volume Discount


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

# --- Read the raw stockist data ---
df = pd.read_csv("Augmented_Stockist_Data.csv")

# --- Engagement Score = log1p(sales value) * (feedback score + growth %) ---
log_sales = np.log1p(df["Sales_Value_Last_Period"])
feedback_plus_growth = df["Feedback_Score"] + df["Growth_Percentage"]
df["Engagement_Score"] = log_sales * feedback_plus_growth

# --- 80/20 train-test split with a fixed seed; both parts are written to CSV ---
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
for split_frame, csv_path in ((train_df, "Train_Data.csv"), (test_df, "Test_Data.csv")):
    split_frame.to_csv(csv_path, index=False)

# --- Partner x Scheme matrix of mean engagement (0 where a pair has no rows) ---
item_scheme_matrix = train_df.pivot_table(
    values="Engagement_Score",
    index="Partner_id",
    columns="Scheme_Type",
    fill_value=0,
    aggfunc="mean",
)

# --- Dynamic threshold: mean of all strictly positive cells ---
# Cells that are <= 0 (which includes the 0 fill value) are masked out before
# averaging, so they do not pull the threshold down.
positive_mask = item_scheme_matrix > 0
non_zero_scores = item_scheme_matrix[positive_mask].stack()
threshold = non_zero_scores.mean()

# --- Binarize: 1 where the mean engagement reaches the threshold, else 0 ---
meets_threshold = item_scheme_matrix >= threshold
binary_scheme_matrix = meets_threshold.astype(int)

# --- Flip to Scheme x Partner so each scheme is one binary row vector ---
scheme_matrix = binary_scheme_matrix.transpose()

# --- Pairwise Jaccard similarity between the binary scheme rows ---
scheme_labels = scheme_matrix.index
similarity_matrix = pd.DataFrame(index=scheme_labels, columns=scheme_labels, dtype=float)

n_schemes = len(scheme_matrix)
for row_pos in range(n_schemes):
    for col_pos in range(n_schemes):
        if row_pos == col_pos:
            # A scheme compared with itself is set to 1.0 without calling jaccard_score
            similarity_matrix.iloc[row_pos, col_pos] = 1.0
            continue
        similarity_matrix.iloc[row_pos, col_pos] = jaccard_score(
            scheme_matrix.iloc[row_pos], scheme_matrix.iloc[col_pos]
        )

# --- Generate top-3 scheme recommendations per test pair ---
# The recommendation for a row depends only on its current scheme, so the
# ranked neighbours are computed once per scheme and reused for every row
# rather than re-sorting the similarity matrix inside the row loop.
TOP_N = 3
NO_SCHEME = "No Scheme"  # filler when fewer than TOP_N other schemes exist

top_similar_by_scheme = {}
for scheme in similarity_matrix.index:
    neighbours = (
        similarity_matrix.loc[scheme]
        .drop(scheme)  # exclude the scheme itself
        .sort_values(ascending=False)
        .head(TOP_N)
    )
    padded_names = neighbours.index.tolist() + [NO_SCHEME] * (TOP_N - len(neighbours))
    top_similar_by_scheme[scheme] = (round(neighbours.mean(), 6), padded_names)

# Unique (Partner, Product, Scheme) combinations present in the test split
test_pairs = test_df[["Partner_id", "Product_id", "Scheme_Type"]].drop_duplicates()

recommendations = []
for partner, product, current_scheme in test_pairs.itertuples(index=False, name=None):
    if current_scheme not in top_similar_by_scheme:
        # Scheme has no row in the similarity matrix built from the training split
        print(f"Scheme '{current_scheme}' not found in training data.")
        continue

    mean_similarity, scheme_names = top_similar_by_scheme[current_scheme]
    record = {
        "Partner_id": partner,
        "Product_id": product,
        "Similarity_Score": mean_similarity,
    }
    # Adds the keys Scheme_1 ... Scheme_<TOP_N> in rank order
    for rank, scheme_name in enumerate(scheme_names, start=1):
        record[f"Scheme_{rank}"] = scheme_name
    recommendations.append(record)

# --- Build the recommendation table and write it to CSV (index omitted) ---
recommendation_df = pd.DataFrame(data=recommendations)
recommendation_df.to_csv(path_or_buf="Scheme_Recommendations.csv", index=False)

# --- Print the first rows as plain text ---
preview_rows = recommendation_df.head()
print(preview_rows)


  Partner_id                   Product_id  Similarity_Score         Scheme_1  \
0      P1067             Modular Switches          0.327193   Seasonal Offer   
1      P1003                          AIS          0.332821   Seasonal Offer   
2      P1003                          ACB          0.032517  Volume Discount   
3      P1003                          VCU          0.342034         Cashback   
4      P1063  Pump Starter and Controller          0.342034         Cashback   

          Scheme_2         Scheme_3  
0  Volume Discount  Loyalty Program  
1         Cashback  Loyalty Program  
2   Seasonal Offer  Loyalty Program  
3  Volume Discount  Loyalty Program  
4  Volume Discount  Loyalty Program  


Evaluation Code

In [31]:
# Import required library
import pandas as pd

In [36]:
# Held-out test rows in long format; the evaluation below uses their
# Partner_id and Scheme_Type columns.
test_df = pd.read_csv(filepath_or_buffer="Test_Data.csv")

# Saved recommendations: one row per recommendation record, with the ranked
# schemes in the columns Scheme_1, Scheme_2 and Scheme_3.
rec_df = pd.read_csv(filepath_or_buffer="Scheme_Recommendations.csv")


In [37]:
# One row per partner, holding the list of every scheme found on that
# partner's test rows (repeats are kept).
schemes_by_partner = test_df.groupby("Partner_id")["Scheme_Type"].apply(list)
availed_df = schemes_by_partner.reset_index().rename(
    columns={"Scheme_Type": "Availed_Schemes"}
)

In [38]:
# Pack the three ranked scheme columns into one list per row, preserving the
# Scheme_1 -> Scheme_2 -> Scheme_3 order so that slicing [:k] yields the top-k.
ranked_scheme_columns = ["Scheme_1", "Scheme_2", "Scheme_3"]
rec_df["Recommended_Schemes"] = rec_df[ranked_scheme_columns].values.tolist()


In [39]:
# Left-join the recommendation lists onto the per-partner availed lists.
# NOTE(review): rec_df can hold several rows for the same Partner_id, in which
# case df_all contains one row per recommendation row rather than one row per
# partner - confirm this is the intended evaluation unit.
recommended_lists = rec_df[["Partner_id", "Recommended_Schemes"]]
df_all = pd.merge(
    left=availed_df,
    right=recommended_lists,
    how="left",
    on="Partner_id",
)

# Replace anything that is not a list (e.g. the NaN a left join leaves for a
# partner with no recommendation row) with an empty list.
for list_column in ("Availed_Schemes", "Recommended_Schemes"):
    df_all[list_column] = df_all[list_column].apply(
        lambda cell: cell if isinstance(cell, list) else []
    )


In [40]:
# Top-K cut-offs to evaluate and the list that collects one summary dict per K
k_list = [1, 2, 3]
results = []

# Evaluate precision, recall, F1 for each Top-K level
for k in k_list:
    precision_list = []
    recall_list = []

    for availed, recommended in zip(df_all["Availed_Schemes"], df_all["Recommended_Schemes"]):
        actual_set = set(availed)
        if not actual_set:
            continue  # skip rows with no availed schemes (recall would divide by zero)

        recommended_k = recommended[:k]  # Top-K recommendations

        # Number of Top-K recommendations that the partner actually availed
        tp = sum(scheme in actual_set for scheme in recommended_k)

        # Precision divides by k even when fewer than k recommendations exist
        precision_list.append(tp / k)
        recall_list.append(tp / len(actual_set))

    # Average over the rows of df_all that were not skipped
    mean_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    mean_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    # F1 is the harmonic mean of the *unrounded* averages; rounding happens only
    # once, when the values are reported, so rounding error is not compounded.
    if mean_precision + mean_recall:
        f1 = 2 * mean_precision * mean_recall / (mean_precision + mean_recall)
    else:
        f1 = 0

    results.append({
        "Top-K": k,
        "Avg Precision": round(mean_precision, 4),
        "Avg Recall": round(mean_recall, 4),
        "Avg F1 Score": round(f1, 4)
    })



In [41]:
# Report the averaged metrics, one indented group of lines per Top-K level
print("==== Per-Scheme Evaluation (WITH Availed Schemes) ====")
for metrics in results:
    print(f"\nTop-{metrics['Top-K']}")
    print(f"  Avg Precision : {metrics['Avg Precision']}")
    print(f"  Avg Recall    : {metrics['Avg Recall']}")
    print(f"  Avg F1 Score  : {metrics['Avg F1 Score']}")


==== Per-Scheme Evaluation (WITH Availed Schemes) ====

Top-1
  Avg Precision : 0.9913
  Avg Recall    : 0.245
  Avg F1 Score  : 0.3929

Top-2
  Avg Precision : 0.9855
  Avg Recall    : 0.4862
  Avg F1 Score  : 0.6512

Top-3
  Avg Precision : 0.9875
  Avg Recall    : 0.7312
  Avg F1 Score  : 0.8402
