In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [17]:
# Read the augmented stockist dataset (path is relative to the working directory).
# Disabled experiment, kept for reference: one-hot encode Geography / Stockist_Type
# and guard Sales_Value_Last_Period against zeros.
#   df = pd.get_dummies(df, columns=["Geography", "Stockist_Type"], dtype=int)
#   geo_columns = [col for col in df.columns if col.startswith("Geography")]
#   stockist_columns = [col for col in df.columns if col.startswith("Stockist_Type")]
#   if not geo_columns or not stockist_columns:
#       raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")
#   df["Sales_Value_Last_Period"] = df["Sales_Value_Last_Period"].replace(0, 1)
#
# Engagement_Score = log1p(Sales_Value_Last_Period) * (Feedback_Score + Growth_Percentage).
# log1p(x) = log(1 + x), so a sales value of 0 maps to 0 instead of -inf.
df = pd.read_csv(
    "Revised_Codes/Final Codes/Baseline Recommendation model/Direct/User-Based/Augmented_Stockist_Dat.csv"
).assign(
    Engagement_Score=lambda frame: np.log1p(frame["Sales_Value_Last_Period"])
    * (frame["Feedback_Score"] + frame["Growth_Percentage"])
)

In [18]:
# Hold out 20% of the rows for evaluation, stratified on Partner_id; the fixed
# random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Partner_id"],
)

# Persist both splits so the evaluation section can reload them from disk.
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

# Partner x scheme matrix built from the training rows only. Each cell is the
# mean Engagement_Score for that (partner, scheme) pair; pairs with no rows are
# filled with 0. (An earlier variant of this cell aggregated with "sum".)
user_scheme_matrix = pd.pivot_table(
    train_df,
    values="Engagement_Score",
    index="Partner_id",
    columns="Scheme_Type",
    aggfunc="mean",
    fill_value=0,
)


In [19]:
# Display the partner x scheme matrix built from the training split
# (rows: Partner_id, columns: Scheme_Type, values: mean Engagement_Score, 0 where absent).
user_scheme_matrix

Scheme_Type,Cashback,Loyalty Points,Loyalty Program,Seasonal Offer,Volume Discount
Partner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1000,266.287178,0.000000,247.152747,278.503879,192.614103
P1001,253.747512,256.848606,191.607857,252.361077,250.167273
P1002,248.175090,259.416539,190.738863,255.106867,253.578719
P1003,260.397812,230.108507,235.007932,239.021409,254.668759
P1004,202.757154,0.000000,257.549599,241.846588,242.296454
...,...,...,...,...,...
P1096,205.518858,0.000000,211.622811,341.724454,212.675405
P1097,221.052802,0.000000,299.626569,187.602793,277.689874
P1098,252.359124,0.000000,273.677549,364.697174,258.467401
P1099,310.929162,0.000000,235.842222,264.669109,350.700260


In [20]:
# # Add Geography & Stockist_Type Features
# user_features = train_df.groupby("Partner_id")[geo_columns + stockist_columns].mean()  # Aggregate features per Partner_id
# user_scheme_matrix = user_scheme_matrix.merge(user_features, left_index=True, right_index=True, how="left")

In [21]:
# Row i of the sparse matrix corresponds to partner_id_lookup[i]; both are taken
# from user_scheme_matrix so their ordering stays aligned.
partner_id_lookup = user_scheme_matrix.index.tolist()
user_scheme_sparse = csr_matrix(user_scheme_matrix.to_numpy())

In [22]:
# Nearest-neighbour index over partner rows using cosine distance.
# algorithm='brute' computes all pairwise distances: the query vector is
# compared against every other vector in the dataset to find the closest matches.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(user_scheme_sparse)


In [23]:
# Recommendation Function
def recommend_user_based(partner_id, top_n=3):
    """Recommend schemes for a partner from its most similar partner's history.

    User-based collaborative filtering: the partner's row in the partner x scheme
    matrix is compared (cosine distance) against the other rows, the closest
    *other* partner is selected, and that partner's most frequently occurring
    schemes in ``train_df`` are returned.

    Relies on the notebook-level objects ``user_scheme_matrix``,
    ``partner_id_lookup``, ``user_scheme_sparse``, ``knn_model`` and ``train_df``.

    Parameters
    ----------
    partner_id : hashable
        Partner to produce recommendations for.
    top_n : int, default 3
        Number of scheme slots returned. With the default, the result has the
        same six-element shape as before. The neighbour query still asks for
        ``top_n + 1`` rows (capped at the matrix size), exactly as the original
        code did, so results for the default are unchanged.

    Returns
    -------
    list or None
        ``[partner_id, product_id, similarity, scheme_1, ..., scheme_top_n]``
        where ``similarity`` is ``1 - cosine distance`` rounded to 6 decimals and
        missing scheme slots are padded with ``"No Scheme"``. ``None`` when the
        partner is not in ``user_scheme_matrix`` or the query returns no partner
        other than the target itself.
    """
    if partner_id not in user_scheme_matrix.index:
        return None

    idx = partner_id_lookup.index(partner_id)
    n_neighbors = min(top_n + 1, len(user_scheme_matrix))
    distances, indices = knn_model.kneighbors(user_scheme_sparse[idx], n_neighbors=n_neighbors)

    # Convert cosine distance to similarity and drop the target's own row,
    # which the query normally returns as its nearest neighbour.
    candidates = [
        (pos, 1 - dist)
        for pos, dist in zip(indices.flatten(), distances.flatten())
        if pos != idx
    ]
    if not candidates:
        return None

    # Neighbours come back ordered by distance, so the first one is the closest.
    top_idx, sim_score = candidates[0]
    similar_user = partner_id_lookup[top_idx]
    sim_score = round(sim_score, 6)

    # Most frequent schemes of the similar partner; honour top_n instead of a
    # hard-coded 3 and pad so the result always has top_n scheme slots.
    neighbour_schemes = train_df.loc[train_df["Partner_id"] == similar_user, "Scheme_Type"]
    top_schemes = neighbour_schemes.value_counts().head(top_n).index.tolist()
    top_schemes.extend(["No Scheme"] * (top_n - len(top_schemes)))

    # First product seen for the target partner in the training rows.
    product = train_df.loc[train_df["Partner_id"] == partner_id, "Product_id"].unique()[0]

    return [partner_id, product, sim_score, *top_schemes]


In [24]:
# Generate Recommendations
# One recommendation row per distinct partner in the held-out split.
user_partners = test_df["Partner_id"].unique()

# Call the recommender exactly once per partner (each call runs a neighbour
# query) and drop partners for which it returned None.
user_recommendations = [
    rec for rec in map(recommend_user_based, user_partners) if rec
]

# Save Output
user_rec_df = pd.DataFrame(
    user_recommendations,
    columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"],
)
user_rec_df.to_csv("user_based_recommendations_enhanced.csv", index=False)

print("User-Based Recommendations saved.")


User-Based Recommendations saved.


Evaluation code

In [25]:
# Import required library
import pandas as pd

In [26]:
# Reload the evaluation inputs from disk:
#   test_df - held-out rows in long format (one row per availed scheme per partner)
#   rec_df  - top-3 recommended schemes per partner
test_df, rec_df = map(
    pd.read_csv,
    ("test_data.csv", "user_based_recommendations_enhanced.csv"),
)


In [27]:
# Collapse the long-format test rows into one row per partner holding the list
# of every scheme that partner availed.
availed_df = (
    test_df.groupby("Partner_id")["Scheme_Type"]
    .apply(list)
    .rename("Availed_Schemes")
    .reset_index()
)

In [28]:
# Pack the three ranked scheme columns into one list per row, preserving rank order.
rec_df["Recommended_Schemes"] = (
    rec_df.loc[:, ["Scheme_1", "Scheme_2", "Scheme_3"]].to_numpy().tolist()
)


In [29]:
# Left-join recommendations onto the availed schemes so every partner in the
# test split is kept, even when no recommendation row exists for it.
df_all = availed_df.merge(
    rec_df[["Partner_id", "Recommended_Schemes"]],
    how="left",
    on="Partner_id",
)

# Partners without a match get NaN from the join; normalise any non-list cell
# to an empty list so the evaluation loop can slice and iterate uniformly.
df_all["Availed_Schemes"] = df_all["Availed_Schemes"].map(
    lambda cell: cell if isinstance(cell, list) else []
)
df_all["Recommended_Schemes"] = df_all["Recommended_Schemes"].map(
    lambda cell: cell if isinstance(cell, list) else []
)


In [30]:
# Cut-offs to evaluate and the accumulator for one summary dict per cut-off.
k_list = [1, 2, 3]
results = []

# Evaluate precision, recall, F1 for each Top-K level
for k in k_list:
    precision_list = []
    recall_list = []

    for _, row in df_all.iterrows():
        actual_set = set(row["Availed_Schemes"])
        if not actual_set:
            continue  # skip if no availed schemes

        recommended_k = row["Recommended_Schemes"][:k]  # Top-K recommendations

        # Hits among the Top-K; precision divides by k even when fewer than k
        # recommendations exist, so missing recommendations count as misses.
        tp = sum(scheme in actual_set for scheme in recommended_k)
        precision_list.append(tp / k)
        recall_list.append(tp / len(actual_set))

    # Average the metrics across all partners (unrounded).
    mean_precision = sum(precision_list) / len(precision_list) if precision_list else 0
    mean_recall = sum(recall_list) / len(recall_list) if recall_list else 0

    # F1 of the averaged precision/recall (not the mean of per-partner F1).
    # Computed from the unrounded means so rounding error is not compounded;
    # values are rounded only when stored for reporting.
    pr_sum = mean_precision + mean_recall
    mean_f1 = 2 * mean_precision * mean_recall / pr_sum if pr_sum else 0

    results.append({
        "Top-K": k,
        "Avg Precision": round(mean_precision, 4),
        "Avg Recall": round(mean_recall, 4),
        "Avg F1 Score": round(mean_f1, 4)
    })


In [31]:
# Report the averaged precision / recall / F1 for every evaluated Top-K cut-off,
# one blank-line-separated group per entry in `results`.
print("==== Per-Scheme Evaluation (WITH Availed Schemes) ====")
for r in results:
    print(
        f"\nTop-{r['Top-K']}",
        f"  Avg Precision : {r['Avg Precision']}",
        f"  Avg Recall    : {r['Avg Recall']}",
        f"  Avg F1 Score  : {r['Avg F1 Score']}",
        sep="\n",
    )


==== Per-Scheme Evaluation (WITH Availed Schemes) ====

Top-1
  Avg Precision : 0.9901
  Avg Recall    : 0.2485
  Avg F1 Score  : 0.3973

Top-2
  Avg Precision : 0.9851
  Avg Recall    : 0.4937
  Avg F1 Score  : 0.6578

Top-3
  Avg Precision : 0.9901
  Avg Recall    : 0.7455
  Avg F1 Score  : 0.8506
