In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [27]:
# Load the stockist dataset
df = pd.read_csv("Augmented_Stockist_Data.csv")

# --- Disabled experiment (kept for reference): one-hot encode Geography / Stockist_Type ---
# df = pd.get_dummies(df, columns=["Geography", "Stockist_Type"], dtype=int)
#
# # Identify geography and stockist type columns
# geo_columns = [col for col in df.columns if col.startswith("Geography")]
# stockist_columns = [col for col in df.columns if col.startswith("Stockist_Type")]
#
# if not geo_columns or not stockist_columns:
#     raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")
#
# # Zero guard for a plain log(); not needed while log1p is used below, since log1p(0) == 0
# df["Sales_Value_Last_Period"] = df["Sales_Value_Last_Period"].replace(0, 1)

# Engagement Score = log1p(Sales Value) * (Feedback + Growth %)
# log1p(x) = log(1 + x), so a sales value of 0 contributes a factor of 0 instead of -inf.
df["Engagement_Score"] = (
    df["Feedback_Score"] + df["Growth_Percentage"]
) * np.log1p(df["Sales_Value_Last_Period"])

In [28]:
# Train-Test Split: hold out 20% of the rows, stratified on Partner_id, with a fixed seed
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Partner_id"],
)

# Pivot User-Scheme Matrix: one row per Partner_id, one column per Scheme_Type,
# cells hold the summed Engagement_Score (0 where a partner has no rows for a scheme)
user_scheme_matrix = pd.pivot_table(
    train_df,
    index="Partner_id",
    columns="Scheme_Type",
    values="Engagement_Score",
    aggfunc="sum",
    fill_value=0,
)

In [29]:
# Inspect the pivoted Partner_id x Scheme_Type matrix (a bare last expression triggers the rich display)
user_scheme_matrix

Scheme_Type,Cashback,Loyalty Points,Loyalty Program,Seasonal Offer,Volume Discount
Partner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P1000,5624.652065,0.000000,6857.821061,8615.187048,4440.781870
P1001,191673.209996,201711.343636,5182.589408,192524.020053,193165.701180
P1002,185441.004775,183507.850525,4452.983595,203583.767774,197049.448445
P1003,209294.199655,173065.259729,5000.926922,190653.339327,186162.472231
P1004,4570.956423,0.000000,5979.091852,5198.097533,4903.365058
...,...,...,...,...,...
P1096,5280.864743,0.000000,6121.352940,6058.569649,4813.450923
P1097,6877.834282,0.000000,4881.891004,3522.923646,4954.247191
P1098,4476.946134,0.000000,6338.079130,9602.286576,7376.727396
P1099,8850.801109,0.000000,5528.362456,4234.029931,7028.747390


In [30]:
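# # Add Geography & Stockist_Type Features (disabled)
# # NOTE: this step needs `geo_columns` and `stockist_columns`, which in the cells shown here are
# # only assigned inside commented-out code; re-enable that encoding step before uncommenting this.
# user_features = train_df.groupby("Partner_id")[geo_columns + stockist_columns].mean()  # Aggregate features per Partner_id
# user_scheme_matrix = user_scheme_matrix.merge(user_features, left_index=True, right_index=True, how="left")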

In [31]:
# Row position -> Partner_id, in the same order as the matrix rows
partner_id_lookup = [*user_scheme_matrix.index]

# CSR sparse copy of the matrix values, used by the neighbour search
user_scheme_sparse = csr_matrix(user_scheme_matrix.values)

In [32]:
# Fit Nearest Neighbors (cosine metric).
# algorithm='brute': each query vector is compared against every other vector in the
# fitted data to find the closest matches (all pairwise distances are computed).
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(user_scheme_sparse)


In [33]:
# Recommendation Function
def recommend_user_based(partner_id, top_n=3, n_schemes=3):
    """Recommend schemes for a partner based on its most similar partner.

    User-based collaborative filtering: the partner's row of the engagement
    matrix is sent to the fitted cosine KNN model, the nearest *other* partner
    is selected, and the schemes that partner appears with most often in
    ``train_df`` are returned as the recommendation.

    Relies on the notebook-level objects ``user_scheme_matrix``,
    ``partner_id_lookup``, ``user_scheme_sparse``, ``knn_model`` and ``train_df``.

    Parameters
    ----------
    partner_id : hashable
        Partner to recommend for; must be a row label of ``user_scheme_matrix``.
    top_n : int, default 3
        Number of neighbours to query besides the partner itself
        (``top_n + 1`` are requested, capped at the number of matrix rows).
        Only the closest one is used for the recommendation.
    n_schemes : int, default 3
        Number of scheme slots in the result. This used to be hard-coded to 3;
        the default keeps the original output shape.

    Returns
    -------
    list or None
        ``[partner_id, product_id, similarity, scheme_1, ..., scheme_n]``, where
        unused scheme slots are filled with ``"No Scheme"`` and ``similarity``
        is ``1 - cosine distance`` rounded to 6 decimals. ``None`` when the
        partner is not in the matrix or the query returned no other partner.
    """
    if partner_id not in user_scheme_matrix.index:
        return None

    idx = partner_id_lookup.index(partner_id)
    n_neighbors = min(top_n + 1, len(user_scheme_matrix))
    distances, indices = knn_model.kneighbors(user_scheme_sparse[idx], n_neighbors=n_neighbors)

    # Convert cosine distance to similarity and drop the partner's own row,
    # which the model returns because the query vector is part of the fitted data.
    candidates = [
        (neighbor, 1 - distance)
        for neighbor, distance in zip(indices.flatten(), distances.flatten())
        if neighbor != idx
    ]
    if not candidates:
        return None

    # Neighbours come back ordered by distance, so the first remaining one is the closest.
    top_idx, sim_score = candidates[0]
    similar_user = partner_id_lookup[top_idx]
    sim_score = round(sim_score, 6)

    partner_col = train_df["Partner_id"]

    # Most frequent schemes among the similar partner's training rows.
    top_schemes = (
        train_df.loc[partner_col == similar_user, "Scheme_Type"]
        .value_counts().head(n_schemes).index.tolist()
    )
    # Pad so the result always has exactly n_schemes scheme slots.
    top_schemes += ["No Scheme"] * (n_schemes - len(top_schemes))

    # First product seen for the target partner in the training rows.
    product = train_df.loc[partner_col == partner_id, "Product_id"].unique()[0]

    return [partner_id, product, sim_score, *top_schemes]


In [34]:
# Generate Recommendations
user_partners = test_df["Partner_id"].unique()

# Run the recommender exactly once per partner and keep only successful results.
# (Calling it in both the filter and the value position would double every KNN query.)
user_recommendations = [
    rec for rec in map(recommend_user_based, user_partners) if rec
]

# Save Output
user_rec_df = pd.DataFrame(
    user_recommendations,
    columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"],
)
user_rec_df.to_csv("user_based_recommendations_enhanced.csv", index=False)

print("User-Based Recommendations saved.")


User-Based Recommendations saved.
