In [8]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Load dataset
df = pd.read_csv(r"C:\Users\291688\Downloads\Scheme_Recommendation_Explainable_ai\Dataset\Augmented_Stockist_Data (1).csv")

# One-hot encoding for Geography and Stockist_Type
df = pd.get_dummies(df, columns=["Geography", "Stockist_Type"], dtype=int)

# Identify geography and stockist type columns
geo_columns = [col for col in df.columns if col.startswith("Geography")]
stockist_columns = [col for col in df.columns if col.startswith("Stockist_Type")]

if not geo_columns or not stockist_columns:
    raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")

# Ensure Sales_Value_Last_Period does not contain zeros to avoid log(0)
df["Sales_Value_Last_Period"] = df["Sales_Value_Last_Period"].replace(0, 1)

# Calculate Engagement Score
df["Engagement_Score"] = np.log1p(df["Sales_Value_Last_Period"]) * (df["Feedback_Score"] + df[geo_columns+stockist_columns].sum(axis=1))

# Train-Test Split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Partner_id"])

# Pivot User-Scheme Matrix using Engagement Score
user_scheme_matrix = train_df.pivot_table(
    index="Partner_id", columns="Scheme_Type", values="Engagement_Score", aggfunc="sum", fill_value=0
)

# Add Geography & Stockist_Type Features
user_features = train_df.groupby("Partner_id")[geo_columns + stockist_columns].mean()  # Aggregate features per Partner_id
user_scheme_matrix = user_scheme_matrix.merge(user_features, left_index=True, right_index=True, how="left")

# Prepare sparse matrix
user_scheme_sparse = csr_matrix(user_scheme_matrix.values)

partner_id_lookup = list(user_scheme_matrix.index)

# Fit Nearest Neighbors (Cosine Similarity)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_scheme_sparse)

# Recommendation Function
def recommend_user_based(partner_id, top_n=3):
    if partner_id not in user_scheme_matrix.index:
        return None

    idx = partner_id_lookup.index(partner_id)
    distances, indices = knn_model.kneighbors(user_scheme_sparse[idx], n_neighbors=min(top_n + 1, len(user_scheme_matrix)))
    similarities = 1 - distances.flatten()
    neighbors = indices.flatten()

    filtered = [(i, sim) for i, sim in zip(neighbors, similarities) if i != idx]
    if not filtered:
        return None

    top_idx, sim_score = filtered[0]
    similar_user = partner_id_lookup[top_idx]
    sim_score = round(sim_score, 6)

    top_schemes = (
        train_df[train_df["Partner_id"] == similar_user]["Scheme_Type"]
        .value_counts().head(3).index.tolist()
    )
    while len(top_schemes) < 3:
        top_schemes.append("No Scheme")

    product = train_df[train_df["Partner_id"] == partner_id]["Product_id"].unique()[0]

    return [partner_id, product, sim_score, *top_schemes]

# Generate Recommendations
user_partners = test_df["Partner_id"].unique()
user_recommendations = [recommend_user_based(pid) for pid in user_partners if recommend_user_based(pid)]

# Save Output
user_rec_df = pd.DataFrame(user_recommendations, columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"])
user_rec_df.to_csv("user_based_recommendations_with_geography_stockist.csv", index=False)

print("User-Based Recommendations saved with Geography and Stockist_Type features.")


User-Based Recommendations saved with Geography and Stockist_Type features.


In [9]:
def compute_feature_importance(feature_columns, num_shuffles=5, random_seed=42):
    importance_scores = {feature: [] for feature in feature_columns}

    # Generate Baseline Recommendations
    baseline_recommendations = {
        pid: recommend_user_based(pid) for pid in test_df["Partner_id"].unique()
    }

    # Convert to DataFrame
    baseline_df = pd.DataFrame(
        [v for v in baseline_recommendations.values() if v],
        columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]
    ).set_index(["Partner_id", "Product_id"])

    for feature in feature_columns:
        for i in range(num_shuffles):
            perturbed_df = deepcopy(df)
            
            #  Set Random Seed for Stability in Each Shuffle
            np.random.seed(random_seed + i)
            perturbed_df[feature] = np.random.permutation(perturbed_df[feature].values)

            # Train-Test Split Again
            train_df_perturbed = perturbed_df.loc[train_df.index]

            # Recompute Engagement Score
            train_df_perturbed["Engagement_Score"] = np.log1p(train_df_perturbed["Sales_Value_Last_Period"]) * (
                train_df_perturbed["Feedback_Score"] + train_df_perturbed[geo_columns + stockist_columns].sum(axis=1)
            )

            # Pivot User-Scheme Matrix
            user_scheme_matrix_perturbed = train_df_perturbed.pivot_table(
                index="Partner_id", columns="Scheme_Type", values="Engagement_Score", aggfunc="sum", fill_value=0
            )

            # Add Geography & Stockist_Type Features
            user_features_perturbed = train_df_perturbed.groupby("Partner_id")[geo_columns + stockist_columns].sum()
            user_scheme_matrix_perturbed = user_scheme_matrix_perturbed.merge(
                user_features_perturbed, left_index=True, right_index=True, how="left"
            )

            # Prepare sparse matrix and retrain KNN
            user_scheme_sparse_perturbed = csr_matrix(user_scheme_matrix_perturbed.values)
            knn_model.fit(user_scheme_sparse_perturbed)

            # Generate New Recommendations
            perturbed_recommendations = {
                pid: recommend_user_based(pid) for pid in test_df["Partner_id"].unique()
            }

            # Convert to DataFrame
            perturbed_df = pd.DataFrame(
                [v for v in perturbed_recommendations.values() if v],
                columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]
            ).set_index(["Partner_id", "Product_id"])

            # Compare Recommendations (Count only rows where all three schemes changed)
            scheme_columns = ["Scheme_1", "Scheme_2", "Scheme_3"]
            changed_recommendations = (baseline_df[scheme_columns] != perturbed_df[scheme_columns]).all(axis=1)
            changed_count = changed_recommendations.sum()

            # Normalize by total recommendations in baseline
            importance_scores[feature].append(changed_count / len(baseline_df))

    # Compute average importance score over multiple shuffles
    avg_importance_scores = {feature: np.mean(scores) for feature, scores in importance_scores.items()}

    return avg_importance_scores

# Compute Feature Importance for Geography & Stockist_Type
geo_importance = compute_feature_importance(geo_columns, num_shuffles=5, random_seed=42)
stockist_importance = compute_feature_importance(stockist_columns, num_shuffles=5, random_seed=42)

# Print Results
print("\nFeature Importance (Higher means more impact on recommendations):")
print("Geography Feature Importance:")
print(pd.Series(geo_importance).sort_values(ascending=False))

print("\nStockist_Type Feature Importance:")
print(pd.Series(stockist_importance).sort_values(ascending=False))



Feature Importance (Higher means more impact on recommendations):
Geography Feature Importance:
Geography_South    0.079208
Geography_West     0.073267
Geography_East     0.069307
Geography_North    0.069307
dtype: float64

Stockist_Type Feature Importance:
Stockist_Type_Retailer       0.10297
Stockist_Type_Wholesaler     0.10099
Stockist_Type_Distributor    0.09505
dtype: float64
