In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from copy import deepcopy
from sklearn.utils import resample
# Load dataset
base_dir = os.getcwd()
file_path = os.path.join(base_dir, "imbalance_dataset.csv")
df_imbalanced = pd.read_csv(file_path)


In [14]:
df_imbalanced

Unnamed: 0,Partner_id,Product_id,Scheme_Type,Sales_Value_Last_Period,Sales_Quantity_Last_Period,MRP,Growth_Percentage,Discount_Applied,Bulk_Purchase_Tendency,New_Stockist,Feedback_Score,Geography_East,Geography_North,Geography_South,Geography_West,Stockist_Type_Distributor,Stockist_Type_Retailer,Stockist_Type_Wholesaler
0,P1080,Controller,Loyalty Program,171456.106026,541,44934.095277,23.923448,2.202317,Medium,False,3.314019,0,1,0,0,0,1,0
1,P1002,PSS,Loyalty Points,195137.860000,460,48030.150000,42.210000,4.330000,Low,True,1.360000,0,0,1,0,0,1,0
2,P1003,Solar Solutions,Volume Discount,363902.790000,287,38712.810000,47.590000,14.450000,Medium,True,2.030000,0,1,0,0,0,0,1
3,P1003,Pump Starter and Controller,Cashback,128025.860000,82,27819.840000,44.140000,18.380000,Low,False,3.870000,0,0,1,0,1,0,0
4,P1069,Solar Solutions,Cashback,80109.136655,470,13132.007907,17.212704,15.392146,Low,True,3.985284,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,P1001,BBT,Volume Discount,421484.510000,13,27079.590000,-8.970000,6.200000,Low,False,1.470000,1,0,0,0,1,0,0
19996,P1003,PSS,Volume Discount,367087.170000,271,6682.310000,-7.480000,0.470000,High,True,4.290000,1,0,0,0,1,0,0
19997,P1003,PSS,Seasonal Offer,362132.120000,137,38455.450000,22.680000,16.880000,Medium,True,2.480000,0,1,0,0,1,0,0
19998,P1002,MCCB,Cashback,262679.200000,345,36131.160000,6.420000,10.460000,High,False,3.400000,0,0,1,0,0,0,1


In [15]:

# Identify geography and stockist type columns
geo_columns = [col for col in df_imbalanced.columns if col.startswith("Geography")]
stockist_columns = [col for col in df_imbalanced.columns if col.startswith("Stockist_Type")]

if not geo_columns or not stockist_columns:
    raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")

# Ensure Sales_Value_Last_Period does not contain zeros to avoid log(0)
df_imbalanced["Sales_Value_Last_Period"] = df_imbalanced["Sales_Value_Last_Period"].replace(0, 1)

# Calculate Engagement Score
df_imbalanced["Engagement_Score"] = np.log1p(df_imbalanced["Sales_Value_Last_Period"]) * (df_imbalanced["Feedback_Score"] + df_imbalanced[geo_columns+stockist_columns].sum(axis=1))

# Train-Test Split
train_df, test_df = train_test_split(df_imbalanced, test_size=0.2, random_state=42, stratify=df_imbalanced["Partner_id"])

# Pivot User-Scheme Matrix using Engagement Score
user_scheme_matrix = train_df.pivot_table(
    index="Partner_id", columns="Scheme_Type", values="Engagement_Score", aggfunc="sum", fill_value=0
)

# Add Geography & Stockist_Type Features
user_features = train_df.groupby("Partner_id")[geo_columns + stockist_columns].mean()  # Aggregate features per Partner_id
user_scheme_matrix = user_scheme_matrix.merge(user_features, left_index=True, right_index=True, how="left")

# Prepare sparse matrix
user_scheme_sparse = csr_matrix(user_scheme_matrix.values)

partner_id_lookup = list(user_scheme_matrix.index)

# Fit Nearest Neighbors (Cosine Similarity)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_scheme_sparse)

# Recommendation Function
def recommend_user_based(partner_id, top_n=3):
    if partner_id not in user_scheme_matrix.index:
        return None

    idx = partner_id_lookup.index(partner_id)
    distances, indices = knn_model.kneighbors(user_scheme_sparse[idx], n_neighbors=min(top_n + 1, len(user_scheme_matrix)))
    similarities = 1 - distances.flatten()
    neighbors = indices.flatten()

    filtered = [(i, sim) for i, sim in zip(neighbors, similarities) if i != idx]
    if not filtered:
        return None

    top_idx, sim_score = filtered[0]
    similar_user = partner_id_lookup[top_idx]
    sim_score = round(sim_score, 6)

    top_schemes = (
        train_df[train_df["Partner_id"] == similar_user]["Scheme_Type"]
        .value_counts().head(3).index.tolist()
    )
    while len(top_schemes) < 3:
        top_schemes.append("No Scheme")

    product = train_df[train_df["Partner_id"] == partner_id]["Product_id"].unique()[0]

    return [partner_id, product, sim_score, *top_schemes]

# Generate Recommendations
user_partners = test_df["Partner_id"].unique()
user_recommendations = [recommend_user_based(pid) for pid in user_partners if recommend_user_based(pid)]

# Save Output
user_rec_df = pd.DataFrame(user_recommendations, columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"])
user_rec_df.to_csv("user_based_recommendations_with_geography_stockist.csv", index=False)

print("User-Based Recommendations saved with Geography and Stockist_Type features.")


User-Based Recommendations saved with Geography and Stockist_Type features.


In [16]:
def compute_feature_importance(feature_columns, num_shuffles=5, random_seed=42):
    importance_scores = {feature: [] for feature in feature_columns}

    # Generate Baseline Recommendations
    baseline_recommendations = {
        pid: recommend_user_based(pid) for pid in test_df["Partner_id"].unique()
    }

    # Convert to DataFrame
    baseline_df = pd.DataFrame(
        [v for v in baseline_recommendations.values() if v],
        columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]
    ).set_index(["Partner_id", "Product_id"])

    for feature in feature_columns:
        for i in range(num_shuffles):
            perturbed_df = deepcopy(df_imbalanced)
            
            #  Set Random Seed for Stability in Each Shuffle
            np.random.seed(random_seed + i)
            perturbed_df[feature] = np.random.permutation(perturbed_df[feature].values)

            # Train-Test Split Again
            train_df_perturbed = perturbed_df.loc[train_df.index]

            # Recompute Engagement Score
            train_df_perturbed["Engagement_Score"] = np.log1p(train_df_perturbed["Sales_Value_Last_Period"]) * (
                train_df_perturbed["Feedback_Score"] + train_df_perturbed[geo_columns + stockist_columns].sum(axis=1)
            )

            # Pivot User-Scheme Matrix
            user_scheme_matrix_perturbed = train_df_perturbed.pivot_table(
                index="Partner_id", columns="Scheme_Type", values="Engagement_Score", aggfunc="sum", fill_value=0
            )

            # Add Geography & Stockist_Type Features
            user_features_perturbed = train_df_perturbed.groupby("Partner_id")[geo_columns + stockist_columns].sum()
            user_scheme_matrix_perturbed = user_scheme_matrix_perturbed.merge(
                user_features_perturbed, left_index=True, right_index=True, how="left"
            )

            # Prepare sparse matrix and retrain KNN
            user_scheme_sparse_perturbed = csr_matrix(user_scheme_matrix_perturbed.values)
            knn_model.fit(user_scheme_sparse_perturbed)

            # Generate New Recommendations
            perturbed_recommendations = {
                pid: recommend_user_based(pid) for pid in test_df["Partner_id"].unique()
            }

            # Convert to DataFrame
            perturbed_df = pd.DataFrame(
                [v for v in perturbed_recommendations.values() if v],
                columns=["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]
            ).set_index(["Partner_id", "Product_id"])

            # Compare Recommendations (Count only rows where all three schemes changed)
            scheme_columns = ["Scheme_1", "Scheme_2", "Scheme_3"]
            changed_recommendations = (baseline_df[scheme_columns] != perturbed_df[scheme_columns]).all(axis=1)
            changed_count = changed_recommendations.sum()

            # Normalize by total recommendations in baseline
            importance_scores[feature].append(changed_count / len(baseline_df))

    # Compute average importance score over multiple shuffles
    avg_importance_scores = {feature: np.mean(scores) for feature, scores in importance_scores.items()}

    return avg_importance_scores

# Compute Feature Importance for Geography & Stockist_Type
geo_importance = compute_feature_importance(geo_columns, num_shuffles=5, random_seed=42)
stockist_importance = compute_feature_importance(stockist_columns, num_shuffles=5, random_seed=42)

# Print Results
print("\nFeature Importance (Higher means more impact on recommendations):")
print("Geography Feature Importance:")
print(pd.Series(geo_importance).sort_values(ascending=False))

print("\nStockist_Type Feature Importance:")
print(pd.Series(stockist_importance).sort_values(ascending=False))



Feature Importance (Higher means more impact on recommendations):
Geography Feature Importance:
Geography_East     0.041584
Geography_West     0.039604
Geography_North    0.037624
Geography_South    0.031683
dtype: float64

Stockist_Type Feature Importance:
Stockist_Type_Retailer       0.053465
Stockist_Type_Wholesaler     0.039604
Stockist_Type_Distributor    0.027723
dtype: float64


In [17]:
# Convert to DataFrame for saving to CSV
geo_importance_df = pd.DataFrame(list(geo_importance.items()), columns=["Feature", "Importance"])
stockist_importance_df = pd.DataFrame(list(stockist_importance.items()), columns=["Feature", "Importance"])

# Save results to CSV files
geo_importance_df.to_csv("geo_feature_importance_imbalanced.csv", index=False)
stockist_importance_df.to_csv("stockist_feature_importance_imbalanced.csv", index=False)
