In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix


In [2]:
# ----------------------- Logging Setup -----------------------
# Records are rendered as "<timestamp> — <level> — <message>" at INFO and above.
LOG_FORMAT = "%(asctime)s — %(levelname)s — %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the pipeline functions in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# ----------------------- Functions -----------------------

def load_and_prepare_data(filepath):
    """
    Read the input CSV and attach an ``Engagement_Score`` column.

    The score is
    ``log1p(Sales_Value_Last_Period) * (Feedback_Score + Growth_Percentage)``.

    Parameters:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The loaded rows with the extra ``Engagement_Score`` column.
    """
    logger.info("Loading dataset...")
    data = pd.read_csv(filepath)

    logger.info("Calculating Engagement Score...")
    # log1p(x) = log(1 + x), so a sales value of zero maps to zero.
    damped_sales = np.log1p(data["Sales_Value_Last_Period"])
    feedback_plus_growth = data["Feedback_Score"] + data["Growth_Percentage"]
    data["Engagement_Score"] = damped_sales * feedback_plus_growth
    logger.info("Engagement Score calculated.")
    return data

In [4]:
def split_data(df, test_size=0.2, random_state=42, stratify_col="Partner_id"):
    """
    Split the dataset into train and test partitions.

    The split is stratified on ``stratify_col`` when every label has at least
    two rows. scikit-learn rejects stratification on a label that occurs only
    once, so in that case the split falls back to a plain random split and a
    warning is logged.

    Parameters:
        df (pd.DataFrame): Full dataset; must contain ``stratify_col``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).
        stratify_col (str): Column whose values are used as stratification labels.

    Returns:
        tuple: ``(train_df, test_df)`` as returned by ``train_test_split``.
    """
    logger.info("Splitting data into train and test sets...")

    stratify_labels = df[stratify_col]
    label_counts = stratify_labels.value_counts()
    # An empty frame is passed through unchanged so train_test_split reports it.
    if not label_counts.empty and label_counts.min() < 2:
        logger.warning(
            f"Column '{stratify_col}' has labels with fewer than 2 rows; "
            "falling back to an unstratified split."
        )
        stratify_labels = None

    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=stratify_labels
    )
    logger.info(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
    return train_df, test_df

In [5]:
# def encode_and_clean_stockist_data(df):
#     """
#     One-hot encodes 'Geography' and 'Stockist_Type' columns,
#     validates their presence post-encoding, and ensures that
#     'Sales_Value_Last_Period' has no zeros to prevent log(0) errors.

#     Parameters:
#         df (pd.DataFrame): Input DataFrame.

#     Returns:
#         pd.DataFrame: Preprocessed DataFrame.
#         list: Encoded geography column names.
#         list: Encoded stockist type column names.
#     """
#     # One-hot encoding
#     df = pd.get_dummies(df, columns=["Geography", "Stockist_Type"], dtype=int)

#     # Identify encoded columns
#     geo_columns = [col for col in df.columns if col.startswith("Geography_")]
#     stockist_columns = [col for col in df.columns if col.startswith("Stockist_Type_")]

#     # Validate that the encoding worked
#     if not geo_columns or not stockist_columns:
#         raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")

#     # Handle potential zero values in sales
#     if "Sales_Value_Last_Period" in df.columns:
#         df["Sales_Value_Last_Period"] = df["Sales_Value_Last_Period"].replace(0, 1)

#     return df, geo_columns, stockist_columns


In [6]:
# def create_user_scheme_matrix(train_df):
#     logger.info("Creating user-scheme matrix...")
#     matrix = train_df.pivot_table(
#         index="Partner_id", columns="Scheme_Type", values="Engagement_Score", aggfunc="sum", fill_value=0
#     )
#     logger.info(f"User-Scheme matrix shape: {matrix.shape}")
#     return matrix
def create_user_scheme_matrix(train_df):
    """
    Pivot the training rows into a partner-by-scheme matrix.

    Each cell holds the mean ``Engagement_Score`` of one ``Partner_id`` for one
    ``Scheme_Type``; combinations with no rows are filled with 0.

    Parameters:
        train_df (pd.DataFrame): Training rows with ``Partner_id``,
            ``Scheme_Type`` and ``Engagement_Score`` columns.

    Returns:
        pd.DataFrame: Matrix indexed by ``Partner_id`` with one column per ``Scheme_Type``.
    """
    logger.info("Creating user-scheme matrix using average engagement score...")
    pivot_options = {
        "index": "Partner_id",
        "columns": "Scheme_Type",
        "values": "Engagement_Score",
        "aggfunc": "mean",
        "fill_value": 0,
    }
    user_scheme = pd.pivot_table(train_df, **pivot_options)
    logger.info(f"User-Scheme matrix shape: {user_scheme.shape}")
    return user_scheme



In [7]:
def fit_knn_model(matrix):
    """
    Fit a cosine-distance nearest-neighbour model on the partner-by-scheme matrix.

    Parameters:
        matrix (pd.DataFrame): Matrix whose index holds the partner ids and
            whose values are the feature rows.

    Returns:
        tuple: ``(model, sparse_features, partner_ids)``: the fitted
        ``NearestNeighbors`` instance, the CSR matrix it was fitted on, and the
        matrix index as a list (same row order as the CSR matrix).
    """
    logger.info("Preparing sparse matrix and fitting NearestNeighbors model...")
    partner_ids = list(matrix.index)
    features = csr_matrix(matrix.values)

    model = NearestNeighbors(algorithm='brute', metric='cosine')
    model.fit(features)
    logger.info("KNN model fitted.")
    return model, features, partner_ids


In [8]:
def recommend_user_based(partner_id, train_df, matrix, sparse_matrix, knn_model, id_lookup, top_n=3, n_schemes=3):
    """
    Recommend schemes for one partner by copying its nearest neighbour's favourites.

    The partner's row is looked up in ``sparse_matrix`` and ``knn_model`` is
    queried for neighbours. Distances are converted to similarities as
    ``1 - distance``. The first returned neighbour that is not the partner
    itself is used, and that neighbour's most frequent ``Scheme_Type`` values
    in ``train_df`` become the recommendation.

    Parameters:
        partner_id: Identifier of the partner to recommend for.
        train_df (pd.DataFrame): Training rows with ``Partner_id``,
            ``Scheme_Type`` and ``Product_id`` columns.
        matrix (pd.DataFrame): Partner-by-scheme matrix; its index defines the
            known partners.
        sparse_matrix: Row-indexable feature matrix aligned with ``id_lookup``.
        knn_model: Fitted object exposing ``kneighbors(X, n_neighbors=...)``.
        id_lookup (list): Partner ids in the row order of ``sparse_matrix``.
        top_n (int): Number of neighbours requested from the model, plus one
            for the partner itself and capped at the matrix size. Only the
            first non-self neighbour is used.
        n_schemes (int): Number of scheme slots in the result (default 3).
            Missing slots are padded with ``"No Scheme"``.

    Returns:
        list | None: ``[partner_id, product, similarity, scheme_1, ..., scheme_n]``,
        or ``None`` when the partner is not in ``matrix`` or no other
        neighbour is returned.
    """
    if partner_id not in matrix.index:
        logger.warning(f"Partner ID {partner_id} not found in training data.")
        return None

    idx = id_lookup.index(partner_id)
    distances, indices = knn_model.kneighbors(
        sparse_matrix[idx], n_neighbors=min(top_n + 1, len(matrix))
    )
    similarities = 1 - distances.flatten()
    neighbors = indices.flatten()

    # The query row is part of the fitted data, so drop it from its own neighbours.
    filtered = [(i, sim) for i, sim in zip(neighbors, similarities) if i != idx]
    if not filtered:
        logger.debug(f"No similar users found for Partner ID {partner_id}.")
        return None

    top_idx, sim_score = filtered[0]
    similar_user = id_lookup[top_idx]
    sim_score = round(sim_score, 6)

    partner_col = train_df["Partner_id"]
    top_schemes = (
        train_df.loc[partner_col == similar_user, "Scheme_Type"]
        .value_counts().head(n_schemes).index.tolist()
    )
    # Pad so the result always has exactly n_schemes scheme slots.
    top_schemes += ["No Scheme"] * (n_schemes - len(top_schemes))

    # First product seen for the partner in the training rows.
    product = train_df.loc[partner_col == partner_id, "Product_id"].unique()[0]

    return [partner_id, product, sim_score, *top_schemes]


In [9]:
def generate_user_recommendations(test_df, train_df, matrix, sparse_matrix, knn_model, id_lookup):
    """
    Build the recommendation table for every distinct partner in the test set.

    Each unique ``Partner_id`` in ``test_df`` is passed to
    ``recommend_user_based``; partners for which it returns nothing are left
    out of the result.

    Parameters:
        test_df (pd.DataFrame): Test rows; only ``Partner_id`` is read here.
        train_df, matrix, sparse_matrix, knn_model, id_lookup: Forwarded
            unchanged to ``recommend_user_based``.

    Returns:
        pd.DataFrame: One row per recommended partner with columns
        ``Partner_id``, ``Product_id``, ``Similarity_Score``, ``Scheme_1``,
        ``Scheme_2`` and ``Scheme_3``.
    """
    logger.info("Generating recommendations for test users...")
    output_columns = ["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]

    candidates = (
        recommend_user_based(pid, train_df, matrix, sparse_matrix, knn_model, id_lookup)
        for pid in test_df["Partner_id"].unique()
    )
    rows = [candidate for candidate in candidates if candidate]

    logger.info(f"Generated {len(rows)} recommendations.")
    return pd.DataFrame(rows, columns=output_columns)


In [10]:
def save_recommendations(df, output_path):
    """
    Write the recommendation table to a CSV file without the index column.

    Parameters:
        df (pd.DataFrame): Recommendations to persist.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    start_message = f"Saving recommendations to {output_path}..."
    logger.info(start_message)
    df.to_csv(output_path, index=False)
    logger.info("Recommendations saved successfully.")


In [11]:
# ----------------------- Main Pipeline -----------------------

def main(filepath, output_path="user_based_recommendations_enhanced.csv", test_output_path=None):
    """
    Run the user-based recommendation pipeline end to end.

    Steps: load and score the data, split it into train/test, build the
    partner-by-scheme matrix, fit the nearest-neighbour model, generate
    recommendations for the test partners and write them to ``output_path``.

    Parameters:
        filepath: Input CSV passed to ``load_and_prepare_data``.
        output_path: Destination CSV for the recommendations.
        test_output_path: Optional destination CSV for the held-out test
            split. When given, the split is written there (without the index)
            so later evaluation can use exactly the rows this run held out.
            When ``None`` (default) the split is not persisted.

    Returns:
        pd.DataFrame | None: The recommendation table, or ``None`` when any
        step raised. The failure is logged with its traceback and not re-raised.
    """
    try:
        df = load_and_prepare_data(filepath)
        train_df, test_df = split_data(df)
        if test_output_path is not None:
            logger.info(f"Saving test split to {test_output_path}...")
            test_df.to_csv(test_output_path, index=False)
        matrix = create_user_scheme_matrix(train_df)
        knn_model, sparse_matrix, id_lookup = fit_knn_model(matrix)
        recommendation_df = generate_user_recommendations(test_df, train_df, matrix, sparse_matrix, knn_model, id_lookup)
        save_recommendations(recommendation_df, output_path)
        logger.info("User-based recommendation pipeline completed successfully.")
        return recommendation_df
    except Exception as e:
        # Deliberately best-effort: report the failure but do not propagate it.
        logger.exception(f"Pipeline failed: {e}")
        return None

# ----------------------- Script Entry Point -----------------------

# Run the pipeline on "Augmented_Stockist_Data.csv" when this code is executed
# directly. Recommendations are written to main()'s default output path.
# main() logs failures instead of raising, so check the log output to confirm
# the run actually succeeded.
if __name__ == "__main__":
    main("Augmented_Stockist_Data.csv")


2025-04-17 13:09:27,644 — INFO — Loading dataset...
2025-04-17 13:09:27,697 — INFO — Calculating Engagement Score...
2025-04-17 13:09:27,700 — INFO — Engagement Score calculated.
2025-04-17 13:09:27,701 — INFO — Splitting data into train and test sets...
2025-04-17 13:09:27,730 — INFO — Train size: 16000, Test size: 4000
2025-04-17 13:09:27,731 — INFO — Creating user-scheme matrix using average engagement score...
2025-04-17 13:09:27,749 — INFO — User-Scheme matrix shape: (101, 5)
2025-04-17 13:09:27,750 — INFO — Preparing sparse matrix and fitting NearestNeighbors model...
2025-04-17 13:09:27,751 — INFO — KNN model fitted.
2025-04-17 13:09:27,753 — INFO — Generating recommendations for test users...
2025-04-17 13:09:28,359 — INFO — Generated 101 recommendations.
2025-04-17 13:09:28,361 — INFO — Saving recommendations to user_based_recommendations_enhanced.csv...
2025-04-17 13:09:28,375 — INFO — Recommendations saved successfully.
2025-04-17 13:09:28,376 — INFO — User-based recommendat

Evaluation Code

In [12]:
# Import necessary library
import pandas as pd


In [13]:
def prepare_scheme_data(test_path: str, reco_path: str) -> pd.DataFrame:
    """
    Build one row per test partner holding its availed and recommended schemes.

    Parameters:
        test_path (str): CSV of test rows with ``Partner_id`` and
            ``Scheme_Type`` columns (one row per availed scheme).
        reco_path (str): CSV of recommendations with ``Partner_id``,
            ``Scheme_1``, ``Scheme_2`` and ``Scheme_3`` columns.

    Returns:
        pd.DataFrame: Columns ``Partner_id``, ``Availed_Schemes`` and
        ``Recommended_Schemes``. Both scheme columns always hold lists; a
        partner without a recommendation row gets an empty list.
    """
    def _as_list(value):
        # Unmatched partners come out of the left merge as NaN, not a list.
        return value if isinstance(value, list) else []

    scheme_cols = ['Scheme_1', 'Scheme_2', 'Scheme_3']

    held_out = pd.read_csv(test_path)
    recs = pd.read_csv(reco_path)

    # Every scheme each partner availed in the test data, duplicates kept.
    availed = (
        held_out.groupby("Partner_id")["Scheme_Type"]
        .apply(list)
        .rename("Availed_Schemes")
        .reset_index()
    )

    # Collapse the three scheme columns into one ordered list per partner.
    recs_slim = recs[['Partner_id']].copy()
    recs_slim['Recommended_Schemes'] = recs[scheme_cols].values.tolist()

    # Left merge keeps every test partner, recommended or not.
    merged = availed.merge(recs_slim, on='Partner_id', how='left')

    for list_col in ('Availed_Schemes', 'Recommended_Schemes'):
        merged[list_col] = merged[list_col].apply(_as_list)
    return merged
    


In [14]:
def evaluate_per_scheme(df: pd.DataFrame, recommendation_col: str = 'Recommended_Schemes', k_list=(1, 2, 3)) -> list:
    """
    Compute Top-K precision, recall and F1, averaged over partners.

    For every row with at least one availed scheme, the first ``k`` entries of
    the recommendation list are compared with the set of availed schemes:
    ``precision = hits / k`` and ``recall = hits / len(availed set)``. The
    per-row values are averaged. F1 is the harmonic mean of the rounded
    average precision and average recall, not an average of per-row F1 scores.

    Parameters:
        df (pd.DataFrame): Must contain an ``Availed_Schemes`` column and the
            ``recommendation_col`` column, both holding lists.
        recommendation_col (str): Column name of the recommendation list.
        k_list: Iterable of positive Top-K values to evaluate.

    Returns:
        list: One dictionary per K with keys ``"Top-K"``, ``"Avg Precision"``,
        ``"Avg Recall"`` and ``"Avg F1 Score"`` (values rounded to 4 decimals).

    Raises:
        ValueError: If a value in ``k_list`` is not positive.
    """
    # Build each ground-truth set once rather than once per K. Rows without
    # ground truth are skipped.
    scored_rows = []
    for availed, recommended in zip(df['Availed_Schemes'], df[recommendation_col]):
        actual_set = set(availed)
        if actual_set:
            scored_rows.append((actual_set, recommended))

    results = []
    for k in k_list:
        if k <= 0:
            raise ValueError(f"Top-K values must be positive, got {k}")

        precision_list = []
        recall_list = []
        for actual_set, recommended in scored_rows:
            # Slice the ordered list (not a set) so the first k entries are the top k.
            tp = sum(1 for scheme in recommended[:k] if scheme in actual_set)
            precision_list.append(tp / k)
            recall_list.append(tp / len(actual_set))

        # Average metrics
        avg_precision = round(sum(precision_list) / len(precision_list), 4) if precision_list else 0
        avg_recall = round(sum(recall_list) / len(recall_list), 4) if recall_list else 0
        f1 = round(2 * avg_precision * avg_recall / (avg_precision + avg_recall), 4) if (avg_precision + avg_recall) else 0

        results.append({
            "Top-K": k,
            "Avg Precision": avg_precision,
            "Avg Recall": avg_recall,
            "Avg F1 Score": f1
        })
    return results

In [15]:
# Load and prepare the data
# NOTE(review): no visible cell writes "test_data.csv"; the pipeline's main()
# only saves the recommendations CSV. Presumably this file is the held-out
# split returned by split_data(). Confirm, and export it from the pipeline so
# this cell works after Restart & Run All in a clean directory.
test_path = "test_data.csv"
reco_path = "user_based_recommendations_enhanced.csv"

# One row per test partner: availed schemes next to the recommended schemes.
df_all = prepare_scheme_data(test_path, reco_path)

# Run per-scheme evaluation (no filtering of availed schemes)
results = evaluate_per_scheme(df_all, recommendation_col='Recommended_Schemes')


In [16]:
# Display the Top-K metrics returned by evaluate_per_scheme.
# NOTE(review): the pipeline log reports a (101, 5) partner-by-scheme matrix and
# 4000 test rows, so each partner has presumably availed most of the 5 scheme
# types in the test data. If so, near-1.0 precision is expected for almost any
# recommendation; compare against a random or most-popular baseline to confirm.
results

[{'Top-K': 1,
  'Avg Precision': 0.9802,
  'Avg Recall': 0.2452,
  'Avg F1 Score': 0.3923},
 {'Top-K': 2,
  'Avg Precision': 0.9851,
  'Avg Recall': 0.4937,
  'Avg F1 Score': 0.6578},
 {'Top-K': 3,
  'Avg Precision': 0.9901,
  'Avg Recall': 0.7455,
  'Avg F1 Score': 0.8506}]