In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix


In [2]:
# ----------------------- Logging Setup -----------------------
# Configure the root logger once: INFO and above, one timestamped line per record.
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO,
)
# Module-level logger used by the pipeline functions in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# ----------------------- Functions -----------------------

def load_and_prepare_data(filepath):
    """
    Read the dataset and append an 'Engagement_Score' column.

    The score is log1p(Sales_Value_Last_Period) multiplied by the sum of
    Feedback_Score and Growth_Percentage.

    Parameters:
        filepath: Path or file-like object handed to pandas.read_csv.

    Returns:
        pd.DataFrame: The loaded data with the extra 'Engagement_Score' column.
    """
    logger.info("Loading dataset...")
    stockists = pd.read_csv(filepath)

    logger.info("Calculating Engagement Score...")
    # log1p keeps zero sales at a score of 0 instead of producing -inf.
    log_sales = np.log1p(stockists["Sales_Value_Last_Period"])
    feedback_plus_growth = stockists["Feedback_Score"] + stockists["Growth_Percentage"]
    stockists["Engagement_Score"] = log_sales * feedback_plus_growth
    logger.info("Engagement Score calculated.")
    return stockists

In [4]:
def split_data(df, test_size=0.2, random_state=42):
    """
    Split the dataset into train and test sets, stratified by 'Partner_id'.

    Stratification keeps each partner's share of rows roughly equal in both
    splits. scikit-learn raises ValueError when that is impossible (for
    example a partner with a single row, or a test split smaller than the
    number of partners); in that case the split is retried without
    stratification so the pipeline can still run.

    Parameters:
        df (pd.DataFrame): Input data; must contain a 'Partner_id' column.
        test_size (float | int): Passed to train_test_split (default 0.2).
        random_state (int | None): Seed passed to train_test_split (default 42).

    Returns:
        tuple: (train_df, test_df)

    Raises:
        ValueError: If the non-stratified retry is rejected by train_test_split as well.
    """
    logger.info("Splitting data into train and test sets...")
    try:
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state, stratify=df["Partner_id"]
        )
    except ValueError as err:
        # Partners missing from the train split are skipped later with a warning,
        # so a plain random split is a safe degradation.
        logger.warning(f"Stratified split failed ({err}); falling back to a non-stratified split.")
        train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    logger.info(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
    return train_df, test_df

In [None]:
# def encode_and_clean_stockist_data(df):
#     """
#     One-hot encodes 'Geography' and 'Stockist_Type' columns,
#     validates their presence post-encoding, and ensures that
#     'Sales_Value_Last_Period' has no zeros to prevent log(0) errors.

#     Parameters:
#         df (pd.DataFrame): Input DataFrame.

#     Returns:
#         pd.DataFrame: Preprocessed DataFrame.
#         list: Encoded geography column names.
#         list: Encoded stockist type column names.
#     """
#     # One-hot encoding
#     df = pd.get_dummies(df, columns=["Geography", "Stockist_Type"], dtype=int)

#     # Identify encoded columns
#     geo_columns = [col for col in df.columns if col.startswith("Geography_")]
#     stockist_columns = [col for col in df.columns if col.startswith("Stockist_Type_")]

#     # Validate that the encoding worked
#     if not geo_columns or not stockist_columns:
#         raise ValueError("No Geography or Stockist_Type features found after encoding! Check encoding step.")

#     # Handle potential zero values in sales
#     if "Sales_Value_Last_Period" in df.columns:
#         df["Sales_Value_Last_Period"] = df["Sales_Value_Last_Period"].replace(0, 1)

#     return df, geo_columns, stockist_columns


In [5]:
def create_user_scheme_matrix(train_df):
    """
    Build the partner-by-scheme engagement matrix.

    Rows are 'Partner_id' values, columns are 'Scheme_Type' values, and each
    cell holds the summed 'Engagement_Score' for that pair (0 where the
    partner has no rows for the scheme).

    Parameters:
        train_df (pd.DataFrame): Training rows with the three columns above.

    Returns:
        pd.DataFrame: The pivoted matrix.
    """
    logger.info("Creating user-scheme matrix...")
    pivot_spec = {
        "index": "Partner_id",
        "columns": "Scheme_Type",
        "values": "Engagement_Score",
        "aggfunc": "sum",
        "fill_value": 0,
    }
    user_scheme = pd.pivot_table(train_df, **pivot_spec)
    logger.info(f"User-Scheme matrix shape: {user_scheme.shape}")
    return user_scheme


In [6]:
def fit_knn_model(matrix):
    """
    Fit a brute-force cosine NearestNeighbors model on the user-scheme matrix.

    Parameters:
        matrix (pd.DataFrame): Partner-by-scheme matrix; its index supplies the partner ids.

    Returns:
        tuple: (fitted model, CSR copy of the matrix values, list of partner ids
        in matrix row order).
    """
    logger.info("Preparing sparse matrix and fitting NearestNeighbors model...")
    # Row i of the CSR matrix corresponds to partner_ids[i].
    partner_ids = list(matrix.index)
    scores_csr = csr_matrix(matrix.values)

    model = NearestNeighbors(algorithm='brute', metric='cosine')
    model.fit(scores_csr)
    logger.info("KNN model fitted.")
    return model, scores_csr, partner_ids


In [7]:
def recommend_user_based(partner_id, train_df, matrix, sparse_matrix, knn_model, id_lookup, top_n=3):
    """
    Recommend schemes for one partner based on its single most similar partner.

    Parameters:
        partner_id: Partner to recommend for.
        train_df (pd.DataFrame): Training rows with 'Partner_id', 'Scheme_Type' and 'Product_id'.
        matrix (pd.DataFrame): User-scheme matrix indexed by partner id.
        sparse_matrix: Row-indexable matrix whose rows follow id_lookup order.
        knn_model: Fitted model exposing kneighbors(X, n_neighbors=...).
        id_lookup (list): Partner ids in matrix row order.
        top_n (int): Number of neighbours requested besides the partner itself.

    Returns:
        list | None: [partner_id, product, similarity, scheme_1, scheme_2, scheme_3],
        or None when the partner is unknown or has no neighbour other than itself.
    """
    if partner_id not in matrix.index:
        logger.warning(f"Partner ID {partner_id} not found in training data.")
        return None

    row_pos = id_lookup.index(partner_id)
    # Ask for one extra neighbour because the query row normally matches itself.
    k = min(top_n + 1, len(matrix))
    distances, indices = knn_model.kneighbors(sparse_matrix[row_pos], n_neighbors=k)

    # Similarity is computed as 1 - distance; take the first neighbour that is not the query row.
    candidates = zip(indices.flatten(), 1 - distances.flatten())
    best = next(((pos, sim) for pos, sim in candidates if pos != row_pos), None)
    if best is None:
        logger.debug(f"No similar users found for Partner ID {partner_id}.")
        return None

    neighbour_pos, similarity = best
    neighbour_id = id_lookup[neighbour_pos]

    # The neighbour's three most frequent schemes, padded to exactly three entries.
    neighbour_rows = train_df[train_df["Partner_id"] == neighbour_id]
    schemes = neighbour_rows["Scheme_Type"].value_counts().head(3).index.tolist()
    schemes.extend(["No Scheme"] * (3 - len(schemes)))

    # First distinct product seen for the queried partner in the training rows.
    own_rows = train_df[train_df["Partner_id"] == partner_id]
    product = own_rows["Product_id"].unique()[0]

    return [partner_id, product, round(similarity, 6), *schemes]


In [8]:
def generate_user_recommendations(test_df, train_df, matrix, sparse_matrix, knn_model, id_lookup):
    """
    Produce one recommendation row per distinct partner in the test set.

    Partners for which recommend_user_based returns nothing are skipped.

    Parameters:
        test_df (pd.DataFrame): Test rows; only 'Partner_id' is read here.
        train_df, matrix, sparse_matrix, knn_model, id_lookup: Forwarded
            unchanged to recommend_user_based.

    Returns:
        pd.DataFrame: Columns Partner_id, Product_id, Similarity_Score,
        Scheme_1, Scheme_2, Scheme_3.
    """
    logger.info("Generating recommendations for test users...")
    output_columns = ["Partner_id", "Product_id", "Similarity_Score", "Scheme_1", "Scheme_2", "Scheme_3"]

    candidates = (
        recommend_user_based(pid, train_df, matrix, sparse_matrix, knn_model, id_lookup)
        for pid in test_df["Partner_id"].unique()
    )
    # Keep only partners that actually produced a recommendation.
    rows = [rec for rec in candidates if rec]

    logger.info(f"Generated {len(rows)} recommendations.")
    return pd.DataFrame(rows, columns=output_columns)


In [9]:
def save_recommendations(df, output_path):
    """
    Write the recommendations to CSV without the DataFrame index.

    Parameters:
        df (pd.DataFrame): Recommendations to persist.
        output_path: Destination handed to DataFrame.to_csv.
    """
    logger.info(f"Saving recommendations to {output_path}...")
    df.to_csv(path_or_buf=output_path, index=False)
    logger.info("Recommendations saved successfully.")


In [12]:
# ----------------------- Main Pipeline -----------------------

def main(filepath, output_path="user_based_recommendations_enhanced.csv"):
    """
    Run the user-based recommendation pipeline end to end.

    Stages: load and score the data, split it, build the user-scheme matrix,
    fit the KNN model, generate recommendations for the test partners and
    save them as CSV. Any exception is logged with its traceback and
    swallowed, so the function always returns None.

    Parameters:
        filepath: Input CSV passed to load_and_prepare_data.
        output_path: Destination CSV passed to save_recommendations.
    """
    try:
        train_df, test_df = split_data(load_and_prepare_data(filepath))
        user_scheme = create_user_scheme_matrix(train_df)
        fitted = fit_knn_model(user_scheme)
        knn_model, sparse_matrix, id_lookup = fitted
        recommendations = generate_user_recommendations(
            test_df, train_df, user_scheme, sparse_matrix, knn_model, id_lookup
        )
        save_recommendations(recommendations, output_path)
        logger.info("User-based recommendation pipeline completed successfully.")
    except Exception as e:
        # Best-effort run: record the failure with its traceback instead of raising.
        logger.exception(f"Pipeline failed: {e}")

# ----------------------- Script Entry Point -----------------------

if __name__ == "__main__":
    # Run the whole pipeline on the default input CSV; the output file name
    # comes from main()'s default output_path.
    main("New_Stockist_Data.csv")


2025-04-02 08:46:38,036 — INFO — Loading dataset...
2025-04-02 08:46:38,073 — INFO — Calculating Engagement Score...
2025-04-02 08:46:38,075 — INFO — Engagement Score calculated.
2025-04-02 08:46:38,075 — INFO — Splitting data into train and test sets...
2025-04-02 08:46:38,089 — INFO — Train size: 8000, Test size: 2000
2025-04-02 08:46:38,090 — INFO — Creating user-scheme matrix...
2025-04-02 08:46:38,176 — INFO — User-Scheme matrix shape: (3, 4)
2025-04-02 08:46:38,177 — INFO — Preparing sparse matrix and fitting NearestNeighbors model...
2025-04-02 08:46:38,178 — INFO — KNN model fitted.
2025-04-02 08:46:38,179 — INFO — Generating recommendations for test users...
2025-04-02 08:46:38,276 — INFO — Generated 3 recommendations.
2025-04-02 08:46:38,277 — INFO — Saving recommendations to user_based_recommendations_enhanced.csv...
2025-04-02 08:46:38,286 — INFO — Recommendations saved successfully.
2025-04-02 08:46:38,288 — INFO — User-based recommendation pipeline completed successfully.