In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split


In [2]:
# ----------------------- Logging Setup -----------------------
# Root configuration: emit INFO and above, each record rendered as
# "timestamp — level — message".
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO,
)
# Named logger for this module; the functions in this file log through it.
logger = logging.getLogger(__name__)


In [3]:
# ----------------------- Functions -----------------------

def load_and_split_data(filepath, test_size=0.2, random_state=42):
    """Read a CSV file and split its rows into train and test partitions.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    logger.info("Loading data from file...")
    df = pd.read_csv(filepath)
    logger.info(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")

    # train_test_split returns [train, test] for a single input array.
    partitions = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
    )
    train_df, test_df = partitions
    logger.info(f"Data split into train ({len(train_df)}) and test ({len(test_df)}) sets.")
    return train_df, test_df

In [4]:
def create_scheme_matrix(train_df):
    """Build a binary scheme-by-product indicator matrix from the training rows.

    Args:
        train_df: DataFrame with at least the columns ``Scheme_Type`` and
            ``Product_id``.

    Returns:
        A DataFrame indexed by ``Scheme_Type`` with one column per product id
        seen by the binarizer; a cell is 1 when the scheme occurs with that
        product in ``train_df`` and 0 otherwise.
    """
    logger.info("Creating scheme-product association matrix...")

    # One set of product ids per scheme; the resulting Series is indexed by
    # scheme name, so it can supply both the labels and the row index.
    products_by_scheme = train_df.groupby('Scheme_Type')['Product_id'].apply(set)

    binarizer = MultiLabelBinarizer()
    indicator = binarizer.fit_transform(products_by_scheme)
    scheme_matrix = pd.DataFrame(
        indicator,
        index=products_by_scheme.index,
        columns=binarizer.classes_,
    )

    logger.info(f"Scheme matrix created with shape: {scheme_matrix.shape}")
    return scheme_matrix

In [None]:
# def preprocess_stockist_data(train_df):
#     """
#     Preprocess the training DataFrame by:
#     - One-hot encoding 'Geography' and 'Stockist_Type' columns.
#     - Validating that encoded columns exist.
#     - Replacing 0 in 'Sales_Value_Last_Period' to avoid log(0) errors.
#
#     Returns:
#         Processed DataFrame, list of geo columns, list of stockist type columns
#     """
#     # One-hot encode Geography and Stockist_Type
#     train_df = pd.get_dummies(train_df, columns=["Geography", "Stockist_Type"], dtype=int)

#     # Identify the newly created one-hot columns
#     geo_columns = [col for col in train_df.columns if col.startswith("Geography_")]
#     stockist_columns = [col for col in train_df.columns if col.startswith("Stockist_Type_")]

#     # Validation
#     if not geo_columns or not stockist_columns:
#         raise ValueError("One-hot encoding failed: 'Geography_' or 'Stockist_Type_' columns not found.")

#     # Replace zeros in Sales_Value_Last_Period to avoid log(0)
#     if "Sales_Value_Last_Period" in train_df.columns:
#         train_df["Sales_Value_Last_Period"] = train_df["Sales_Value_Last_Period"].replace(0, 1)

#     return train_df, geo_columns, stockist_columns


In [5]:
def compute_similarity_matrix(scheme_matrix):
    """Compute pairwise Jaccard similarity between the rows of ``scheme_matrix``.

    Each row is treated as a set membership vector (any non-zero cell counts
    as "present"). The similarity of two rows is
    ``|intersection| / |union|``; the diagonal is always 1.0, and a pair whose
    union is empty scores 0.0.

    Args:
        scheme_matrix: DataFrame with one row per scheme and one column per
            product, holding 0/1 (or otherwise truthy/falsy) indicators.

    Returns:
        A square float DataFrame whose index and columns are both
        ``scheme_matrix.index``.
    """
    logger.info("Computing Jaccard similarity matrix...")

    # Normalise to 0/1 integers so the matrix product counts shared members.
    membership = scheme_matrix.to_numpy().astype(bool).astype(np.int64)

    # shared[i, j] = number of products both schemes have;
    # union[i, j]  = |i| + |j| - shared[i, j] (inclusion-exclusion).
    shared = membership @ membership.T
    set_sizes = membership.sum(axis=1)
    union = set_sizes[:, None] + set_sizes[None, :] - shared

    # Divide only where the union is non-empty; other cells stay at 0.0.
    scores = np.divide(
        shared,
        union,
        out=np.zeros(shared.shape, dtype=float),
        where=union > 0,
    )
    # A scheme is always fully similar to itself, even if its row is all zeros.
    np.fill_diagonal(scores, 1.0)

    similarity_matrix = pd.DataFrame(
        scores,
        index=scheme_matrix.index,
        columns=scheme_matrix.index,
    )

    logger.info("Similarity matrix computation completed.")
    return similarity_matrix

In [6]:
def recommend_similar_schemes(similarity_matrix, scheme_name, top_n=3):
    """Return the names of the schemes most similar to ``scheme_name``.

    Args:
        similarity_matrix: Square DataFrame of pairwise similarities, indexed
            and columned by scheme name.
        scheme_name: The scheme to look up.
        top_n: Maximum number of neighbours to return.

    Returns:
        Up to ``top_n`` scheme names ordered from most to least similar,
        excluding ``scheme_name`` itself. An empty list (plus a warning in the
        log) when ``scheme_name`` is not in the matrix index.
    """
    is_known = scheme_name in similarity_matrix.index
    if not is_known:
        logger.warning(f"Scheme '{scheme_name}' not found in similarity matrix.")
        return []

    # Take the scheme's row, remove its own entry, then rank the rest.
    row = similarity_matrix.loc[scheme_name]
    others = row.drop(scheme_name)
    ranked = others.sort_values(ascending=False)
    return ranked.iloc[:top_n].index.tolist()


In [7]:
def generate_recommendations(test_df, similarity_matrix, top_n=3):
    """Recommend similar schemes for every distinct (product, scheme) pair in ``test_df``.

    For each pair whose scheme appears in ``similarity_matrix``, the scheme's
    row is ranked (excluding itself) and the ``top_n`` nearest schemes are
    reported together with the mean of their similarity scores.

    Args:
        test_df: DataFrame with at least ``Product_id`` and ``Scheme_Type``.
        similarity_matrix: Square DataFrame of pairwise scheme similarities,
            indexed and columned by scheme name.
        top_n: Number of neighbours to rank and average per scheme.

    Returns:
        A DataFrame with columns ``Product_id``, ``Similarity_Scores`` and
        ``Scheme_1`` .. ``Scheme_k`` where ``k = max(3, top_n)``. Unused
        scheme slots hold ``None``. The columns are present even when no
        recommendation could be generated.
    """
    logger.info("Generating scheme recommendations for test data...")

    # Always keep the three legacy Scheme_* columns and widen only when top_n
    # asks for more, so every neighbour feeding the averaged score is reported.
    slot_count = max(3, top_n)
    scheme_columns = [f"Scheme_{rank}" for rank in range(1, slot_count + 1)]
    output_columns = ["Product_id", "Similarity_Scores", *scheme_columns]

    # Collect the distinct pairs in a single pass instead of re-filtering the
    # whole frame per product. Missing product ids can never satisfy an
    # equality match, so they are skipped. Dict insertion order keeps products
    # (and the schemes within each) in first-appearance order.
    pairs = (
        test_df[["Product_id", "Scheme_Type"]]
        .dropna(subset=["Product_id"])
        .drop_duplicates()
    )
    schemes_by_product = {}
    for product, scheme in pairs.itertuples(index=False, name=None):
        schemes_by_product.setdefault(product, []).append(scheme)

    # The ranking depends only on the scheme, so compute it once per scheme.
    ranked_by_scheme = {}
    recommendations = []

    for product, product_schemes in schemes_by_product.items():
        for scheme in product_schemes:
            if scheme not in similarity_matrix.index:
                logger.debug(f"Scheme '{scheme}' for product '{product}' not in training similarity matrix.")
                continue

            if scheme not in ranked_by_scheme:
                nearest = (
                    similarity_matrix.loc[scheme]
                    .drop(scheme)
                    .sort_values(ascending=False)
                    .head(top_n)
                )
                neighbour_names = nearest.index.tolist()
                # Pad so that every record fills every Scheme_* column.
                neighbour_names += [None] * (slot_count - len(neighbour_names))
                ranked_by_scheme[scheme] = (round(nearest.mean(), 6), neighbour_names)

            mean_score, neighbour_names = ranked_by_scheme[scheme]
            record = {"Product_id": product, "Similarity_Scores": mean_score}
            record.update(zip(scheme_columns, neighbour_names))
            recommendations.append(record)

    logger.info(f"Total recommendations generated: {len(recommendations)}")
    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(recommendations, columns=output_columns)


In [8]:
def save_recommendations(recommendation_df, output_path):
    """Write the recommendations table to ``output_path`` as CSV.

    The DataFrame index is not written. Progress is logged before and after
    the file is written.

    Args:
        recommendation_df: DataFrame to persist.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    logger.info(f"Saving recommendations to {output_path}...")
    recommendation_df.to_csv(path_or_buf=output_path, index=False)
    logger.info("Recommendations saved successfully.")

In [9]:
# ----------------------- Main Pipeline -----------------------

def main(filepath, output_path="Scheme_Recommendations.csv"):
    """Run the whole pipeline: load, split, build matrices, recommend, save.

    Any exception raised by a stage is logged with its traceback and
    suppressed, so the call itself never raises; the return value tells the
    caller whether the run succeeded.

    Args:
        filepath: Path of the input CSV.
        output_path: Path of the CSV the recommendations are written to.

    Returns:
        The recommendations DataFrame on success, or ``None`` if any stage
        failed.
    """
    try:
        train_df, test_df = load_and_split_data(filepath)
        scheme_matrix = create_scheme_matrix(train_df)
        similarity_matrix = compute_similarity_matrix(scheme_matrix)
        recommendation_df = generate_recommendations(test_df, similarity_matrix)
        save_recommendations(recommendation_df, output_path)
    except Exception as e:
        # Best-effort by design: report the failure instead of propagating it.
        logger.exception(f"An error occurred during pipeline execution: {e}")
        return None
    else:
        logger.info("Pipeline executed successfully.")
        # Hand the result back so callers can inspect it without re-reading the CSV.
        return recommendation_df


In [11]:
# ----------------------- Script Entry Point -----------------------

# Run the pipeline only when this code executes as the top-level module.
# The input CSV name is hard-coded here; the output path is left to main()'s
# default ("Scheme_Recommendations.csv").
if __name__ == "__main__":
    main("Augmented_Stockist_data.csv")

2025-04-07 08:30:55,643 — INFO — Loading data from file...
2025-04-07 08:30:55,688 — INFO — Dataset loaded with 15090 rows and 13 columns.
2025-04-07 08:30:55,693 — INFO — Data split into train (12072) and test (3018) sets.
2025-04-07 08:30:55,694 — INFO — Creating scheme-product association matrix...
2025-04-07 08:30:55,700 — INFO — Scheme matrix created with shape: (4, 15)
2025-04-07 08:30:55,700 — INFO — Computing Jaccard similarity matrix...
2025-04-07 08:30:55,738 — INFO — Similarity matrix computation completed.
2025-04-07 08:30:55,740 — INFO — Generating scheme recommendations for test data...
2025-04-07 08:30:55,784 — INFO — Total recommendations generated: 30
2025-04-07 08:30:55,785 — INFO — Saving recommendations to Scheme_Recommendations.csv...
2025-04-07 08:30:55,793 — INFO — Recommendations saved successfully.
2025-04-07 08:30:55,794 — INFO — Pipeline executed successfully.
