In [22]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

warnings.filterwarnings('ignore')


In [23]:
# Load data
df = pd.read_csv("stockist_data.csv")


In [24]:
# Define product columns (assumed to be in binary form)
product_cols = [
    "AIS(Air Insulated Switchgear)", "RMU(Ring Main Unit)", "PSS(Compact Sub-Stations)",
    "VCU(Vacuum Contactor Units)", "E-House", "VCB(Vacuum Circuit Breaker)",
    "ACB(Air Circuit Breaker)", "MCCB(Moduled Case Circuit Breaker)",
    "SDF(Switch Disconnectors)", "BBT(Busbar Trunking)", "Modular Switches"
]

In [25]:
# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [26]:
# Save train and test datasets to CSV
train_df.to_csv("train_data.csv", index=False)  # Save training data
test_df.to_csv("test_data.csv", index=False)    # Save testing data

print("Train and test data saved successfully.")


Train and test data saved successfully.


In [27]:
# Convert product purchase data to True/False format (needed because Jaccard similarity works with binary data)
df_products_train = train_df[product_cols].astype(bool)


In [28]:
# Compute Jaccard Similarity between products
# Convert the DataFrame to a NumPy array for faster computation
df_products_np = df_products_train.values
# Calculate Jaccard similarity between products (columns) based on purchase patterns
# '1 - distance' is used because pairwise_distances gives dissimilarity by default
jaccard_product_sim = 1 - pairwise_distances(df_products_np.T, metric="jaccard")


In [29]:
df_products_np

array([[ True, False, False, ..., False,  True, False],
       [ True, False,  True, ..., False,  True,  True],
       [ True, False, False, ..., False, False,  True],
       ...,
       [False,  True,  True, ..., False,  True, False],
       [False,  True, False, ..., False, False, False],
       [ True, False,  True, ..., False, False, False]])

In [30]:
# Convert to DataFrame for better readability
# Create a DataFrame from the Jaccard similarity matrix
# Set product names as row and column labels for easy interpretation
product_similarity_df = pd.DataFrame(jaccard_product_sim, index=product_cols, columns=product_cols)
# Display the product similarity matrix
print(product_similarity_df)

                                    AIS(Air Insulated Switchgear)  \
AIS(Air Insulated Switchgear)                            1.000000   
RMU(Ring Main Unit)                                      0.350498   
PSS(Compact Sub-Stations)                                0.351396   
VCU(Vacuum Contactor Units)                              0.333861   
E-House                                                  0.384615   
VCB(Vacuum Circuit Breaker)                              0.331683   
ACB(Air Circuit Breaker)                                 0.369355   
MCCB(Moduled Case Circuit Breaker)                       0.344156   
SDF(Switch Disconnectors)                                0.361874   
BBT(Busbar Trunking)                                     0.330017   
Modular Switches                                         0.325000   

                                    RMU(Ring Main Unit)  \
AIS(Air Insulated Switchgear)                  0.350498   
RMU(Ring Main Unit)                            1.0000

In [31]:
# Function to get top 3 similar products for a given product
# Check if the product exists in the similarity matrix
def get_top3_products(product_name):
    """Returns top 3 most similar products for a given product using Jaccard similarity."""
    if product_name not in product_similarity_df.index:
        return ["Product not found"]
       # Sort products by similarity in descending order and return the top 3 (excluding itself) 
    return list(product_similarity_df[product_name].sort_values(ascending=False)[1:4].index)


In [32]:
# Recommend products for each partner in the test set
# To store recommended product lists for each partner,recommendations
# To store corresponding similarity scores,similarity_scores
recommendations = []
similarity_scores = []


In [33]:
for index, row in test_df.iterrows():
    # Get the list of products that the partner has already purchased
    purchased_products = [product for product in product_cols if row[product] == 1]  # Products bought by partner
    
    # If no purchases, skip recommendation for this partner
    if not purchased_products:  
        recommendations.append([]) 
        similarity_scores.append([])
        continue
    
    recommended_products = set()# Use a set to avoid duplicates
    product_scores = [] # Store similarity scores of recommended products

    for product in purchased_products:
        # Get top 3 similar products for each purchased product
        top_products = get_top3_products(product)
        recommended_products.update(top_products) # Add them to the recommendation set
        # Get similarity scores for those top products
        scores = product_similarity_df.loc[product, top_products].values
        product_scores.extend(scores)

    # Store exactly 3 recommendations and their similarity scores
    recommendations.append(list(recommended_products)[:3] if len(recommended_products) >= 3 else list(recommended_products))
    # Store the corresponding similarity scores for those recommended products
    similarity_scores.append(product_scores[:3] if len(product_scores) >= 3 else product_scores)

    # Display intermediate results for first 5 partners
    if index < 5:
        print(f"\nRecommendations for Partner {row['Partner_id']}:")
        print(f"Purchased Products: {purchased_products}")
        print(f"Recommended Products: {recommendations[-1]}")
        print(f"Similarity Scores: {similarity_scores[-1]}")


In [34]:
# Save recommendations in DataFrame
test_df["Recommended_Products"] = recommendations
test_df["Similarity_Scores"] = similarity_scores
recommended_df = test_df[["Partner_id", "Recommended_Products", "Similarity_Scores"]]

# Display final recommendation results
print("\nFinal Partner Product Recommendations:")
print(recommended_df.head())

# Save recommendations
output_file = "Partner_Product_Recommendations.csv"
recommended_df.to_csv(output_file, index=False)

print(f"\nProduct recommendations saved to {output_file}")


Final Partner Product Recommendations:
    Partner_id                               Recommended_Products  \
521     522_20  [AIS(Air Insulated Switchgear), SDF(Switch Dis...   
737      738_5  [AIS(Air Insulated Switchgear), SDF(Switch Dis...   
740      741_1  [AIS(Air Insulated Switchgear), SDF(Switch Dis...   
660     661_86  [AIS(Air Insulated Switchgear), SDF(Switch Dis...   
411     412_58  [AIS(Air Insulated Switchgear), SDF(Switch Dis...   

                                     Similarity_Scores  
521  [0.3846153846153846, 0.36935483870967745, 0.36...  
737  [0.38344594594594594, 0.36142625607779577, 0.3...  
740  [0.3733333333333333, 0.3573825503355704, 0.350...  
660  [0.3733333333333333, 0.3573825503355704, 0.350...  
411  [0.3846153846153846, 0.36935483870967745, 0.36...  

Product recommendations saved to Partner_Product_Recommendations.csv
