### Importing Libraries

In [10]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### Loading Datasets

In [11]:
customers = pd.read_csv("csv files/Customers.csv")
products = pd.read_csv("csv files/Products.csv")
transactions = pd.read_csv("csv files/Transactions.csv")

### Merging Datasets

In [12]:
# Merging transactions, customers, and products data into a single dataset for comprehensive analysis.
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

### Creating User Profiles/ Feature Transaction

In [13]:

# Customer profiles include mean 'Quantity' and 'TotalValue' purchased, as well as their category preferences
customer_profiles = merged_data.groupby("CustomerID")[["Quantity", "TotalValue"]].mean()

### Encoding

In [14]:
# Creating one-hot encoded product category preferences for each customer
product_preferences = pd.get_dummies(merged_data[["CustomerID", "Category"]], columns=["Category"])
product_preferences = product_preferences.groupby("CustomerID").sum()

### Final Profiling

In [15]:
# Combined transaction features and product preferences into a final profile for each customer
final_profiles = pd.concat([customer_profiles, product_preferences], axis=1)

### Cosine Similarity + Similarity Matrix

In [16]:
# Computing Cosine Similarity for Customer Profiles
# Generating a similarity matrix based on the final customer profiles

similarity_matrix = cosine_similarity(final_profiles)
similarity_df = pd.DataFrame(similarity_matrix, index=final_profiles.index, columns=final_profiles.index)

### Lookalike Recommendations

In [17]:
# Generating Lookalike Recommendations
# For the first 20 customers (C0001 - C0020), finding the top-3 most similar customers

lookalike_results = {}
for customer in similarity_df.index[:20]:  # first 20 customers correspond to C0001 - C0020
    # Sort similarity scores in descending order and exclude the customer themselves
    similar_customers = similarity_df.loc[customer].sort_values(ascending=False).iloc[1:4]
    # Store top-3 similar customers and their scores in a dictionary
    lookalike_results[customer] = [(index, round(score, 4)) for index, score in zip(similar_customers.index, similar_customers.values)]

### Result Saving

In [18]:
# Saved Lookalike Results to CSV file.
# Converted the results dictionary into a format suitable for saving in to a CSV file.

lookalike_csv_data = [
    {"cust_id": customer, "recommendations": lookalike_results[customer]} for customer in lookalike_results
]
lookalike_df = pd.DataFrame(lookalike_csv_data)

lookalike_df.to_csv("Sathwik_Alladi_Lookalike.csv", index=False)

print("Lookalike Model --build SUCCESSFUL. Results saved at Sathwik_Alladi_Lookalike.csv.")

Lookalike Model --build SUCCESSFUL. Results saved at Sathwik_Alladi_Lookalike.csv.


The model successfully generates lookalike recommendations using cosine similarity, but the perfect similarity scores made me think that there would be room for refinement. Therefore, By enhancing feature engineering and addressing data quality, the model successfully generates recommendations but suffers from overfitting due to limited feature diversity.

If Given sufficient feature diversity, i believe through refining feature engineering and addressing data quality, the recommendations can become more actionable and business-relevant.

this is one of my insights gained through my experience, thought of sharing it. Thank you.