In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

Load datasets

In [16]:
# Read the three input tables from CSV files in the current working directory.
# The read order (customers, products, transactions) is unchanged.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

Merge the datasets so that each transaction row also carries its product and customer details.

In [17]:
# Left joins keep every transaction row, even when no matching product or
# customer record exists (the unmatched columns are then NaN).
# NOTE(review): the later aggregation reads `Price_x`, so a `Price` column
# overlaps in one of these merges and pandas adds the `_x` / `_y` suffixes.
transactions_products = pd.merge(transactions, products, how="left", on="ProductID")
merged_data = pd.merge(transactions_products, customers, how="left", on="CustomerID")

Aggregate key metrics per customer:

Total spending

Total quantity purchased

Average product price

Most purchased category

Customer's region

In [18]:

def _most_frequent(values):
    """Return the most frequent non-null entry of ``values``.

    Parameters
    ----------
    values : pandas.Series
        The values of one column for a single group.

    Returns
    -------
    object
        The first entry returned by ``values.mode()``, or NaN when the group
        holds no non-null value at all.
    """
    modes = values.mode()
    # ``Series.mode()`` drops NaN by default, so a group whose values are all
    # NaN yields an empty Series; indexing it with ``[0]`` would raise.
    # NOTE(review): such groups can presumably appear when the left merges find
    # no matching product row -- confirm where ``Category`` comes from.
    if modes.empty:
        return float("nan")
    # Positional access: does not depend on the labels of the returned Series.
    return modes.iat[0]


# Build one profile row per customer:
#   TotalValue -> summed over the customer's transactions
#   Quantity   -> summed over the customer's transactions
#   Price_x    -> mean over the customer's transactions
#   Category   -> most frequent category (NaN if none is available)
#   Region     -> first non-null value in the group
customer_profiles = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Price_x": "mean",
    "Category": _most_frequent,
    "Region": "first",
}).reset_index()

Encode categorical features

In [19]:

# One-hot encode the categorical columns. `drop_first=True` drops the first
# level of each column, so that level is represented by all-zero indicators.
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=["Category", "Region"],
    drop_first=True,
)


Normalize numerical features

In [20]:
# Standardise the numeric profile columns in place with StandardScaler, so
# that no single numeric column dominates the similarity purely through its scale.
numeric_cols = ["TotalValue", "Quantity", "Price_x"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_cols])
customer_profiles[numeric_cols] = scaled_values


Similarity calculation:
Compute the cosine similarity between customer profile vectors.
1. Measured how similar two customers are based on their profiles.

2. Recommended the top 3 customers with the highest similarity scores.

In [21]:

# Pairwise cosine similarity between all customers. The identifier column is
# removed first so that only the feature columns enter the computation.
# Row/column i of the result corresponds to row i of `customer_profiles`.
profile_features = customer_profiles.drop(columns=["CustomerID"])
similarity_matrix = cosine_similarity(profile_features)


Build a lookalike map for the first 20 customers, keeping the 3 most similar customers for each.

In [22]:

# How many customers (taken from the top of `customer_profiles`) get a
# lookalike list, and how many lookalikes are kept for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Pull the IDs out once as a plain array: access below is then strictly
# positional (matching the rows of `similarity_matrix`) instead of relying on
# the DataFrame index happening to be 0..n-1, and the column lookup is not
# repeated for every pair inside the loop.
customer_ids = customer_profiles["CustomerID"].to_numpy()

# Maps each target customer ID to a list of (other customer ID, similarity)
# tuples, ordered from most to least similar.
lookalike_map = {}

for i, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Pair every other customer with its similarity to the current one,
    # skipping the customer's own entry (position i).
    candidates = [
        (customer_ids[j], score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]

    # Stable sort, highest score first: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    lookalike_map[cust_id] = candidates[:TOP_K]


In [23]:
# Show the mapping as the cell output: customer ID -> list of
# (CustomerID, similarity score) tuples for its top matches.
lookalike_map

{'C0001': [('C0181', 0.9346517961588328),
  ('C0120', 0.8975310829300924),
  ('C0184', 0.8640730287089359)],
 'C0002': [('C0088', 0.984786970122018),
  ('C0077', 0.900745855874733),
  ('C0144', 0.8961794025264799)],
 'C0003': [('C0031', 0.8639478762562338),
  ('C0025', 0.8621584309391507),
  ('C0052', 0.8448070519896925)],
 'C0004': [('C0165', 0.9825787774221347),
  ('C0153', 0.9127870619749094),
  ('C0169', 0.9081480442704979)],
 'C0005': [('C0140', 0.9899379450022263),
  ('C0186', 0.9758473072379672),
  ('C0146', 0.8995915629338853)],
 'C0006': [('C0187', 0.9497359319245032),
  ('C0168', 0.9371224523840914),
  ('C0126', 0.9150179444092312)],
 'C0007': [('C0146', 0.9958711037026335),
  ('C0115', 0.9630039760834646),
  ('C0186', 0.9317037582999351)],
 'C0008': [('C0136', 0.8469524598573369),
  ('C0018', 0.8206039030742583),
  ('C0065', 0.8106787250085865)],
 'C0009': [('C0061', 0.9682324770160674),
  ('C0198', 0.9456198660166513),
  ('C0103', 0.9194743627131416)],
 'C0010': [('C0111', 

Convert lookalike map to a DataFrame and save to CSV

In [24]:

# One row per target customer. The `lookalikes` cell holds the list of
# (customer ID, score) tuples and is written to the CSV as that list's string
# form.
# Scores are cast to built-in float first: the string form of a list uses each
# element's repr, and NumPy >= 2 reprs its scalars as `np.float64(...)`, which
# would otherwise leak into the file. With older NumPy the text is unchanged.
lookalike_df = pd.DataFrame([
    {
        "cust_id": cust_id,
        "lookalikes": [(other_id, float(score)) for other_id, score in lookalikes],
    }
    for cust_id, lookalikes in lookalike_map.items()
])

# Written to the current working directory without the DataFrame index column.
lookalike_df.to_csv("Shubhi_Goel_Lookalike.csv", index=False)


