In [1]:
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three raw input tables; paths are relative to the notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Step 1: Data preparation
# Enrich every transaction row with its product columns (joined on ProductID)
# and then its customer columns (joined on CustomerID). Left joins keep every
# transaction even when the lookup table has no matching row.
merged_df = (
    transactions
    .merge(products, how="left", on="ProductID")
    .merge(customers, how="left", on="CustomerID")
)

# Customer x product matrix: one row per CustomerID, one column per ProductName,
# each cell holding the summed TotalValue; combinations with no data become 0.
customer_product_matrix = pd.pivot_table(
    merged_df,
    values="TotalValue",
    index="CustomerID",
    columns="ProductName",
    aggfunc="sum",
    fill_value=0,
)

In [4]:
# Step 2: Feature engineering
# Customer-level attributes, indexed by CustomerID. Only Region is used here
# (no other customer column, e.g. a signup date, is included), and it is
# one-hot encoded into indicator columns.
customer_features = pd.get_dummies(
    customers.set_index("CustomerID").loc[:, ["Region"]]
)

In [5]:
# Append the one-hot Region columns to the customer-product matrix, aligning on
# the CustomerID index. This is a left join, so only customers that appear in
# the customer-product matrix are kept.
customer_features_combined = customer_product_matrix.join(customer_features, how="left")

In [6]:
# Standardise every feature column before measuring similarity so that no
# single column dominates purely because of its scale.
scaler = StandardScaler()
scaler.fit(customer_features_combined)
customer_features_scaled = scaler.transform(customer_features_combined)

In [7]:
# Step 3: Similarity calculation
# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(customer_features_scaled)

# Label both axes with the customer IDs so scores can be looked up by ID.
customer_index = customer_features_combined.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_index,
    columns=customer_index,
)

In [8]:
# Step 4: Find the top lookalikes for the leading customers
N_TARGET_CUSTOMERS = 20  # leading rows of Customers.csv to score (presumably C0001 - C0020; depends on file order)
TOP_K = 3                # number of lookalikes kept per customer

# Maps each target CustomerID to a list of (lookalike CustomerID, similarity score)
# tuples, ordered from most to least similar.
lookalike_map = {}

for customer_id in customers["CustomerID"].iloc[:N_TARGET_CUSTOMERS]:
    if customer_id not in similarity_df.columns:
        # The similarity matrix only contains customers present in the
        # customer-product matrix; a customer without any transaction has no
        # column here. Record an empty result instead of raising KeyError.
        lookalike_map[customer_id] = []
        continue

    similar_customers = (
        similarity_df[customer_id]
        # Remove the customer by label rather than assuming it sorts first:
        # another customer can tie with the self-similarity score, in which
        # case positional slicing could keep the customer as its own lookalike.
        .drop(labels=customer_id)
        .nlargest(TOP_K)
    )
    # Store plain Python floats so the scores serialise as ordinary numbers.
    lookalike_map[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

In [11]:
# Step 5: Create Lookalike.csv
# One row per target customer (the frame's index); one column per rank, each
# cell holding a {"cust_id", "score"} dict for that lookalike.
lookalike_df = pd.DataFrame.from_dict(
    {
        target_id: [
            # float() keeps NumPy scalar reprs out of the serialised dicts.
            {"cust_id": match_id, "score": float(score)}
            for match_id, score in matches
        ]
        for target_id, matches in lookalike_map.items()
    },
    orient="index",
)

# Write the index as a "CustomerID" column: it is the only place the target
# customer's ID is stored, so omitting it would leave each row of lookalikes
# unattributed in the output file.
lookalike_df.to_csv("Lookalike.csv", index_label="CustomerID")

print("Lookalike Model Completed and saved to Lookalike.csv")

Lookalike Model Completed and saved to Lookalike.csv
