In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load datasets
# Read the three input tables, in this order, from CSV files under data/.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Customers.csv",
        "data/Products.csv",
        "data/Transactions.csv",
    )
)

In [3]:
# Merge datasets
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so transactions without a matching
# customer or product are dropped.
data = (
    pd.merge(transactions, customers, on="CustomerID")
    .merge(products, on="ProductID")
    # A Price column exists on both sides of a merge and gets the default
    # _x/_y suffixes; keep the left-hand copy under the plain name "Price".
    # NOTE(review): presumably Price_x is the transaction price and Price_y the
    # product list price - confirm against the CSV schemas.
    .drop(columns=["Price_y"])
    .rename(columns={"Price_x": "Price"})
)

In [4]:
# Encode categorical features
# Replace the Region and Category values with integer label codes, in place.
# The same encoder object is refit for each column, so after this cell it only
# remembers the Category mapping.
encoder = LabelEncoder()
for categorical_col in ("Region", "Category"):
    data[categorical_col] = encoder.fit_transform(data[categorical_col])

In [5]:
# Aggregate transaction data per customer
# One row per CustomerID: total spend, total quantity, the region code from the
# customer's first row, and the category code that appears most often.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Quantity=("Quantity", "sum"),
        Region=("Region", "first"),
        # mode() returns its values sorted, so ties resolve to the smallest code.
        Category=("Category", lambda codes: codes.mode()[0]),
    )
    .reset_index()
)

In [6]:
# Normalize numerical features
# Min-max scale TotalValue and Quantity to [0, 1]. A constant column has
# max == min; dividing by that zero range would fill the column with NaN, which
# cosine_similarity rejects. A zero range is therefore replaced by 1, so such a
# column scales to all zeros instead.
numeric_cols = ["TotalValue", "Quantity"]
col_min = customer_features[numeric_cols].min()
col_range = (customer_features[numeric_cols].max() - col_min).replace(0, 1)
customer_features[numeric_cols] = (customer_features[numeric_cols] - col_min) / col_range

# Compute similarity matrix
# Region and Category hold nominal label codes (integers assigned by
# LabelEncoder). Fed to cosine similarity as raw numbers they imply an ordering
# between categories and outweigh the [0, 1]-scaled numeric columns, so they are
# one-hot encoded here: each region/category becomes its own 0/1 column.
feature_matrix = pd.get_dummies(
    customer_features.drop(columns=["CustomerID"]),
    columns=["Region", "Category"],
    dtype=float,
)
# Row i / column j is the cosine similarity between customers i and j, in the
# same row order as customer_features.
similarity_matrix = cosine_similarity(feature_matrix)

# Generate Lookalike recommendations
# Maps each of the first 20 customer IDs to a list of up to three
# (other_customer_id, similarity_score) tuples, best match first.
customer_ids = customer_features["CustomerID"].tolist()
lookalikes = {}

for i, cust_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Rank every customer by descending similarity to customer i. The customer's
    # own row is removed by index rather than by assuming it sorts first: a tie
    # at the top score (identical or proportional feature vectors, float
    # round-off, or an all-zero row) can otherwise put the customer into their
    # own lookalike list.
    ranked = np.argsort(-similarity_matrix[i], kind="stable")
    similar_indices = ranked[ranked != i][:3]  # Top 3 similar customers
    # float() keeps plain numbers in the saved CSV; NumPy 2 scalars would
    # otherwise be written as "np.float64(...)" inside each tuple.
    similar_customers = [(customer_ids[idx], float(similarity_matrix[i][idx])) for idx in similar_indices]
    lookalikes[cust_id] = similar_customers

# Save as CSV
# One row per customer: the customer ID (the frame's index) followed by one cell
# per lookalike, each cell holding a (customer_id, score) tuple. The file is
# written without a header row.
lookalike_df = pd.DataFrame.from_dict(data=lookalikes, orient="index")
lookalike_df.to_csv(path_or_buf="Lookalike.csv", header=False)