# In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

# Read the three raw tables (paths are relative to the working directory).
trans = pd.read_csv('transactions.csv')
cust = pd.read_csv('customers.csv')
prod = pd.read_csv('products.csv')

# Attach customer attributes, then product attributes, to every transaction.
# Left joins keep each transaction row even when a lookup has no match.
data = pd.merge(trans, cust, how="left", on="CustomerID")
data = pd.merge(data, prod, how="left", on="ProductID")

# Parse the transaction timestamps so date arithmetic works downstream.
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])

# Feature engineering: per-customer Recency / Frequency / Monetary values.
# Recency is measured against the most recent transaction in the dataset.
latest = data['TransactionDate'].max()
by_customer = data.groupby('CustomerID')

# Recency: whole days between a customer's last purchase and `latest`.
# The TransactionDate column is kept here; it is removed later in the script.
recency = by_customer['TransactionDate'].max().reset_index()
recency['Recency'] = (latest - recency['TransactionDate']).dt.days

# Frequency: number of distinct transactions per customer.
freq = by_customer['TransactionID'].nunique().rename('Frequency').reset_index()

# Monetary: total spend per customer.
monetary = by_customer['TotalValue'].sum().rename('Monetary').reset_index()

# Categories Purchased
# Collect each customer's purchased categories directly as a list of labels.
# Missing categories (transactions whose product had no match in the left
# merge) are dropped, because joining NaN values with ','.join raises a
# TypeError. Building the list directly, rather than joining to a string and
# splitting it again, also keeps category names that contain a comma intact.
categories = (
    data.groupby('CustomerID')['Category']
    .apply(lambda labels: labels.dropna().astype(str).tolist())
    .rename('CategoryList')
    .reset_index()
)
# Keep the comma-joined 'Category' column, in its original position, for any
# later code that still reads it.
categories.insert(1, 'Category', categories['CategoryList'].apply(','.join))

# Encode categories
# One indicator column per distinct category; rows line up with `categories`,
# and CustomerID is restored as a regular column for the later merge.
mlb = MultiLabelBinarizer()
encoded_cats = pd.DataFrame(
    mlb.fit_transform(categories['CategoryList']),
    columns=mlb.classes_,
    index=categories['CustomerID'],
).reset_index()

# Assemble one profile row per customer: RFM values plus category indicators.
# Inner joins on CustomerID keep only customers present in every feature table.
rfm_cols = ['Recency', 'Frequency', 'Monetary']
profiles = recency.merge(freq, on='CustomerID')
profiles = profiles.merge(monetary, on='CustomerID')
profiles = profiles.merge(encoded_cats, on='CustomerID')

# Standardize the RFM columns; the category indicator columns are left as-is.
scaler = StandardScaler()
profiles[rfm_cols] = scaler.fit_transform(profiles[rfm_cols])

# TransactionDate is not a similarity feature, and CustomerID becomes the row
# label so the similarity matrix can be addressed by customer.
profiles = profiles.drop(columns=['TransactionDate']).set_index('CustomerID')

# Pairwise cosine similarity between all customer profiles.
pairwise_scores = cosine_similarity(profiles)
sim_matrix = pd.DataFrame(pairwise_scores, index=profiles.index, columns=profiles.index)

# Top-3 lookalikes for the first 20 customers listed in the customers table
# (C0001 to C0020, assuming the file is ordered by CustomerID).
lookalikes = {}
for cust_id in cust['CustomerID'].iloc[:20]:
    # Customers without a profile row have no similarity scores.
    if cust_id not in sim_matrix.index:
        lookalikes[cust_id] = []
        continue
    # Exclude the customer's own entry before ranking the others.
    others = sim_matrix.loc[cust_id].drop(cust_id)
    top_3 = others.nlargest(3)
    lookalikes[cust_id] = list(zip(top_3.index, top_3.values))

# Save Lookalike Data
# Flatten each customer's (lookalike_id, score) pairs into one wide row:
# CustomerID, Lookalike1, Score1, Lookalike2, Score2, ...
rows = []
for cust_id, similars in lookalikes.items():
    row = {'CustomerID': cust_id}
    for i, (sim_id, score) in enumerate(similars, 1):
        row[f'Lookalike{i}'] = sim_id
        row[f'Score{i}'] = score
    rows.append(row)

# Set the column layout explicitly instead of letting pandas infer it from
# dict key order. Otherwise a customer with fewer (or no) matches appearing
# first would reorder the CSV columns. Missing slots are left empty.
max_matches = max((len(similars) for similars in lookalikes.values()), default=0)
columns = ['CustomerID']
for i in range(1, max_matches + 1):
    columns += [f'Lookalike{i}', f'Score{i}']

pd.DataFrame(rows, columns=columns).to_csv("Srikant_Lookalike.csv", index=False)
print("Lookalikes saved.")