In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the three raw tables from CSV files in the current working directory.
customer = pd.read_csv(filepath_or_buffer="Customers.csv")
products = pd.read_csv(filepath_or_buffer="Products.csv")
transactions = pd.read_csv(filepath_or_buffer="Transactions.csv")

In [3]:
# Parse the sign-up column into pandas datetimes (replaces the column in place).
customer['SignupDate'] = customer['SignupDate'].pipe(pd.to_datetime)

In [4]:
# Parse the transaction timestamp column into pandas datetimes (replaces the column in place).
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)

In [5]:
# One row per transaction, enriched with the matching customer and product
# columns. Both joins use pandas' default inner join, so transactions without
# a matching customer or product are dropped.
df = pd.merge(
    pd.merge(transactions, customer, on="CustomerID"),
    products,
    on="ProductID",
)

In [6]:
# Per-customer purchase summary built from the merged transaction table:
#   total_spend        - sum of TotalValue
#   purchase_frequency - number of transactions
#   avg_order_value    - mean TotalValue per transaction
#   unique_products    - count of distinct ProductID values
customer_features = (
    df.groupby("CustomerID")
    .agg(**{
        'total_spend': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        'purchase_frequency': pd.NamedAgg(column='TransactionID', aggfunc='count'),
        'avg_order_value': pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        'unique_products': pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    })
    .reset_index()  # bring CustomerID back as a regular column
)

In [7]:
# One-hot encode Region: keep CustomerID as the key and expand Region into
# one indicator column per distinct value.
region_encoded = pd.get_dummies(
    data=customer.loc[:, ['CustomerID', 'Region']],
    columns=['Region'],
)

In [8]:
# Attach the one-hot region columns to each customer's purchase summary.
# Inner join (the pandas default): only customers present in both frames remain.
customer_profiles = customer_features.merge(region_encoded, how="inner", on="CustomerID")

In [9]:
# Columns that feed the distance computation; standardised in place so no
# single feature dominates purely because of its scale.
numeric_features = ['total_spend', 'purchase_frequency', 'avg_order_value', 'unique_products']
scaler = StandardScaler()
customer_profiles[numeric_features] = (
    scaler
    .fit(customer_profiles[numeric_features])
    .transform(customer_profiles[numeric_features])
)

# -------------------- Lookalike Model (KNN) --------------------

In [10]:
# Nearest-neighbour index over the standardised numeric features.
# n_neighbors is 4 so that a query for an already-indexed customer can return
# that customer plus three others.
knn = NearestNeighbors(metric='euclidean', n_neighbors=4).fit(customer_profiles[numeric_features])
knn  # fit() returns the estimator itself; keep it as the cell's displayed value

In [11]:
# Find the most similar customers for each of the first 20 rows of `customer`.
#
# Result: lookalike_results maps CustomerID -> list of (other CustomerID, score)
# tuples, nearest first, where score = 1 / (1 + euclidean distance) rounded to
# 4 decimals (1.0 means identical feature vectors).
#
# Notes on the implementation:
# * The fitted model returns `knn.n_neighbors` rows per query; one slot is
#   reserved for the query customer, so at most n_neighbors - 1 lookalikes
#   are kept.
# * The query customer is removed by comparing row positions, not by assuming
#   it is always the first hit -- with duplicated feature vectors another
#   customer at distance 0 can be returned ahead of it.
# * Queries are passed as a DataFrame slice so the feature names seen at fit
#   time are preserved (a bare list/array makes scikit-learn warn on every call).
# * Scores are cast to built-in float so that converting the tuples to text
#   yields plain numbers regardless of how NumPy scalars are printed.
n_lookalikes = knn.n_neighbors - 1

# CustomerID by row position, plus the reverse lookup (first occurrence wins).
profile_ids = customer_profiles['CustomerID'].to_numpy()
position_of = {}
for pos, profile_id in enumerate(profile_ids):
    position_of.setdefault(profile_id, pos)

# Customers without any transactions have no profile row and are skipped.
target_ids = [cust_id for cust_id in customer['CustomerID'][:20] if cust_id in position_of]
target_positions = [position_of[cust_id] for cust_id in target_ids]

lookalike_results = {}

if target_positions:  # kneighbors() rejects an empty query
    # One batched query for all target customers.
    distances, indices = knn.kneighbors(
        customer_profiles.iloc[target_positions][numeric_features]
    )

    for cust_id, self_pos, row_distances, row_indices in zip(
        target_ids, target_positions, distances, indices
    ):
        similar_customers = [
            (profile_ids[i], round(float(1 / (1 + d)), 4))  # distance -> similarity score
            for i, d in zip(row_indices, row_distances)
            if i != self_pos
        ][:n_lookalikes]

        lookalike_results[cust_id] = similar_customers





In [12]:
# Convert results into a two-column DataFrame: the customer and the string
# form of its list of (lookalike id, score) tuples.
# The column list is given explicitly so that an empty result set still
# produces a frame (and later a CSV header) with both columns.
lookalike_df = pd.DataFrame(
    [
        {'CustomerID': cust, 'Lookalikes': str(similar)}
        for cust, similar in lookalike_results.items()
    ],
    columns=['CustomerID', 'Lookalikes'],
)

In [13]:
# Persist the lookalike table next to the notebook; the row index is not written.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)

In [14]:
# Preview the first ten rows as plain text.
print(lookalike_df.iloc[:10])

  CustomerID                                         Lookalikes
0      C0001  [('C0137', 0.9784), ('C0152', 0.9695), ('C0056...
1      C0002  [('C0029', 0.9164), ('C0199', 0.8785), ('C0031...
2      C0003  [('C0178', 0.9807), ('C0035', 0.8595), ('C0146...
3      C0004  [('C0021', 0.9529), ('C0173', 0.744), ('C0162'...
4      C0005  [('C0073', 0.9619), ('C0159', 0.9456), ('C0112...
5      C0006  [('C0079', 0.9723), ('C0117', 0.8087), ('C0158...
6      C0007  [('C0085', 0.9729), ('C0120', 0.858), ('C0042'...
7      C0008  [('C0109', 0.5373), ('C0147', 0.4844), ('C0084...
8      C0009  [('C0077', 0.9181), ('C0032', 0.748), ('C0043'...
9      C0010  [('C0029', 0.925), ('C0025', 0.8683), ('C0002'...
