In [30]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [52]:
# Directory that holds Customers.csv and Transactions.csv.
# The default is the location the notebook was originally run from, so existing
# runs behave exactly as before; set the ZEOTAP_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("ZEOTAP_DATA_DIR", "/Users/shivanibalasubramani/Desktop/zeotap"))

# Raw inputs: one row per customer, and one row per transaction.
customers_df = pd.read_csv(DATA_DIR / "Customers.csv")
transactions_df = pd.read_csv(DATA_DIR / "Transactions.csv")


In [53]:
# Step 1: collapse the transaction table to one row per customer.
#
# Each entry maps an output column to (source column, aggregation):
#   total_spend        - sum of TotalValue
#   num_transactions   - number of TransactionID entries
#   avg_purchase_value - mean of Price
# NOTE(review): avg_purchase_value averages `Price`, not `TotalValue`; confirm
# this is the intended definition of "purchase value".
customer_agg_spec = {
    'total_spend': ('TotalValue', 'sum'),
    'num_transactions': ('TransactionID', 'count'),
    'avg_purchase_value': ('Price', 'mean'),
}

agg_transactions = (
    transactions_df
    .groupby('CustomerID')
    .agg(**customer_agg_spec)
    .reset_index()  # turn CustomerID back into a regular column for merging
)


In [54]:
# Attach the per-customer transaction aggregates to the customer table.
# A left join keeps every row of customers_df; customers with no match in
# agg_transactions get NaN in the aggregate columns.
customer_data = customers_df.merge(agg_transactions, how='left', on='CustomerID')


In [55]:
# One-hot encode Region. With drop_first=True the first Region level gets no
# column of its own and is represented by all remaining Region_* columns being 0.
customer_data = pd.get_dummies(
    data=customer_data,
    columns=['Region'],
    drop_first=True,
)

In [56]:
# Standardise the numeric/dummy feature columns (identifier and date columns
# are excluded from the feature matrix).
# NOTE(review): this cell runs before missing values are filled, and both
# `scaler` and `scaled_features` are recomputed after the fillna step further
# down - this first pass looks redundant and is a candidate for removal.
identifier_columns = ['CustomerID', 'CustomerName', 'SignupDate']
feature_frame = customer_data.drop(columns=identifier_columns)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)


In [57]:
# Count NaNs per column before any imputation is applied.
missing_before = customer_data.isnull().sum()

print("Missing values before handling:")
print(missing_before)

Missing values before handling:
CustomerID              0
CustomerName            0
SignupDate              0
total_spend             1
num_transactions        1
avg_purchase_value      1
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64


In [58]:
# Replace every remaining NaN with 0. Per the report above, the only NaNs are
# in the transaction aggregates (customers without any transactions), so this
# amounts to "zero spend / zero transactions" for them.
customer_data = customer_data.fillna(value=0)

In [59]:
# Re-count NaNs per column to confirm the fill left nothing missing.
missing_after = customer_data.isnull().sum()

print("Missing values after handling:")
print(missing_after)

Missing values after handling:
CustomerID              0
CustomerName            0
SignupDate              0
total_spend             0
num_transactions        0
avg_purchase_value      0
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64


In [60]:
# Build the final feature matrix: drop the identifier/date columns, then
# standardise each remaining column with StandardScaler so that all features
# are on a comparable scale for the similarity computation.
non_feature_cols = ['CustomerID', 'CustomerName', 'SignupDate']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data.drop(columns=non_feature_cols))


In [61]:
# Pairwise cosine similarity between all customers: entry [i, j] compares
# row i and row j of scaled_features.
cosine_sim = cosine_similarity(X=scaled_features)


In [62]:
# Build, for each of the customers C0001..C0020, the list of its most similar
# other customers as (CustomerID, similarity score) pairs, best match first.

TOP_N = 3  # number of lookalikes to keep per customer

# Explicit set of target IDs (C0001 .. C0020) instead of a lexicographic string
# comparison, which would also let through any ID that merely sorts before
# 'C0021'.
target_ids = {f"C{n:04d}" for n in range(1, 21)}

# Plain Python list so positions line up with the rows of cosine_sim regardless
# of what the DataFrame index looks like (label lookup with positional indices
# only works on a default RangeIndex).
customer_ids = customer_data['CustomerID'].tolist()

lookalikes = {}
for idx, customer_id in enumerate(customer_ids):
    if customer_id not in target_ids:
        continue

    # Similarity of this customer to every customer (itself included).
    sim_scores = cosine_sim[idx]

    # Rank all customers from most to least similar, then drop the customer
    # itself by index. Slicing it off by position is unsafe: with ties or
    # floating-point rounding the self-match is not guaranteed to rank first.
    ranked = np.argsort(sim_scores)[::-1]
    similar_indices = [i for i in ranked if i != idx][:TOP_N]

    # Store plain Python floats so the text representation of the scores does
    # not depend on the installed NumPy version.
    lookalikes[customer_id] = [
        (customer_ids[i], float(sim_scores[i])) for i in similar_indices
    ]

In [64]:
# Flatten the lookalikes mapping into one record per customer. The list of
# (CustomerID, score) pairs is stored as its string representation so it fits
# in a single CSV cell.
lookalike_rows = []
for cust_id, matches in lookalikes.items():
    lookalike_rows.append({'CustomerID': cust_id, 'Lookalikes': str(matches)})

lookalike_df = pd.DataFrame(lookalike_rows)

In [65]:
# Persist the result next to the notebook (path is relative to the working
# directory); the DataFrame index is not written.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)

In [66]:
# Quick sanity check: show the first rows of the lookalike table.
preview = lookalike_df.head()
print(preview)

  CustomerID                                         Lookalikes
0      C0001  [('C0137', 0.9966670044660734), ('C0152', 0.99...
1      C0002  [('C0142', 0.9838592031792338), ('C0043', 0.97...
2      C0003  [('C0025', 0.9670162100623091), ('C0112', 0.96...
3      C0004  [('C0113', 0.9865304001624996), ('C0108', 0.97...
4      C0005  [('C0140', 0.9817006972350526), ('C0123', 0.97...
