In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the three raw tables from the working directory; the unpacking order
# matches the order of the file names below.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [4]:
# Attach customer and product attributes to every transaction. Both merges are
# inner joins, so a transaction whose CustomerID / ProductID has no match in
# the lookup table is dropped.
merged_df = transactions_df.merge(customers_df, on='CustomerID')
merged_df = merged_df.merge(products_df, on='ProductID')

# Customer x product matrix of total units purchased. aggfunc is spelled out
# because pivot_table defaults to the *mean*: with the default, a customer who
# ordered one unit three times is indistinguishable from one who ordered a
# single unit once.
customer_product_matrix = merged_df.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Z-score each product column. A column with zero (or undefined) standard
# deviation yields NaN, which cosine_similarity refuses to accept; such a
# column carries no information, so it is neutralised to 0.
customer_product_matrix_normalized = (
    (customer_product_matrix - customer_product_matrix.mean())
    / customer_product_matrix.std()
).fillna(0)

In [5]:
# Inspect the merged frame. The output shows two price columns, Price_x and
# Price_y, produced by the merges above.
merged_df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00630,C0031,P093,2024-10-08 23:58:14,2,609.88,304.94,Tina Miller,South America,2024-04-11,TechPro Vase,Home Decor,304.94
996,T00672,C0165,P044,2024-07-28 00:09:49,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
997,T00711,C0165,P044,2024-06-11 15:51:14,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
998,T00878,C0165,P044,2024-09-24 21:15:21,3,56.46,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82


In [8]:
# Collapse the duplicated price columns into a single 'Price' column.
# errors='ignore' plus a non-inplace rename keeps this cell idempotent:
# re-running it after Price_x is already gone is a no-op instead of a KeyError.
merged_df = (
    merged_df
    .drop(columns='Price_x', errors='ignore')
    .rename(columns={'Price_y': 'Price'})
)

In [9]:
# Pairwise cosine similarity between all customer rows of the normalized
# matrix; row/column i of the result corresponds to row i of that matrix.
cosine_sim = cosine_similarity(X=customer_product_matrix_normalized)

In [12]:
def find_lookalike(customer_id, cosine_sim, n, customer_ids=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: ID of the customer to find lookalikes for.
        cosine_sim: Square similarity matrix; row/column ``i`` belongs to
            ``customer_ids[i]``.
        n: Maximum number of lookalikes to return.
        customer_ids: Customer IDs labelling the rows of ``cosine_sim``, in
            row order. Defaults to ``customer_product_matrix.index``, the
            index of the frame ``cosine_sim`` was computed from.

    Returns:
        A list of ``(row_position, score)`` tuples sorted by descending
        score. ``row_position`` indexes ``cosine_sim`` / ``customer_ids``
        (not ``customers_df``). The customer itself is never included.

    Raises:
        KeyError: If ``customer_id`` has no row in the similarity matrix.
    """
    if customer_ids is None:
        customer_ids = customer_product_matrix.index

    # Positions must come from the matrix's own row labels. customers_df can
    # list customers that never transacted (or be ordered differently), in
    # which case its positions do not line up with cosine_sim rows.
    row = pd.Index(customer_ids).get_loc(customer_id)

    # Drop the customer by position rather than assuming it sorts first:
    # another customer with an identical profile also scores 1.0.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[row]) if pos != row),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


lookalike_dict = {}
for customer_id in customers_df['CustomerID'][:20]:
    if customer_id not in customer_product_matrix.index:
        # No purchase history means no similarity row; record an empty result
        # so the customer still appears in the output.
        lookalike_dict[customer_id] = []
        continue
    lookalike = find_lookalike(
        customer_id, cosine_sim, 3, customer_product_matrix.index
    )
    # Translate matrix row positions back to CustomerIDs using the same index
    # that labels the similarity matrix.
    lookalike_dict[customer_id] = [
        (customer_product_matrix.index[pos], score) for pos, score in lookalike
    ]

In [14]:
import csv
import json

# Write Lookalike.csv as actual CSV: one row per customer, with the list of
# (lookalike_id, score) pairs JSON-encoded in the second column. Dumping the
# whole dict as a single JSON blob into a .csv file leaves it unreadable by
# CSV tooling.
# newline='' is what the csv module requires to avoid blank lines on Windows.
with open('Lookalike.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for customer_id, lookalikes in lookalike_dict.items():
        writer.writerow([customer_id, json.dumps(lookalikes)])