In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the three raw input tables from the working directory.
# Reading order matches the unpacking order: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Attach customer and product attributes to every transaction.
#
# Each merge is an inner join of many transactions onto one lookup row.
# validate='many_to_one' makes pandas raise MergeError if CustomerID or
# ProductID is duplicated in its lookup table; without it, such duplicates
# would silently multiply transaction rows and inflate every per-customer
# sum and count computed downstream.
#
# Both transactions and products carry a Price column, so the result holds
# Price_x (from transactions) and Price_y (from products).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner', validate='many_to_one')
    .merge(products, on='ProductID', how='inner', validate='many_to_one')
)

In [4]:
# Display the merged frame for a visual sanity check. Note the Price_x / Price_y
# pair: both merged inputs had a Price column, so pandas suffixed them.
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00630,C0031,P093,2024-10-08 23:58:14,2,609.88,304.94,Tina Miller,South America,2024-04-11,TechPro Vase,Home Decor,304.94
996,T00672,C0165,P044,2024-07-28 00:09:49,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
997,T00711,C0165,P044,2024-06-11 15:51:14,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
998,T00878,C0165,P044,2024-09-24 21:15:21,3,56.46,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82


In [5]:
# Expose the transaction-side price (Price_x) under the plain name 'Price'
# so later cells do not have to refer to the merge suffix.
merged_data = merged_data.assign(Price=merged_data['Price_x'])

In [6]:
# Build one row per customer (indexed by CustomerID) with three features:
#   TotalSpend          - sum of TotalValue over the customer's transactions
#   TransactionCount    - number of transactions
#   AvgTransactionValue - mean of the per-transaction unit Price
# NOTE(review): despite its name, AvgTransactionValue averages the unit Price,
# not TotalValue; confirm this is the intended feature before relying on the label.
customer_summary = merged_data.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'count'),
    AvgTransactionValue=('Price', 'mean'),
)

In [7]:
# Turn CustomerID from the groupby index into a regular column.
# The guard keeps this cell idempotent: an unconditional reset_index() would,
# on a second run, add a stray 'index' column to the already-flattened frame.
if 'CustomerID' not in customer_summary.columns:
    customer_summary = customer_summary.reset_index()
customer_summary.head()

Unnamed: 0,CustomerID,TotalSpend,TransactionCount,AvgTransactionValue
0,C0001,3354.52,5,278.334
1,C0002,1862.74,4,208.92
2,C0003,2725.38,4,195.7075
3,C0004,5354.88,8,240.63625
4,C0005,2034.24,3,291.603333


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Columns of customer_summary used as the similarity features.
feature_columns = ['TotalSpend', 'TransactionCount', 'AvgTransactionValue']
feature_frame = customer_summary[feature_columns]

# Fit a min-max scaler on the selected features and transform them in one step,
# so that no single feature dominates the similarity computation by magnitude.
scaler = MinMaxScaler()
customer_summary_scaled = scaler.fit_transform(feature_frame)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the scaled feature rows; entry [i, j]
# compares the i-th and j-th rows of customer_summary_scaled.
similarity_matrix = cosine_similarity(customer_summary_scaled)

# Preview only the top-left 5x5 corner rather than the full matrix.
preview = slice(0, 5)
similarity_matrix[preview, preview]

array([[1.        , 0.99467775, 0.9960795 , 0.94602049, 0.9507304 ],
       [0.99467775, 1.        , 0.98597074, 0.93656512, 0.94939528],
       [0.9960795 , 0.98597074, 1.        , 0.96845813, 0.92217491],
       [0.94602049, 0.93656512, 0.96845813, 1.        , 0.79894386],
       [0.9507304 , 0.94939528, 0.92217491, 0.79894386, 1.        ]])

In [10]:
import numpy as np

TOP_N = 3                 # lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # only the first 20 rows of customer_summary are scored

# Positional array of IDs. similarity_matrix is indexed by row position, so
# looking IDs up by position avoids depending on the DataFrame's index labels.
customer_ids = customer_summary['CustomerID'].to_numpy()

# Maps each target CustomerID to a list of (similar CustomerID, similarity score)
# tuples, ordered from most to least similar.
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    scores = similarity_matrix[idx]
    # Rank all customers by descending similarity (stable, so ties keep row order).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the customer itself explicitly. Skipping "the first element" is not
    # safe: with ties or floating-point round-off the customer's own entry is not
    # guaranteed to sort first, which would list it as its own lookalike.
    ranked = ranked[ranked != idx][:TOP_N]
    # Store builtin floats so the scores serialise cleanly downstream.
    lookalike_results[customer_id] = [
        (customer_ids[i], float(scores[i])) for i in ranked
    ]

In [12]:
import csv

# Write one row per target customer: the CustomerID, then the list of
# (similar CustomerID, score) tuples rendered as text in a single cell.
with open('FirstName_LastName_Lookalike.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'SimilarCustomers'])
    for customer, lookalikes in lookalike_results.items():
        # csv.writer stringifies the list with str(), which uses each element's
        # repr. NumPy >= 2 renders scalars as e.g. "np.float64(0.99)", so coerce
        # to builtin str/float first to keep the cell content plain and parseable.
        plain_lookalikes = [(str(cid), float(score)) for cid, score in lookalikes]
        writer.writerow([customer, plain_lookalikes])
