In [None]:
import pandas as pd

# Read the three source tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Print the first rows of each table as a quick sanity check.
for frame in (customers, products, transactions):
    print(frame.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [None]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match is dropped.
# NOTE: the merged preview shows Price_x / Price_y, i.e. transactions and
# products each carry a "Price" column and pandas applies its default suffixes.
data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)
print(data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# One-hot encode the categorical columns, keeping EVERY level.
# drop_first=True is meant for regression design matrices; for a similarity
# profile it removes the first Region and first Category level, so those
# levels contribute nothing to the vector and a customer whose rows all fall
# in the dropped levels ends up with an all-zero profile (cosine similarity is
# not meaningful for a zero vector). dtype=float keeps the columns numeric
# regardless of the pandas version's default dummy dtype.
data_encoded = pd.get_dummies(data[['Region', 'Category']], dtype=float)

# Aggregate to one row per customer: each column becomes the share of that
# customer's transaction rows falling in the given Region / Category level.
customer_data = data_encoded.groupby(data['CustomerID']).mean()


In [None]:
# Compute the pairwise cosine similarity between customer profiles
# (rows and columns follow the order of customer_data.index).
similarity_matrix = cosine_similarity(customer_data)

# Get the top 3 lookalikes for the first 20 customers.
import numpy as np

N_TARGET_CUSTOMERS = 20  # how many customers (from the top of the index) to score
TOP_N = 3                # lookalikes kept per customer

lookalike_map = {}
customer_ids = customer_data.index
# min(...) keeps the loop from running past the end of a smaller dataset.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    scores = similarity_matrix[i]
    # Stable descending sort: ties keep their original index order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer by position rather than by slicing off rank 0:
    # when another customer also scores 1.0 the customer itself is not
    # guaranteed to sort first, and a [1:4] slice would then list the customer
    # as its own lookalike while discarding a genuine match.
    top_matches = [j for j in ranked if j != i][:TOP_N]
    # float(...) stores plain Python floats so the printed/serialised form
    # does not depend on NumPy's scalar repr.
    lookalike_map[customer_ids[i]] = [
        (customer_ids[j], round(float(scores[j]), 2)) for j in top_matches
    ]

print(lookalike_map)


{'C0001': [('C0091', 0.99), ('C0096', 0.99), ('C0112', 0.98)], 'C0002': [('C0043', 1.0), ('C0134', 0.98), ('C0159', 0.95)], 'C0003': [('C0031', 1.0), ('C0158', 1.0), ('C0195', 0.99)], 'C0004': [('C0085', 1.0), ('C0113', 0.99), ('C0148', 0.99)], 'C0005': [('C0007', 1.0), ('C0146', 1.0), ('C0045', 0.95)], 'C0006': [('C0147', 0.99), ('C0187', 0.99), ('C0108', 0.99)], 'C0007': [('C0007', 1.0), ('C0146', 1.0), ('C0045', 0.95)], 'C0008': [('C0154', 1.0), ('C0189', 1.0), ('C0122', 0.99)], 'C0009': [('C0198', 0.98), ('C0044', 0.97), ('C0074', 0.97)], 'C0010': [('C0061', 1.0), ('C0009', 0.96), ('C0062', 0.96)], 'C0011': [('C0126', 1.0), ('C0032', 0.99), ('C0082', 0.99)], 'C0012': [('C0152', 1.0), ('C0104', 0.99), ('C0163', 0.99)], 'C0013': [('C0107', 1.0), ('C0032', 0.99), ('C0192', 0.99)], 'C0014': [('C0060', 1.0), ('C0089', 1.0), ('C0037', 0.97)], 'C0015': [('C0038', 1.0), ('C0160', 1.0), ('C0065', 0.99)], 'C0016': [('C0183', 1.0), ('C0049', 1.0), ('C0156', 0.99)], 'C0017': [('C0075', 1.0), (

In [None]:
# Serialise the lookalike map: one row per customer, with the list of
# (lookalike_id, score) tuples stored as its string representation.
# Ids and scores are converted to built-in str / float first, because the
# repr of NumPy scalars is version-dependent (e.g. "np.float64(0.99)") and
# would otherwise leak into the CSV text.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map),
    "Lookalikes": [
        str([(str(other_id), float(score)) for other_id, score in matches])
        for matches in lookalike_map.values()
    ],
})
lookalike_df.to_csv("Pranith_Lookalike.csv", index=False)
