In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the three raw tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Attach product attributes to every transaction (inner join on ProductID).
# Both inputs carry a "Price" column, so pandas suffixes them as
# Price_x (from transactions) and Price_y (from products).
transactions_products = pd.merge(
    transactions,
    products,
    how="inner",
    on="ProductID",
)

In [4]:
# Attach customer attributes to the transaction/product rows
# (inner join on CustomerID).
full_data = pd.merge(
    transactions_products,
    customers,
    how="inner",
    on="CustomerID",
)

In [5]:
# Sanity check: show the first five rows of the fully merged table.
full_data_preview = full_data.head(5)
print(full_data_preview)

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00761      C0199      P022  2024-10-01 05:57:09         4   
2        T00626      C0199      P079  2024-08-17 12:06:08         2   
3        T00963      C0199      P008  2024-10-26 00:01:58         2   
4        T00112      C0146      P067  2024-05-27 22:23:54         1   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      550.16   137.54               HomeSense Wall Art   Home Decor   137.54   
2      834.74   417.37                   ActiveWear Rug   Home Decor   417.37   
3      293.70   146.85      BookWorld Bluetooth Speaker  Electronics   146.85   
4      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName  Region  SignupDate  
0   Andrea Jenkins  Europe  2022-12-03  
1   

In [6]:
# Aggregate the transaction-level table down to one row per customer.
# Each entry maps an output column to (source column, aggregation).
aggregations = {
    # Total and average monetary value across the customer's transactions.
    "TotalSpent": pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    "AvgSpent": pd.NamedAgg(column="TotalValue", aggfunc="mean"),
    # Number of distinct products bought.
    "ProductCount": pd.NamedAgg(column="ProductID", aggfunc="nunique"),
    # Number of transaction rows.
    "TotalTransactions": pd.NamedAgg(column="TransactionID", aggfunc="count"),
    # Most frequent category; mode() may return several values on a tie,
    # in which case the first one it lists is taken.
    "Categories": pd.NamedAgg(column="Category", aggfunc=lambda cats: cats.mode()[0]),
}

per_customer = full_data.groupby("CustomerID").agg(**aggregations)
customer_features = per_customer.reset_index()

In [7]:
# One-hot encode each customer's most frequent category into Category_* columns.
# The dtype is pinned explicitly: pandas >= 2.0 defaults to bool (True/False),
# whereas earlier versions produced uint8 0/1 indicators. Pinning uint8 keeps
# the encoded columns numeric and identical across pandas versions.
# NOTE: this cell consumes the "Categories" column, so it must be run exactly
# once after the aggregation cell (re-running it alone raises a KeyError).
customer_features = pd.get_dummies(
    customer_features,
    columns=["Categories"],
    prefix="Category",
    dtype="uint8",
)

In [8]:
# Sanity check: show the first five rows of the per-customer feature table.
customer_features_preview = customer_features.head(5)
print(customer_features_preview)

  CustomerID  TotalSpent  AvgSpent  ProductCount  TotalTransactions  \
0      C0001     3354.52   670.904             5                  5   
1      C0002     1862.74   465.685             4                  4   
2      C0003     2725.38   681.345             4                  4   
3      C0004     5354.88   669.360             8                  8   
4      C0005     2034.24   678.080             3                  3   

   Category_Books  Category_Clothing  Category_Electronics  \
0               0                  0                     1   
1               0                  1                     0   
2               0                  0                     0   
3               1                  0                     0   
4               0                  0                     1   

   Category_Home Decor  
0                    0  
1                    0  
2                    1  
3                    0  
4                    0  


In [9]:
# Customer similarities
# ---------------------
# 1. Standardise every feature column (everything except the identifier).
customer_ids = customer_features["CustomerID"]
feature_columns = customer_features.drop(columns=["CustomerID"])

scaler = StandardScaler()
features_scaled = scaler.fit_transform(feature_columns)

# 2. Pairwise cosine similarity between customers, labelled by CustomerID
#    on both axes so it can be looked up by id.
similarity_matrix = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# 3. For the first 20 customers in the feature table, keep the 3 most
#    similar other customers together with their similarity scores.
target_customers = customer_ids.iloc[:20]
lookalike_results = {}

for cust_id in target_customers:
    ranked_scores = similarity_df[cust_id].sort_values(ascending=False)
    # Remove the customer's own entry before taking the top matches.
    similar_customers = ranked_scores.drop(cust_id).head(3)
    lookalike_results[cust_id] = list(
        zip(similar_customers.index, similar_customers.values)
    )

In [10]:
# Saving results into a CSV file
# One output row per target customer; the "Lookalikes" cell is a
# comma-separated list of "[customer_id, score]" pairs with the score
# rounded to two decimals.
lookalike_data = [
    {
        "cust_id": cust_id,
        "Lookalikes": ", ".join(
            f"[{similar_cust}, {round(score, 2)}]"
            for similar_cust, score in lookalikes
        ),
    }
    for cust_id, lookalikes in lookalike_results.items()
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Preetha_S_Lookalike.csv", index=False)

print("Lookalike model results saved to 'Preetha_S_Lookalike.csv'")

Lookalike model results saved to 'Preetha_S_Lookalike.csv'
