In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Locations of the three raw input tables, named once so they are easy to repoint.
customers_csv = "P:\\Downloads\\Customers.csv"
products_csv = "P:\\Downloads\\Products.csv"
transactions_csv = "P:\\Downloads\\Transactions.csv"

# Load each table into its own DataFrame.
customers = pd.read_csv(customers_csv)
products = pd.read_csv(products_csv)
transactions = pd.read_csv(transactions_csv)

#  Preprocessing
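Merge the transactions with the customer and product tables so that every transaction row also carries the customer's details and the product's details.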

In [7]:
# Enrich each transaction with its customer's attributes, then with its
# product's attributes. Transactions stay on the left of both joins, so
# overlapping column names keep the same suffix assignment as before.
merged = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [9]:
# List the columns of the merged frame. Both the transaction side (left) and
# the product side (right) of the second merge carry a `Price` column, so the
# default merge suffixes produce `Price_x` (left) and `Price_y` (right).
merged.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')

In [11]:
# Preview the first rows of the merged frame to sanity-check the joins.
merged.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [13]:
# Collapse the transaction-level rows into one profile row per customer.
customer_profiles = (
    merged
    .groupby('CustomerID')
    .agg(
        # Total amount spent across all transactions
        TotalValue=('TotalValue', 'sum'),
        # Total number of units bought
        Quantity=('Quantity', 'sum'),
        # Mean product-side price (`Price_y`) over the customer's transactions
        Price_y=('Price_y', 'mean'),
        # Most frequent category; mode() is sorted, so ties resolve to the first value
        Category=('Category', lambda purchases: purchases.mode()[0]),
        # Region is a customer attribute, so the first value per group is taken
        Region=('Region', 'first'),
    )
    .reset_index()
)

One-hot encode categorical columns (Category and Region)

In [16]:
# Replace each categorical column with one indicator column per distinct value.
categorical_features = ['Category', 'Region']
customer_profiles = pd.get_dummies(customer_profiles, columns=categorical_features)

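Standardize the numerical features (zero mean, unit variance) so that no single feature dominates the similarity computation because of its scale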

In [19]:
# Columns that hold continuous values and therefore need rescaling.
numerical_features = ['TotalValue', 'Quantity', 'Price_y']

# Learn each column's mean and standard deviation, then rescale in place.
scaler = StandardScaler()
scaler.fit(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaler.transform(customer_profiles[numerical_features])

Compute Cosine Similarity

In [22]:
# Keep the identifiers aside; they label rows but must not enter the similarity.
customer_ids = customer_profiles['CustomerID']
feature_matrix = customer_profiles.drop(columns='CustomerID')

# Pairwise cosine similarity: entry [i, j] compares customer i with customer j.
similarity_matrix = cosine_similarity(feature_matrix)

Get the top 3 similar customers for each customer

In [25]:
# Build {customer_id: [(lookalike_id, similarity), ...]} holding the TOP_N most
# similar *other* customers for every customer, best match first.
TOP_N = 3

# Plain list for positional lookup, so the result does not depend on the
# Series index happening to be 0..n-1.
id_list = customer_ids.tolist()

lookalike_results = {}
for idx, customer_id in enumerate(id_list):
    # Drop the customer's own entry by position in the matrix rather than by
    # assuming it sorts first: another customer can tie with (or, through
    # floating-point rounding, exceed) the self-similarity, and slicing off
    # the first element would then discard a real lookalike and keep the
    # customer as their own match.
    similarity_scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    # sorted() is stable, so equally similar customers keep their original order.
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[:TOP_N]
    top_3 = [(id_list[i], score) for i, score in sorted_scores]
    lookalike_results[customer_id] = top_3

Prepare the output Lookalike.csv file

In [28]:
# Flatten the lookalike mapping into one record per customer. The lookalike
# list is stored as its string form, e.g. "[('C0002', 0.98), ...]".
output = [
    {
        'cust_id': customer_id,
        # Convert each score to a built-in float before rounding: rounding a
        # NumPy scalar yields another NumPy scalar, and str() of a list uses
        # the elements' repr, which newer NumPy versions render as
        # "np.float64(0.98)" instead of "0.98".
        'lookalikes': str([(l_id, round(float(score), 2)) for l_id, score in lookalikes]),
    }
    for customer_id, lookalikes in lookalike_results.items()
]

In [32]:
# Destination of the final lookalike table.
lookalike_csv = "P:\\Downloads\\Lookalike.csv"

# One row per customer; the positional index is not written to the file.
lookalike_df = pd.DataFrame(output)
lookalike_df.to_csv(lookalike_csv, index=False)

print("Lookalike Model completed. Results saved to Lookalike.csv.")

Lookalike Model completed. Results saved to Lookalike.csv.
