Connect with the Drive

In [72]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Libraries & Imports

In [73]:
import numpy as np
import pandas as pd

Load Datasets

In [74]:
customer_data = pd.read_csv("/content/drive/MyDrive/Customers.csv")
product_data = pd.read_csv("/content/drive/MyDrive/Products.csv")
transaction_data = pd.read_csv("/content/drive/MyDrive/Transactions.csv")

Data preprocessing

In [75]:
# Merge df to combine given Customer, Product and Transaction data
data1 = pd.merge(transaction_data, customer_data, on='CustomerID')
data = pd.merge(data1, product_data, on='ProductID')

In [76]:
data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


In [77]:
data.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')

In [78]:
# Feature Engineering - aggregate customer transaction data
customer_feats = data.groupby('CustomerID').agg({
    'TotalValue' : 'sum',
    'Quantity' : 'sum',
    'ProductID' : lambda x: list(x.unique()),  # Product IDs purchased
    'Category' : lambda x: list(x.unique())  # Product categories purchased

}).reset_index()

customer_feats.head(5)

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,Category
0,C0001,3354.52,12,"[P054, P022, P096, P083, P029]","[Books, Home Decor, Electronics]"
1,C0002,1862.74,10,"[P095, P004, P019, P071]","[Home Decor, Clothing]"
2,C0003,2725.38,14,"[P025, P006, P035, P002]","[Home Decor, Clothing, Electronics]"
3,C0004,5354.88,23,"[P049, P053, P038, P025, P097, P024, P008, P077]","[Books, Home Decor, Electronics]"
4,C0005,2034.24,7,"[P025, P039, P012]","[Home Decor, Electronics]"


In [79]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [80]:
def most_frequent_category(categories):
    """getting the most frequently purchased category for a customer."""
    if isinstance(categories, list):
        return max(set(categories), key=categories.count)
    return categories

In [81]:
customer_features = data.groupby("CustomerID").agg({
    'TotalValue': 'sum',                  # Total amount spent by the customer
    'Quantity': 'sum',                   # Total quantity of products purchased
    'Category': lambda x: most_frequent_category(x.tolist())  # Most frequent category
}).reset_index()

In [82]:
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Category
0,C0001,3354.52,12,Electronics
1,C0002,1862.74,10,Clothing
2,C0003,2725.38,14,Home Decor
3,C0004,5354.88,23,Books
4,C0005,2034.24,7,Electronics
...,...,...,...,...
194,C0196,4982.88,12,Home Decor
195,C0197,1928.65,9,Electronics
196,C0198,931.83,3,Clothing
197,C0199,1979.28,9,Home Decor


In [83]:
# Applying Scaling on two columns - TotalValue and Quantity

scaler = StandardScaler()
customer_features[['TotalValue', 'Quantity']] = scaler.fit_transform(customer_features[['TotalValue', 'Quantity']])

In [84]:
# Applying One-hot encoding on product categories

encoder = OneHotEncoder(sparse_output=False)
category_encoded = encoder.fit_transform(customer_features[['Category']])

In [85]:
# Merge all features into a final customer profile vector

customer_profiles = np.hstack([
    customer_features[['TotalValue', 'Quantity']].values,  # Numerical features
    category_encoded                                       # Encoded categorical features
])

In [86]:
from sklearn.metrics.pairwise import cosine_similarity

In [87]:
# Calculate Similarity
cosine_sim = cosine_similarity(customer_profiles)

In [88]:
# Generate Lookalikes

lookalikes = {}

for idx, customer_id in enumerate(customer_features['CustomerID']):
    similarities = list(zip(customer_features['CustomerID'], cosine_sim[idx]))
    similarities.sort(key=lambda x: x[1], reverse=True)  # Sort by similarity score
    top3_similar = [(cust_id, score) for cust_id, score in similarities[1:4]]  # Top 3 excluding self
    lookalikes[customer_id] = top3_similar


In [89]:
print(lookalikes)

{'C0001': [('C0026', 0.9944375290336099), ('C0184', 0.9876079671017255), ('C0127', 0.9860433484016156)], 'C0002': [('C0029', 0.9997368422264425), ('C0088', 0.9960799027166513), ('C0062', 0.9827979759999989)], 'C0003': [('C0160', 0.9953407167187239), ('C0086', 0.9888814034840862), ('C0038', 0.9888263190913459)], 'C0004': [('C0175', 0.9930197186253533), ('C0017', 0.9918782027309466), ('C0075', 0.9896356228258206)], 'C0005': [('C0186', 0.9969474860655397), ('C0192', 0.9964358373774421), ('C0112', 0.9950756043735618)], 'C0006': [('C0117', 0.9954111442890659), ('C0064', 0.9845049485871493), ('C0187', 0.9790254134285984)], 'C0007': [('C0146', 0.9999942326269695), ('C0120', 0.9924774477208783), ('C0005', 0.9847182182201702)], 'C0008': [('C0136', 0.9980967511817108), ('C0124', 0.9973048459008356), ('C0113', 0.9879801656527413)], 'C0009': [('C0198', 0.9999780423996943), ('C0150', 0.9991889487691662), ('C0058', 0.9977930533082471)], 'C0010': [('C0176', 0.9786286274192945), ('C0029', 0.9733484205

Save to .csv file

In [90]:
lookalike_data = pd.DataFrame(lookalikes.items(), columns=['cust_id', 'lookalikes'])
# lookalike_data
lookalike_data.to_csv("Rishav_Lookalike.csv", index=False)