## Lookalike Model 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
Customers = pd.read_csv("Customers.csv")

In [3]:
Products = pd.read_csv("Products.csv")

In [4]:
Transactions = pd.read_csv("Transactions.csv")

In [5]:
Customers.shape, Products.shape, Transactions.shape

((200, 4), (100, 4), (1000, 7))

In [6]:
Customers.sample(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
42,C0043,Sandy Short MD,Asia,2023-02-05
77,C0078,Julia Palmer,Asia,2024-11-13
31,C0032,Dustin Campbell,South America,2024-04-17
41,C0042,Heather Riley,North America,2023-03-15
63,C0064,Martha Montgomery,Europe,2023-10-22


In [7]:
Products.sample(5)

Unnamed: 0,ProductID,ProductName,Category,Price
37,P038,TechPro Vase,Home Decor,147.22
77,P078,HomeSense Rug,Home Decor,41.18
92,P093,TechPro Vase,Home Decor,304.94
15,P016,ActiveWear Running Shoes,Clothing,330.05
67,P068,TechPro Novel,Books,447.23


In [8]:
Transactions.sample(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
486,T00514,C0032,P042,2024-11-27 20:42:52,1,379.44,379.44
792,T00354,C0070,P097,2024-05-15 00:22:27,4,1277.36,319.34
170,T00347,C0139,P040,2024-01-24 21:50:37,2,306.38,153.19
215,T00458,C0102,P061,2024-06-25 22:40:29,1,156.96,156.96
960,T00543,C0127,P098,2024-09-20 08:12:56,3,899.79,299.93


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
merged_data = pd.merge(Transactions, Customers, on='CustomerID')

In [11]:
merged_data.sample(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
943,T00257,C0051,P091,2024-08-10 00:39:50,1,222.95,222.95,Nicholas Ellis,Europe,2023-12-21
906,T00733,C0059,P094,2024-02-06 22:06:42,1,447.34,447.34,Mrs. Kimberly Wright,North America,2024-04-07
229,T00235,C0116,P084,2024-01-31 11:17:38,2,675.82,337.91,James Martinez,North America,2024-09-11
453,T00877,C0137,P021,2024-01-31 15:36:46,2,671.68,335.84,Robert Gardner,South America,2024-04-09
767,T00641,C0173,P004,2024-01-02 15:02:39,3,287.07,95.69,Francisco Young,Asia,2022-10-06


### Creating Lookalike Model

In [12]:
customer_data = merged_data.groupby('CustomerID').agg({'TransactionID': 'count', 'TotalValue': 'sum', 'Quantity': 'sum'}).reset_index()

In [13]:
customer_data.rename(columns={'TransactionID': 'NumTransactions', 'TotalValue': 'TotalSpending', 'Quantity': 'TotalQuantity'}, inplace=True)

In [16]:
customer_data.sample(5)

Unnamed: 0,CustomerID,NumTransactions,TotalSpending,TotalQuantity
69,C0070,4,3125.49,12
38,C0039,6,4239.6,18
144,C0145,8,5771.27,20
16,C0017,8,4753.82,21
77,C0078,1,995.52,2


In [17]:
customer_profiles = pd.merge(customer_data, Customers[['CustomerID', 'Region', 'SignupDate']], on='CustomerID')

In [19]:
customer_profiles.sample(5)

Unnamed: 0,CustomerID,NumTransactions,TotalSpending,TotalQuantity,Region,SignupDate
100,C0101,8,5550.99,20,Asia,2023-09-30
0,C0001,5,3354.52,12,South America,2022-07-10
101,C0102,8,6132.36,21,South America,2022-09-16
170,C0171,5,5145.35,15,South America,2022-12-16
1,C0002,4,1862.74,10,Asia,2022-02-13


In [21]:
features = customer_profiles[['NumTransactions', 'TotalSpending', 'TotalQuantity']]

In [22]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [24]:
scaled_features

array([[-0.01145819, -0.06170143, -0.12203296],
       [-0.46749414, -0.87774353, -0.44800021],
       [-0.46749414, -0.40585722,  0.20393428],
       [ 1.35664965,  1.03254704,  1.67078689],
       [-0.92353008, -0.78392861, -0.93695108],
       [-0.46749414,  0.41587942, -0.12203296],
       [-0.92353008, -0.48548229, -0.77396745],
       [ 2.26872154,  0.43997044,  1.18183602],
       [-0.92353008, -1.40630171, -1.58888557],
       [-0.46749414, -0.9571662 , -0.12203296],
       [-0.01145819,  0.14369581,  0.04095066],
       [ 0.9006137 ,  0.96492372,  1.0188524 ],
       [ 0.9006137 ,  1.38575064,  0.85586877],
       [-1.83560198, -1.72239508, -1.75186919],
       [-1.37956603, -1.26353893, -1.42590195],
       [-0.01145819,  0.22269727, -0.44800021],
       [ 1.35664965,  0.70375173,  1.34481964],
       [-0.01145819,  0.71908486,  0.85586877],
       [ 0.44457776, -0.70101045, -0.28501659],
       [-1.83560198, -1.39212827, -1.58888557],
       [ 1.35664965,  1.0681256 ,  0.692

In [25]:
similarity_matrix = cosine_similarity(scaled_features)

In [27]:
recommendations = {}

In [28]:
for idx in range(20):
    customer_id = customer_profiles['CustomerID'].iloc[idx]

In [29]:
similarity_scores = similarity_matrix[idx]

In [30]:
similar_indices = np.argsort(similarity_scores)[-4:-1]

In [31]:
 recommendations[customer_id] = [(customer_profiles['CustomerID'].iloc[i], similarity_scores[i]) for i in similar_indices]

In [32]:
lookalike_df = pd.DataFrame({'CustomerID': list(recommendations.keys()),'Lookalikes': list(recommendations.values())})

In [35]:
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0020,"[(C0063, 0.9993264511960818), (C0167, 0.999341..."


In [36]:
lookalike_output = []

for cust_id, lookalikes in recommendations.items():
    for lookalike_id, score in lookalikes:
        lookalike_output.append({'cust_id': cust_id, 'lookalike_id': lookalike_id, 'score': score})

In [37]:
lookalike_df = pd.DataFrame(lookalike_output)

In [38]:
lookalike_df.to_csv('Lookalike.csv', index=False)