## ***************** Lookalike Model *****************
### To build a lookalike model which provides recommendations based on customer profile and transation history

In [75]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [76]:
# Read the data
customer_df=pd.read_csv("Customers.csv")
product_df=pd.read_csv("Products.csv")
transactions_df=pd.read_csv("Transactions.csv")

### Data Description

1. customers data

* CustomerID: Unique identifier for each customer.
* CustomerName: Name of the customer.
* Region: Continent where the customer resides.
* SignupDate: Date when the customer signed up.

2. Products data

* ProductID: Unique identifier for each product.
* ProductName: Name of the product.
* Category: Product category.
* Price: Product price in USD.

3. Transactions data

* TransactionID: Unique identifier for each transaction.
* CustomerID: ID of the customer who made the transaction.
* ProductID: ID of the product sold.
* TransactionDate: Date of the transaction.

### View data 

In [77]:
#Customer data
customer_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [78]:
# Product data
product_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [79]:
# Transaction data
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


### Data Preprocessing

In [80]:
# Check for missing values
print(f'The number of missing values in customer data: {customer_df.isnull().sum().sum()}')
print(f'The number of missing values in product data: {product_df.isnull().sum().sum()}')
print(f'The number of missing values in transactions data: {transactions_df.isnull().sum().sum()}')

The number of missing values in customer data: 0
The number of missing values in product data: 0
The number of missing values in transactions data: 0


* Combine Customer profiles with their transaction data and product information

In [81]:
# Merge transactions with customer info
transactions_df=transactions_df.merge(customer_df[['CustomerID','Region']],on='CustomerID',how='left')

# Merge transactions with product info to get product details
transactions_df=transactions_df.merge(product_df,on='ProductID',how='left')

# Rename columns and drop duplicates
transactions_df['Price']=transactions_df['Price_x']
transactions_df=transactions_df.drop(columns=['Price_y','Price_x'],axis=True)

# view merged data
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Region,ProductName,Category,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Asia,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,South America,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Europe,ComfortLiving Bluetooth Speaker,Electronics,300.68


### Feature Engineering

* One-Hot Encoding for product category 

In [82]:
# Create a pivot table with customer-product interaction
#customer_product_interaction = transactions_df.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)
customer_product_matrix = transactions_df.pivot_table(index='CustomerID',columns='ProductID',values='Quantity',fill_value=0)
# Standardize the features 
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(customer_product_matrix),
                                                    columns=customer_product_matrix.columns,
                                                    index=customer_product_matrix.index)

# Preview the customer-product interaction
scaled_df.head()


ProductID,P001,P002,P003,P004,P005,P006,P007,P008,P009,P010,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,-0.22536,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,2.779355,-0.200844,-0.173544,-0.12926,-0.147464
C0002,-0.191554,-0.198313,-0.234031,7.479386,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,-0.22536,...,-0.180071,-0.20038,-0.178562,-0.217643,3.552778,-0.25961,-0.200844,-0.173544,-0.12926,-0.147464
C0003,-0.191554,6.115967,-0.234031,-0.173045,-0.201759,3.835813,-0.189473,-0.220871,-0.172791,-0.22536,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.25961,-0.200844,-0.173544,-0.12926,-0.147464
C0004,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,4.405786,-0.172791,-0.22536,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.25961,4.595304,-0.173544,-0.12926,-0.147464
C0005,-0.191554,-0.198313,-0.234031,-0.173045,-0.201759,-0.246148,-0.189473,-0.220871,-0.172791,-0.22536,...,-0.180071,-0.20038,-0.178562,-0.217643,-0.178108,-0.25961,-0.200844,-0.173544,-0.12926,-0.147464


* Compute cosine similarity

In [83]:

# Calculate cosine similarity
similarity = cosine_similarity(scaled_df)

* Create top 3 lookalikes with there similarity scores for the first 20 customers

In [88]:
# Create a dictionary to store lookalike customers
lookalikes = {}

# Find top 3 lookalikes for the first 20 customers
for i, customer_id in enumerate(customer_product_matrix.index[:20]):
    similar_customers = similarity[i].argsort()[::-1][1:4]  # Exclude self (index 0)
    lookalikes[customer_id] = []
    for j in similar_customers:
        lookalikes[customer_id].append((customer_product_matrix.index[j],similarity[i][j]))

# Create a DataFrame for the lookalikes

lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index')
lookalike_df.columns = ['LookalikeCustomerID_1', 'LookalikeCustomerID_2','LookalikeCustomerID_3']  
lookalike_df.index.name = 'Target CustomerID'

# Save the results to a CSV file
lookalike_df.to_csv('Soumya_H_Lookalike.csv')

print("Lookalike customers with similarity scores:")
lookalike_df

Lookalike customers with similarity scores:


Unnamed: 0_level_0,LookalikeCustomerID_1,LookalikeCustomerID_2,LookalikeCustomerID_3
Target CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0001,"(C0194, 0.403396266993008)","(C0020, 0.3653990212204828)","(C0104, 0.34248752149926986)"
C0002,"(C0091, 0.4340902480599487)","(C0030, 0.40365337637928855)","(C0071, 0.32063620965199896)"
C0003,"(C0181, 0.47469321761270017)","(C0134, 0.46854469246418057)","(C0144, 0.4080853545633678)"
C0004,"(C0070, 0.38358072245428865)","(C0175, 0.3071395013311464)","(C0105, 0.26965547925167827)"
C0005,"(C0096, 0.48776309944956087)","(C0023, 0.47056277311097094)","(C0055, 0.3780574021622297)"
C0006,"(C0040, 0.4571832502984092)","(C0196, 0.3841944645035154)","(C0058, 0.37537113954618717)"
C0007,"(C0079, 0.6176418329255738)","(C0118, 0.47168438980984445)","(C0020, 0.4571595870720856)"
C0008,"(C0144, 0.3055166514172029)","(C0028, 0.28311706080516774)","(C0165, 0.26642269454273465)"
C0009,"(C0140, 0.5265061974196308)","(C0083, 0.4931338776850121)","(C0162, 0.47164927440704874)"
C0010,"(C0094, 0.486442319457847)","(C0143, 0.37643820912759507)","(C0092, 0.35560901675285933)"
