### Lookalike Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the cleaned dataset from data.csv (path is relative to the working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which presumably is
# a row index written out by an earlier export — confirm; no visible cell uses it.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,5.709367,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,5.709367,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,5.709367,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,6.400855,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,6.805767,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics


In [5]:
# Feature engineering: collapse the transaction rows into one row per customer.
# Named aggregation keeps the output column names equal to the source columns.
# NOTE(review): in the preview above TotalValue (5.709) is far below
# Price * Quantity (300.68), so it appears to be already transformed — confirm
# that summing it really represents total spending.
per_customer = data.groupby('CustomerID')
customer_features = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),  # summed TotalValue over all transactions
    Quantity=('Quantity', 'sum'),      # total number of units purchased
    Price=('Price', 'mean'),           # mean listed price of purchased products
).reset_index()

In [7]:
# Standardize the numeric features so that no single feature dominates the
# similarity computation purely because of its scale.
# CustomerID is an identifier, not a feature, so it is excluded before scaling.
feature_matrix = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
customer_features_scaled = scaler.fit(feature_matrix).transform(feature_matrix)

In [9]:
# Compute pairwise cosine similarity between the customers' scaled feature rows.
# With a single argument the rows are compared against each other, so
# similarities[i][j] is the similarity between customer i and customer j
# (row/column order follows customer_features).
similarities = cosine_similarity(customer_features_scaled)

In [11]:
# Build the lookalike map: for every customer, the TOP_N most similar *other*
# customers, ranked by cosine similarity (highest first).
TOP_N = 3

# Positional list of IDs; row i of `similarities` belongs to customer_ids[i].
# Using a plain list avoids label-based Series lookups that only work while the
# DataFrame index happens to be 0..n-1.
customer_ids = customer_features['CustomerID'].tolist()

lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # Exclude the customer's own entry by position rather than assuming it sorts
    # first: a duplicate feature vector, floating-point rounding, or an all-zero
    # scaled row can place another customer ahead of (or level with) the self
    # score, which would otherwise recommend the customer to themselves.
    candidates = [
        (j, score) for j, score in enumerate(similarities[i]) if j != i
    ]
    # sorted() is stable, so customers with equal scores keep their original order.
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:TOP_N]
    lookalike_map[customer_id] = [(customer_ids[j], score) for j, score in top_matches]

# Flatten the map to one row per customer, with the recommended IDs and their
# scores pipe-separated in matching order, then save to
# Smrutirekha_Swain_Lookalike.csv.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': source_id,
        'Recommended_Customers': '|'.join(match_id for match_id, _ in matches),
        'Score': '|'.join(str(score) for _, score in matches),
    }
    for source_id, matches in lookalike_map.items()
])
lookalike_df.to_csv('Smrutirekha_Swain_Lookalike.csv', index=False)

In [19]:
# Spot-check the recommendations: print the lookalike row for each of the five
# sample customers C0001-C0005.
sample_customer_ids = ('C0001', 'C0002', 'C0003', 'C0004', 'C0005')
for customer_id in sample_customer_ids:
    is_requested = lookalike_df['CustomerID'] == customer_id
    customer_data = lookalike_df[is_requested]
    print(customer_data)

  CustomerID Recommended_Customers  \
0      C0001     C0103|C0191|C0137   

                                               Score  
0  0.9940273156744326|0.9806803994606358|0.976951...  
  CustomerID Recommended_Customers  \
1      C0002     C0060|C0151|C0029   

                                               Score  
1  0.9995933709405626|0.9988527926089823|0.997841...  
  CustomerID Recommended_Customers  \
2      C0003     C0190|C0010|C0111   

                                               Score  
2  0.9711851065632908|0.970443695061573|0.9606588...  
  CustomerID Recommended_Customers  \
3      C0004     C0057|C0109|C0105   

                                               Score  
3  0.9997221753646898|0.9984632726518661|0.997859...  
  CustomerID Recommended_Customers  \
4      C0005     C0128|C0061|C0130   

                                               Score  
4  0.9996689431481038|0.9996381958512416|0.997009...  
