## Lookalike Model

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

In [3]:

data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [4]:
# Create customer profiles
# Aggregate features
customer_profiles = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spend
    'Category': lambda x: ' '.join(x),  # Product categories
    'TransactionID': 'count'  # Frequency of transactions
}).reset_index()

In [5]:
customer_profiles

Unnamed: 0,CustomerID,TotalValue,Category,TransactionID
0,C0001,3354.52,Books Home Decor Electronics Electronics Elect...,5
1,C0002,1862.74,Home Decor Home Decor Clothing Clothing,4
2,C0003,2725.38,Home Decor Home Decor Clothing Electronics,4
3,C0004,5354.88,Books Home Decor Home Decor Home Decor Books B...,8
4,C0005,2034.24,Home Decor Electronics Electronics,3
...,...,...,...,...
194,C0196,4982.88,Books Clothing Home Decor Home Decor,4
195,C0197,1928.65,Home Decor Electronics Electronics,3
196,C0198,931.83,Electronics Clothing,2
197,C0199,1979.28,Electronics Home Decor Home Decor Electronics,4


In [6]:
customer_profiles.to_csv('/content/customer_pofiles.csv', index=False)

In [7]:
customer_profiles

Unnamed: 0,CustomerID,TotalValue,Category,TransactionID
0,C0001,3354.52,Books Home Decor Electronics Electronics Elect...,5
1,C0002,1862.74,Home Decor Home Decor Clothing Clothing,4
2,C0003,2725.38,Home Decor Home Decor Clothing Electronics,4
3,C0004,5354.88,Books Home Decor Home Decor Home Decor Books B...,8
4,C0005,2034.24,Home Decor Electronics Electronics,3
...,...,...,...,...
194,C0196,4982.88,Books Clothing Home Decor Home Decor,4
195,C0197,1928.65,Home Decor Electronics Electronics,3
196,C0198,931.83,Electronics Clothing,2
197,C0199,1979.28,Electronics Home Decor Home Decor Electronics,4


In [8]:
# Feature engineering (One-hot encode categories)
category_encoded = customer_profiles['Category'].str.get_dummies(' ')
customer_profiles = pd.concat([customer_profiles, category_encoded], axis=1)
customer_profiles.drop(columns=['Category'], inplace=True)

In [9]:
# Normalize features
scaler = StandardScaler()
features = scaler.fit_transform(customer_profiles.drop(columns=['CustomerID']))

In [10]:
# Compute similarity scores
similarity_matrix = cosine_similarity(features)
similarity_matrix

array([[ 1.        , -0.37410717, -0.14587591, ..., -0.5071157 ,
         0.46160478,  0.17943812],
       [-0.37410717,  1.        ,  0.64582876, ...,  0.27161223,
         0.40862251, -0.27463717],
       [-0.14587591,  0.64582876,  1.        , ...,  0.41912323,
         0.6513717 ,  0.13148628],
       ...,
       [-0.5071157 ,  0.27161223,  0.41912323, ...,  1.        ,
         0.26470747, -0.53764899],
       [ 0.46160478,  0.40862251,  0.6513717 , ...,  0.26470747,
         1.        , -0.2837893 ],
       [ 0.17943812, -0.27463717,  0.13148628, ..., -0.53764899,
        -0.2837893 ,  1.        ]])

In [11]:
# Map customer IDs to indices
customer_ids = customer_profiles['CustomerID'].tolist()
customer_index_map = {cid: idx for idx, cid in enumerate(customer_ids)}

In [12]:
# Find top 3 lookalikes for first 20 customers
lookalikes = {}
for target_customer in customer_ids[:20]:
    target_idx = customer_index_map[target_customer]
    similarities = list(enumerate(similarity_matrix[target_idx]))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Get top 3 lookalikes (excluding the customer itself)
    top_3 = [(customer_ids[idx], score) for idx, score in similarities[1:4]]
    lookalikes[target_customer] = top_3

In [13]:

lookalike_data = []
for cust_id, top_3 in lookalikes.items():
    for sim_cust, score in top_3:
        lookalike_data.append([cust_id, sim_cust, score])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('/content/Lookalike.csv', index=False)

print("Lookalike model completed! Results saved to Lookalike.csv.")

Lookalike model completed! Results saved to Lookalike.csv.
