# Lookalike Model

In [2]:
!pip install pandas numpy matplotlib seaborn scikit-learn



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

customers = pd.read_csv('./Customers.csv')
products = pd.read_csv('./Products.csv')
transactions = pd.read_csv('./Transactions.csv')

## Prepare the Data

In [5]:
merged_df = transactions.merge(customers, on='CustomerID', how='left')
merged_df = merged_df.merge(products, on='ProductID', how='left')

## feature set for each customer

In [6]:
customer_features = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0]  # Most frequent category
}).rename(columns={
    'TotalValue': 'TotalSpending',
    'TransactionID': 'NumTransactions',
    'Category': 'PreferredCategory'
}).reset_index()
print(customer_features.head())

  CustomerID  TotalSpending  NumTransactions PreferredCategory
0      C0001        3354.52                5       Electronics
1      C0002        1862.74                4          Clothing
2      C0003        2725.38                4        Home Decor
3      C0004        5354.88                8             Books
4      C0005        2034.24                3       Electronics


## Encode Categorical Data

### Encode the PreferredCategory column using one-hot encoding

In [7]:
customer_features = pd.get_dummies(customer_features, columns=['PreferredCategory'])

## Calculate Similarity

### cosine similarity to find similar customers

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_features.iloc[:, 1:])  # Exclude CustomerID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID']
)
print(similarity_df.head())

CustomerID  C0001  C0002  C0003  C0004  C0005     C0006     C0007     C0008  \
CustomerID                                                                    
C0001         1.0    1.0    1.0    1.0    1.0  1.000000  1.000000  1.000000   
C0002         1.0    1.0    1.0    1.0    1.0  0.999999  0.999999  1.000000   
C0003         1.0    1.0    1.0    1.0    1.0  1.000000  1.000000  1.000000   
C0004         1.0    1.0    1.0    1.0    1.0  1.000000  1.000000  1.000000   
C0005         1.0    1.0    1.0    1.0    1.0  1.000000  1.000000  0.999999   

CustomerID     C0009     C0010  ...  C0191  C0192     C0193  C0194  C0195  \
CustomerID                      ...                                         
C0001       0.999998  0.999999  ...    1.0    1.0  1.000000    1.0    1.0   
C0002       0.999999  1.000000  ...    1.0    1.0  0.999999    1.0    1.0   
C0003       0.999998  0.999999  ...    1.0    1.0  1.000000    1.0    1.0   
C0004       0.999998  0.999999  ...    1.0    1.0  1.000000  

## Top 3 Similar Customers

### For each of the first 20 customers (C0001 to C0020)

In [9]:
lookalike_map = {}

for customer in customer_features['CustomerID'][:20]:
    similar_customers = similarity_df[customer].sort_values(ascending=False).iloc[1:4]
    lookalike_map[customer] = list(zip(similar_customers.index, similar_customers.values))

print(lookalike_map)

{'C0001': [('C0039', 0.9999999952285652), ('C0035', 0.9999999945678594), ('C0146', 0.999999993731339)], 'C0002': [('C0029', 0.9999999954613772), ('C0103', 0.9999999846113045), ('C0010', 0.9999999824949058)], 'C0003': [('C0178', 0.9999999999572835), ('C0152', 0.9999999973975879), ('C0159', 0.999999993115422)], 'C0004': [('C0021', 0.9999999998367862), ('C0075', 0.9999999988734848), ('C0101', 0.9999999985853775)], 'C0005': [('C0112', 0.9999999982426493), ('C0197', 0.9999999963783862), ('C0035', 0.9999999919482886)], 'C0006': [('C0117', 0.9999999988358368), ('C0171', 0.9999999987827094), ('C0044', 0.999999997180717)], 'C0007': [('C0120', 0.9999999985154543), ('C0026', 0.9999999937558456), ('C0140', 0.9999999845489194)], 'C0008': [('C0076', 0.9999999614936271), ('C0139', 0.9999999197663533), ('C0047', 0.9999999071628308)], 'C0009': [('C0077', 0.9999999689015278), ('C0049', 0.999999760463356), ('C0111', 0.9999994850024262)], 'C0010': [('C0029', 0.9999999957831188), ('C0088', 0.99999999343126

## Save the Lookalike Map results to a CSV file

In [12]:
import csv

with open('FirstName_LastName_Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for key, value in lookalike_map.items():
        writer.writerow([key, value])