<a href="https://colab.research.google.com/github/sai-1903/sai-1903/blob/main/Saikiran_santhpale_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the datasets.
# DATA_DIR is the single place to change when the CSV files live somewhere other
# than the filesystem root; with the default "/" the resolved paths are the same
# as the previously hard-coded "/Customers.csv", "/Products.csv", "/Transactions.csv".
DATA_DIR = Path("/")

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")


In [5]:
# Aggregate customer-level data: one row per CustomerID holding the sum of
# TotalValue and the sum of Quantity over that customer's transactions.
customer_data = (
    transactions
    .groupby('CustomerID', as_index=False)
    .agg(TotalSpend=('TotalValue', 'sum'), TotalQuantity=('Quantity', 'sum'))
)

# The left join keeps customers that have no transactions; their aggregates come
# back as NaN. Fill only those two columns with 0 -- a frame-wide fillna(0) would
# also replace a missing CustomerName, Region or SignupDate with the number 0.
customer_profile = (
    customers
    .merge(customer_data, on="CustomerID", how="left")
    .fillna({'TotalSpend': 0, 'TotalQuantity': 0})
)
print(customer_profile.head())

  CustomerID        CustomerName         Region  SignupDate  TotalSpend  \
0      C0001    Lawrence Carroll  South America  2022-07-10     3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74   
2      C0003      Michael Rivera  South America  2024-03-07     2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88   
4      C0005         Laura Weber           Asia  2022-08-15     2034.24   

   TotalQuantity  
0           12.0  
1           10.0  
2           14.0  
3           23.0  
4            7.0  


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


TOP_N = 3  # number of lookalikes reported per customer

# Standardize the two aggregates (zero mean, unit variance per column).
features = customer_profile[['TotalSpend', 'TotalQuantity']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Pairwise cosine similarity; row/column i corresponds to row i of customer_profile.
# NOTE(review): with only two standardized features, cosine similarity compares the
# direction from the mean and ignores magnitude, which is why so many pairs score
# 1.0 in the printed output -- consider whether a distance-based metric fits better.
similarity_matrix = cosine_similarity(scaled_features)

# Pull the IDs once instead of doing a DataFrame row lookup per match.
customer_ids = customer_profile['CustomerID'].to_numpy()

lookalikes = {}
for idx, customer in enumerate(customer_ids):
    # Stable descending order: tied scores keep their row order, the same
    # tie-breaking that sorted(..., reverse=True) applied before.
    ranked = np.argsort(-similarity_matrix[idx], kind='stable')
    # Exclude the customer's own row by index. Skipping "the first entry" is not
    # safe: an earlier row with a tied (or, through float rounding, marginally
    # higher) score sorts ahead of the self-match, which would drop a real match
    # and list the customer as its own lookalike.
    top_matches = [pos for pos in ranked if pos != idx][:TOP_N]
    lookalikes[customer] = [
        (customer_ids[pos], round(float(similarity_matrix[idx, pos]), 2))
        for pos in top_matches
    ]

# Save to CSV: one row per customer, columns Lookalike1, Score1, ..., Lookalike<TOP_N>, Score<TOP_N>.
lookalike_columns = [
    column
    for rank in range(1, TOP_N + 1)
    for column in (f'Lookalike{rank}', f'Score{rank}')
]

lookalike_rows = {}
for customer, matches in lookalikes.items():
    # Pad short match lists (fewer than TOP_N other customers) so every row has
    # the same width instead of raising IndexError.
    padded = matches + [(None, np.nan)] * (TOP_N - len(matches))
    lookalike_rows[customer] = [value for match in padded for value in match]

lookalikes_df = pd.DataFrame.from_dict(
    lookalike_rows,
    orient='index',
    columns=lookalike_columns
)
lookalikes_df.to_csv('Lookalike.csv', index_label='CustomerID')
print(lookalikes_df.head())

      Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
C0001      C0164     1.0      C0085    1.00      C0127    1.00
C0002      C0157     1.0      C0094    1.00      C0029    1.00
C0003      C0111     1.0      C0160    0.99      C0147    0.99
C0004      C0162     1.0      C0165    1.00      C0175    1.00
C0005      C0080     1.0      C0167    1.00      C0177    1.00
