In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [14]:
products=pd.read_csv('/content/drive/MyDrive/zeotap assignment/Products.csv')
customers=pd.read_csv('/content/drive/MyDrive/zeotap assignment/Customers.csv')
transactions=pd.read_csv('/content/drive/MyDrive/zeotap assignment/Transactions.csv')

In [15]:
transactions = pd.merge(transactions, products, on='ProductID')

In [25]:
avg_spending = transactions.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_spending.columns = ['CustomerID', 'AvgSpending']
customer_features = pd.merge(customers, avg_spending, on='CustomerID', how='left')

In [26]:
spending = transactions.groupby('CustomerID')['TotalValue'].sum().reset_index()
spending.columns = ['CustomerID', 'Spending']
customer_features = pd.merge(customer_features, spending, on='CustomerID', how='left')

In [27]:
most_bought_category = transactions.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
most_bought_category = most_bought_category.loc[most_bought_category.groupby('CustomerID')['Count'].idxmax()]
most_bought_category = most_bought_category[['CustomerID', 'Category']]
most_bought_category.columns = ['CustomerID', 'MostBoughtCat']
customer_features = pd.merge(customer_features, most_bought_category, on='CustomerID', how='left')

In [28]:
customer_features

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,AvgSpending,Spending,MostBoughtCat
0,C0001,Lawrence Carroll,South America,2022-07-10,670.904000,3354.52,Electronics
1,C0002,Elizabeth Lutz,Asia,2022-02-13,465.685000,1862.74,Clothing
2,C0003,Michael Rivera,South America,2024-03-07,681.345000,2725.38,Home Decor
3,C0004,Kathleen Rodriguez,South America,2022-10-09,669.360000,5354.88,Books
4,C0005,Laura Weber,Asia,2022-08-15,678.080000,2034.24,Electronics
...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,1245.720000,4982.88,Home Decor
196,C0197,Christina Harvey,Europe,2023-03-21,642.883333,1928.65,Electronics
197,C0198,Rebecca Ray,Europe,2022-02-27,465.915000,931.83,Clothing
198,C0199,Andrea Jenkins,Europe,2022-12-03,494.820000,1979.28,Electronics


In [29]:
customer_features.fillna(0, inplace=True)

In [31]:
customer_features = pd.get_dummies(customer_features, columns=['MostBoughtCat', 'Region'], drop_first=True)


In [33]:
scaler = StandardScaler()
numerical_features = ['AvgSpending', 'Spending']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

In [34]:
customer_features.set_index('CustomerID', inplace=True)
customer_features.drop(columns=['CustomerName', 'SignupDate'], inplace=True)

In [35]:
similarity_matrix = cosine_similarity(customer_features)

In [36]:
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)

In [47]:
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    # Exclude the customer itself (similarity score = 1)
    similar_customers = similarity_df[customer_id].drop(customer_id).sort_values(ascending=False).head(top_n)
    return list(zip(similar_customers.index, similar_customers.values))


In [48]:
lookalike_results = {}
for customer_id in customer_features.index[:20]:
    lookalike_results[customer_id] = get_top_similar_customers(customer_id, similarity_df)

lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])


In [49]:
lookalike_df

Unnamed: 0,Lookalike1,Lookalike2,Lookalike3
C0001,"(C0190, 0.9687859885110042)","(C0181, 0.9508502793850823)","(C0048, 0.9429693455883603)"
C0002,"(C0088, 0.9909935965023925)","(C0134, 0.9610049060745005)","(C0106, 0.8990914864519157)"
C0003,"(C0052, 0.9837077693404185)","(C0152, 0.9695870377873116)","(C0031, 0.8842150157447252)"
C0004,"(C0155, 0.9736203742766467)","(C0165, 0.954676278349485)","(C0169, 0.9471664549206747)"
C0005,"(C0146, 0.9693604545072987)","(C0186, 0.9608486635157301)","(C0130, 0.8204288583639459)"
C0006,"(C0168, 0.9742038524482126)","(C0171, 0.9728366380385514)","(C0187, 0.9663392853927287)"
C0007,"(C0140, 0.9724210075734376)","(C0115, 0.90622826330719)","(C0020, 0.8110113077529856)"
C0008,"(C0038, 0.846103574489786)","(C0189, 0.8446410072492467)","(C0160, 0.8434158605722757)"
C0009,"(C0010, 0.98080058465479)","(C0111, 0.9780572917364648)","(C0198, 0.9695360283616481)"
C0010,"(C0111, 0.9950996476027949)","(C0009, 0.98080058465479)","(C0198, 0.9794764919294137)"


In [51]:
lookalike_df.to_csv('Pranav_Pillai_Lookalike.csv', index=False)