In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [13]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [14]:
merged_data = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')
print("Merged Dataset Sample", merged_data.head())

Merged Dataset Sample   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLi

In [15]:
temp = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Quantity': 'sum',
    'Category': lambda x: x.mode().iloc[0],
    'Region': 'first'
}).reset_index()

In [16]:
temp.rename(columns={
    'TotalValue': 'TotalSpending',
    'TransactionID': 'TransactionCount',
    'Quantity': 'TotalQuantity',
    'Category': 'FrequentCategory'
}, inplace=True)

In [17]:
temp.head()

Unnamed: 0,CustomerID,TotalSpending,TransactionCount,TotalQuantity,FrequentCategory,Region
0,C0001,3354.52,5,12,Electronics,South America
1,C0002,1862.74,4,10,Clothing,Asia
2,C0003,2725.38,4,14,Home Decor,South America
3,C0004,5354.88,8,23,Books,South America
4,C0005,2034.24,3,7,Electronics,Asia


Label Encoding

In [21]:
encoder = LabelEncoder()

temp['Region'] = encoder.fit_transform(temp['Region'])
temp['FrequentCategory'] = encoder.fit_transform(temp['FrequentCategory'])

Normalize the Numerical Features

In [22]:
scaler = StandardScaler()
numeric_features = ['TotalSpending', 'TransactionCount', 'TotalQuantity']
temp[numeric_features] = scaler.fit_transform(temp[numeric_features])

Cosine Similarity

In [23]:
features_matrix = temp.drop(columns=['CustomerID']).values
cosine_sim_matrix = cosine_similarity(features_matrix)

In [24]:
similarity_scores = pd.DataFrame(
    cosine_sim_matrix,
    index= temp['CustomerID'],
    columns= temp['CustomerID']
)

In [26]:
def solve(cust_df, sim_scores, top_n=3):
    ans = {}

    for cust_id in cust_df['CustomerID']:
        scores = sim_scores.loc[cust_id]
        scores = scores[scores.index != cust_id]

        highest_sim = scores.nlargest(top_n).items()
        ans[cust_id] = []

        for sim_cust_id, score in highest_sim:
            rounded_score = round(score, 2)
            ans_tuple = (sim_cust_id, rounded_score)
            ans[cust_id].append(ans_tuple)

    return ans


In [27]:
cust_list = customers[customers['CustomerID'].str.startswith('C00')].head(20)
customer_recommendations = solve(cust_list, similarity_scores)

In [28]:
output_data = {'cust_id': [], 'lookalikes': []}
for cust_id, recs in customer_recommendations.items():
    output_data['cust_id'].append(cust_id)
    output_data['lookalikes'].append(recs)

output_df = pd.DataFrame(output_data)
output_df.to_csv('Rishav_Jain_Lookalike.csv', index=False)

print("Recommendations successfully saved to 'Rishav_Jain_Lookalike.csv'")

Recommendations successfully saved to 'Rishav_Jain_Lookalike.csv'


In [29]:
output = pd.read_csv("Rishav_Jain_Lookalike.csv")
output.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0048', 1.0), ('C0190', 0.99), ('C0183', 0...."
1,C0002,"[('C0097', 0.97), ('C0128', 0.97), ('C0186', 0..."
2,C0003,"[('C0072', 0.99), ('C0055', 0.99), ('C0026', 0..."
3,C0004,"[('C0057', 0.98), ('C0122', 0.98), ('C0046', 0..."
4,C0005,"[('C0186', 1.0), ('C0128', 1.0), ('C0080', 0.99)]"
