In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables; the relative file names are resolved against
# the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Enrich every transaction row with its customer's and product's attributes.
# Left joins keep all transactions even when a lookup key has no match.
# Of the two suffixed price columns produced by the merges, 'Price_y' is kept
# under the plain name 'Price' and 'Price_x' is discarded.
# NOTE(review): presumably 'Price_x' is the transaction price and 'Price_y'
# the product-catalogue price -- confirm against the CSV schemas.
merged_df = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
    .rename(columns={'Price_y': 'Price'})
    .drop(columns=['Price_x'])
)

# Collapse the transaction rows into one profile row per customer.
def _most_common(values):
    """Return the first value reported by ``values.mode()``, or 'Unknown' if it is empty."""
    modes = values.mode()
    return 'Unknown' if modes.empty else modes[0]


customer_profile = (
    merged_df
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',       # total spend
        'Quantity': 'sum',         # total units bought
        'Price': 'mean',           # average price across the customer's rows
        'Category': _most_common,  # most frequent product category
        'Region': _most_common,    # most frequent region
    })
    .reset_index()  # turn CustomerID back into an ordinary column
)

# Replace the two categorical columns with one indicator column per value.
customer_profile = pd.get_dummies(customer_profile, columns=['Category', 'Region'])

# Standardise every feature column; CustomerID is an identifier, not a
# feature, so it is left out of the matrix handed to the scaler.
feature_frame = customer_profile.drop(columns=['CustomerID'])
scaler = StandardScaler()
customer_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity: entry [i, j] compares the customers in rows
# i and j of customer_profile.
similarity_matrix = cosine_similarity(customer_features)

# Get top 3 similar customers for the first 20 customers.
# Result: {CustomerID: [(lookalike CustomerID, similarity score), ...]}.
lookalikes = {}
all_customer_ids = customer_profile['CustomerID'].values
customer_ids = all_customer_ids[:20]

for i, cust_id in enumerate(customer_ids):
    # Row i of the similarity matrix belongs to cust_id because customer_ids
    # is a prefix of the profile rows the matrix was computed from.
    # The customer itself is excluded by position rather than by assuming it
    # sorts first: another customer with an identical profile (or a
    # floating-point tie) can rank at or above the self-similarity, which
    # would otherwise list the customer as its own lookalike.
    scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # sorted() is stable, so tied scores keep their original row order.
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    # float(...) turns the NumPy scalar into a built-in float so the tuple
    # stringifies as plain numbers when the table is written out.
    top_3 = [
        (all_customer_ids[idx], float(round(score, 4)))
        for idx, score in scores[:3]
    ]
    lookalikes[cust_id] = top_3

# Build the result table: one row per customer (dictionary key becomes the
# index) and one column per entry in that customer's lookalike list.
lookalike_df = pd.DataFrame.from_dict(
    lookalikes,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
)

# Persist the table; to_csv writes both the index and the header row by default.
lookalike_df.to_csv('Lookalike.csv')

# Show the first five rows as a quick sanity check.
print(lookalike_df.head(5))


            Lookalike1       Lookalike2       Lookalike3
C0001  (C0181, 0.9826)  (C0120, 0.9674)  (C0184, 0.9556)
C0002  (C0088, 0.9946)  (C0106, 0.9644)  (C0134, 0.9333)
C0003  (C0031, 0.9525)  (C0052, 0.9473)  (C0195, 0.9438)
C0004  (C0165, 0.9639)  (C0169, 0.9554)  (C0087, 0.9488)
C0005  (C0140, 0.9954)  (C0186, 0.9925)  (C0146, 0.9752)
