In [2]:
# Task 2: Lookalike Model

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the raw customer, transaction and product tables.
customers = pd.read_csv('../data/Customers.csv')
transactions = pd.read_csv('../data/Transactions.csv')
products = pd.read_csv('../data/Products.csv')  # not used by the features built in this cell

# Attach customer attributes to every transaction. merge() defaults to an
# inner join, so only CustomerIDs present in both tables survive; customers
# without any transaction therefore never reach customer_summary.
merged_data = transactions.merge(customers, on='CustomerID')

# Build one feature row per customer: total spending, average spending per
# transaction, and number of transactions.
customer_summary = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    avg_spent=('TotalValue', 'mean'),
    num_purchases=('TransactionID', 'count'),
).reset_index()

# Standardize every feature (z-score: subtract the mean, divide by the sample
# standard deviation) so that no single feature dominates the similarity.
feature_columns = ['total_spent', 'avg_spent', 'num_purchases']
for column in feature_columns:
    values = customer_summary[column]
    spread = values.std()
    # A constant column has std 0 and a single-row table has std NaN; dividing
    # by either would put NaN/inf into the feature matrix. Fall back to 1 so
    # such a column simply becomes all zeros.
    if not spread > 0:
        spread = 1.0
    customer_summary[column] = (values - values.mean()) / spread

# Pairwise cosine similarity between customers; row/column i corresponds to
# row i of customer_summary.
cosine_sim = cosine_similarity(customer_summary[feature_columns])

# Function to get the top-N lookalike customers for one customer
def get_lookalikes(customer_id, top_n=3, summary=None, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Value to look up in the ``CustomerID`` column.
        top_n: Maximum number of lookalikes to return. Values <= 0 yield an
            empty list.
        summary: DataFrame with a ``CustomerID`` column whose row order
            matches the rows of ``similarity``. Defaults to the module-level
            ``customer_summary``.
        similarity: Square matrix of pairwise similarity scores. Defaults to
            the module-level ``cosine_sim``.

    Returns:
        A list of ``(CustomerID, score)`` tuples ordered by descending score;
        ties keep row order. The queried customer is never included. Scores
        are plain Python floats.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``summary``.
    """
    if summary is None:
        summary = customer_summary
    if similarity is None:
        similarity = cosine_sim

    ids = summary['CustomerID']
    positions = np.flatnonzero(ids.eq(customer_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer summary")
    customer_pos = int(positions[0])

    if top_n <= 0:
        return []

    scores = np.asarray(similarity[customer_pos], dtype=float)
    # Stable descending sort: equal scores stay in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the customer by position rather than assuming they sort first;
    # another customer with an equal (or rounding-inflated) score may
    # otherwise push them into the results.
    ranked = ranked[ranked != customer_pos][:top_n]

    return [(ids.iloc[pos], float(scores[pos])) for pos in ranked]

# Get the top 3 lookalike customers for the first 20 customers in
# customer_summary. Those rows are 'C0001' to 'C0020' only when every one of
# these customers has at least one transaction, because customers without
# transactions are absent from customer_summary.
# Scores are converted to plain floats so the tuples are written to the CSV as
# "('C0137', 0.99...)" rather than with NumPy's scalar repr.
lookalikes = {
    customer_id: [(match_id, float(score)) for match_id, score in get_lookalikes(customer_id)]
    for customer_id in customer_summary['CustomerID'][:20]
}

# Save lookalikes to CSV: one row per customer, one column per lookalike.
# from_dict(orient='index') builds that layout directly and also tolerates
# rows of different length (fewer lookalikes than requested).
lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index')
lookalike_df.to_csv('../outputs/Task2_Lookalike.csv', header=False)

lookalike_df.head()  # Displaying the first 5 lookalikes



Unnamed: 0,0,1,2
C0001,"(C0137, 0.9993600788417095)","(C0152, 0.9956575062125337)","(C0121, 0.9930123335059389)"
C0002,"(C0029, 0.9996379596731121)","(C0199, 0.9988672178177447)","(C0010, 0.9988313958473667)"
C0003,"(C0005, 0.9998942821541472)","(C0178, 0.9995651300881158)","(C0144, 0.9992167799811654)"
C0004,"(C0067, 0.9999912751252354)","(C0021, 0.9996580546333909)","(C0075, 0.9992876794675158)"
C0005,"(C0003, 0.9998942821541472)","(C0073, 0.9994945201465776)","(C0063, 0.9992593273871893)"
