In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [2]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [3]:
# Merge the data to associate transactions with customer and product information
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')
merged_data = pd.merge(merged_data, products, on='ProductID', how='left')

In [4]:
# Feature Engineering

# 1. Create customer-level transaction features
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    num_products_bought=('ProductID', 'nunique'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()


In [5]:
# 2. Add customer profile features (Region, SignupDate)
customer_profile = customers[['CustomerID', 'Region', 'SignupDate']]

# Convert SignupDate to number of days since signup
customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])
customer_profile['days_since_signup'] = (pd.to_datetime('today') - customer_profile['SignupDate']).dt.days

# Merge profile features with transaction features
customer_data = pd.merge(customer_features, customer_profile, on='CustomerID', how='left')
# 3. Normalize numerical features
scaler = StandardScaler()
numerical_features = ['total_spent', 'num_transactions', 'num_products_bought', 'avg_transaction_value', 'days_since_signup']
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])


In [6]:
# 3. Normalize numerical features
scaler = StandardScaler()
numerical_features = ['total_spent', 'num_transactions', 'num_products_bought', 'avg_transaction_value', 'days_since_signup']
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

In [7]:
# 4. Prepare the data for similarity calculation
customer_data.set_index('CustomerID', inplace=True)

In [8]:
# 5. Compute cosine similarity between customers
similarity_matrix = cosine_similarity(customer_data[numerical_features])

In [9]:
# 6. Create a dictionary to store the top 3 similar customers for each of the first 20 customers
lookalikes = defaultdict(list)

In [10]:
for i in range(20):  # First 20 customers (C0001 to C0020)
    customer_id = f'C{i+1:04}'
    similarities = similarity_matrix[i]
    
    # Get the similarity scores and sort by highest similarity (excluding self)
    similar_customers = sorted([(customer_data.index[j], similarities[j]) for j in range(len(similarities)) if customer_data.index[j] != customer_id], key=lambda x: x[1], reverse=True)
    
    # Select top 3 similar customers
    lookalikes[customer_id] = [(customer, round(score, 4)) for customer, score in similar_customers[:3]]


In [11]:
# Convert the dictionary to the required format
lookalike_data = [
    {
        'CustomerID': cust_id,
        'Lookalikes': str(lookalikes_list)
    }
    for cust_id, lookalikes_list in lookalikes.items()
]

# Create a DataFrame and save to Lookalike.csv
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Reuben_Joseph_Lookalike.csv", index=False)

# Show the first few rows of the lookalike data
print(lookalike_df.head())

print("Lookalike Model executed successfully. Results saved to Lookalike.csv.")

  CustomerID                                         Lookalikes
0      C0001  [('C0152', 0.9993), ('C0160', 0.9647), ('C0134...
1      C0002  [('C0029', 0.9957), ('C0192', 0.9803), ('C0025...
2      C0003  [('C0036', 0.9823), ('C0177', 0.9759), ('C0144...
3      C0004  [('C0175', 0.998), ('C0173', 0.9916), ('C0108'...
4      C0005  [('C0073', 0.9998), ('C0159', 0.9994), ('C0112...
Lookalike Model executed successfully. Results saved to Lookalike.csv.
