In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [32]:
# Read the three source tables (customers, products, transactions) from the
# working directory, one CSV per table.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [33]:
# Merge datasets for comprehensive analysis.
# Both left joins are chained from `transactions`, so the cell depends only on
# the frames loaded above and gives the same result every time it is re-run.
# Every transaction row is kept; customer and product attributes are attached
# by their respective ID columns.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
)

# Transactions and Products both carry a `Price` column, so pandas suffixes
# them as Price_x (from transactions) and Price_y (from products).
merged_data.columns


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')

In [34]:
# Feature engineering: collapse the merged transaction rows into one profile
# row per customer.
#   TotalSpending    - sum of TotalValue
#   TransactionCount - number of distinct TransactionID values
#   AverageSpending  - mean TotalValue
customer_profile = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='nunique'),
        AverageSpending=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
    .reset_index()  # turn the CustomerID group key back into a regular column
)

In [35]:
# Add customer name and region to each profile row.
# The cell reassigns `customer_profile` from itself, so any CustomerName/Region
# columns left by a previous run are dropped first. Without this, a re-run would
# produce CustomerName_x / CustomerName_y and plain `CustomerName` would vanish.
customer_attributes = ['CustomerName', 'Region']
customer_profile = (
    customer_profile
    .drop(columns=customer_attributes, errors='ignore')  # no-op on the first run
    .merge(customers[['CustomerID'] + customer_attributes], on='CustomerID', how='left')
)


In [36]:
# Add region as a categorical feature.
# One-hot encode Region per customer; drop_first=True omits the first category,
# which then acts as the implicit baseline.
region_dummies = pd.get_dummies(customers[['CustomerID', 'Region']], columns=['Region'], drop_first=True)

# Drop dummy columns left by a previous run before merging, so re-running this
# cell does not create suffixed duplicates (Region_*_x / Region_*_y).
customer_profile = (
    customer_profile
    .drop(columns=region_dummies.columns.difference(['CustomerID']), errors='ignore')
    .merge(region_dummies, on='CustomerID', how='left')
)


In [37]:
# Standardize the numeric profile columns in place: the scaler is fitted on the
# three columns and the scaled values overwrite the raw ones in
# `customer_profile`.
numeric_features = ['TotalSpending', 'TransactionCount', 'AverageSpending']
scaler = StandardScaler().fit(customer_profile[numeric_features])
customer_profile[numeric_features] = scaler.transform(customer_profile[numeric_features])


In [38]:
# Pairwise cosine similarity between customers, computed from the scaled
# numeric profile columns only.
# NOTE(review): the Region dummy columns merged into `customer_profile` are not
# part of `numeric_features`, so region does not influence these scores —
# confirm that this is intended.
similarity_matrix = cosine_similarity(customer_profile.loc[:, numeric_features])

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_profile['CustomerID'],
    columns=customer_profile['CustomerID'],
)


In [39]:
# Recommend the top 3 most similar customers for the first 20 customers.

# Build the CustomerID -> CustomerName lookup once, instead of filtering the
# whole profile frame with a boolean mask on every iteration.
customer_names = customer_profile.set_index('CustomerID')['CustomerName']

lookalike_results = []
for customer_id in customer_profile['CustomerID'][:20]:
    # Remove the customer's own score by label before ranking. Skipping the
    # first sorted row is not safe: another customer with a proportional
    # feature vector also scores ~1.0 and can sort ahead of the self-match,
    # which would make a customer their own lookalike.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .nlargest(3)
    )
    # The source customer's name does not change across the inner loop.
    original_name = customer_names.loc[customer_id]
    for similar_id, score in zip(similar_customers.index, similar_customers.values):
        lookalike_results.append({
            "cust_id": customer_id,
            "cust_name": original_name,
            "lookalike_id": similar_id,
            "lookalike_name": customer_names.loc[similar_id],
            "similarity_score": score
        })


In [41]:
# Turn the collected recommendation records into a table and write it to CSV
# without the row index.
lookalike_df = pd.DataFrame(data=lookalike_results)
lookalike_df.to_csv(
    'FirstName_LastName_Lookalike.csv',
    index=False,
)

# Preview the first five recommendation rows as plain text.
print(lookalike_df.head(5))

  cust_id         cust_name lookalike_id  lookalike_name  similarity_score
0   C0001  Lawrence Carroll        C0137  Robert Gardner          0.999360
1   C0001  Lawrence Carroll        C0152    Justin Evans          0.995658
2   C0001  Lawrence Carroll        C0121   Mark Atkinson          0.993012
3   C0002    Elizabeth Lutz        C0029    Erin Manning          0.999638
4   C0002    Elizabeth Lutz        C0199  Andrea Jenkins          0.998867
