In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the three raw tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("customers.csv", "products.csv", "transactions.csv")
)

In [3]:
# Show the first rows of each raw table under its own heading.
for heading, frame in (
    ("Customers Data:", customers),
    ("\nProducts Data:", products),
    ("\nTransactions Data:", transactions),
):
    print(heading)
    display(frame.head())

Customers Data:


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15



Products Data:


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31



Transactions Data:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [4]:
# Merge datasets: attach customer attributes, then product attributes, to
# every transaction. Left joins keep all transaction rows even when a key
# has no match. Both `transactions` and `products` carry a `Price` column,
# so pandas suffixes them as `Price_x` (transaction) and `Price_y` (product).
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID', how='left')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID', how='left')

# Display merged dataset
print("\nMerged Dataset:")
display(merged_data.head())


Merged Dataset:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [5]:
# Feature Engineering
# Transaction frequency: number of non-null TransactionID values per customer.
customer_transaction_count = (
    merged_data
    .groupby('CustomerID')['TransactionID']
    .count()
    .rename('TransactionFrequency')
)


In [6]:
# Average transaction value: mean TotalValue across each customer's transactions.
total_value_by_customer = merged_data.groupby('CustomerID')['TotalValue']
customer_avg_value = total_value_by_customer.mean().rename('AvgTransactionValue')

In [7]:
# Favorite product category
def _most_common_category(categories):
    """Return the most frequent value in ``categories``, or NaN if there is none.

    ``Series.mode`` ignores NaN, so a customer whose transactions all have a
    missing Category (possible because products are attached with a left
    merge) yields an empty result; indexing it with ``.iloc[0]`` would raise
    ``IndexError``. When several categories tie, the first entry of the
    (sorted) mode result is used.
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else np.nan


customer_favorite_category = (
    merged_data.groupby('CustomerID')['Category']
    .agg(_most_common_category)
    .rename('FavoriteCategory')
)

In [8]:
# Combine all features into a single DataFrame: one row per CustomerID,
# one column per engineered feature (aligned on the shared CustomerID index).
feature_series = [
    customer_transaction_count,
    customer_avg_value,
    customer_favorite_category,
]
customer_features = pd.concat(feature_series, axis=1)

In [9]:
# One-hot encode favorite category so it can take part in the numeric
# similarity computation. NOTE: this replaces the 'FavoriteCategory' column,
# so the cell cannot be re-run without first rebuilding `customer_features`.
categorical_columns = ['FavoriteCategory']
customer_features = pd.get_dummies(customer_features, columns=categorical_columns)

In [10]:
# Standardize numeric features so that frequency and spend are on a
# comparable scale before computing similarity. The scaled values overwrite
# the original columns in `customer_features`.
numeric_features = ['TransactionFrequency', 'AvgTransactionValue']
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(customer_features[numeric_features])
customer_features[numeric_features] = scaled_numeric

In [11]:
# Compute similarity: pairwise cosine similarity between customer feature
# rows, wrapped in a DataFrame labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(customer_features)
similarity_labels = customer_features.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=similarity_labels,
    columns=similarity_labels,
)

In [12]:
# Generate Lookalike Recommendations
# Maps each target CustomerID to a list of (lookalike CustomerID, score)
# tuples, best match first.
TOP_N_LOOKALIKES = 3

# Name the targets explicitly (C0001 to C0020) instead of taking the first 20
# rows of the feature index: a target without any transactions has no feature
# row, and positional slicing would silently pull in C0021+ in its place.
target_customer_ids = [f"C{number:04d}" for number in range(1, 21)]

lookalike_results = {}
for customer_id in target_customer_ids:
    if customer_id not in similarity_df.columns:
        # No transactions -> no features -> nothing to compare against.
        continue
    # Remove the customer's own entry by label. Skipping "position 0" after a
    # sort is unsafe: another customer with a tied top score can sort ahead,
    # which would list the customer as their own lookalike.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id)
    top_matches = candidate_scores.nlargest(TOP_N_LOOKALIKES)
    # Cast scores to plain Python floats so that str() of the list renders
    # bare numbers rather than 'np.float64(...)' under NumPy >= 2.
    lookalike_results[customer_id] = [
        (similar_customer, float(score))
        for similar_customer, score in top_matches.items()
    ]

In [13]:
# Flatten the recommendations into a two-column table: the customer and the
# string form of their list of (lookalike, score) tuples.
lookalike_customer_ids = list(lookalike_results)
recommendation_strings = list(map(str, lookalike_results.values()))
lookalike_df = pd.DataFrame(
    {
        "CustomerID": lookalike_customer_ids,
        "Recommendations": recommendation_strings,
    }
)

In [14]:
# Save Lookalike.csv (without the DataFrame index) and confirm on stdout.
# NOTE(review): the file name still looks like an unfilled
# "FirstName_LastName" template placeholder — confirm the intended name.
output_path = "FirstName_LastName_Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

print("\nLookalike recommendations saved to 'FirstName_LastName_Lookalike.csv'.")


Lookalike recommendations saved to 'FirstName_LastName_Lookalike.csv'.
