In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the three input tables (customers, products, transactions) from the
# CSV files that sit next to the notebook.
csv_paths = ('Customers.csv', 'Products.csv', 'Transactions.csv')
customers, products, transactions = map(pd.read_csv, csv_paths)

In [None]:
# Build one wide table: every transaction row enriched first with the buying
# customer's attributes and then with the purchased product's attributes.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')

# -----------------------------------------------------------------
# Step 1: Data Preparation
# -----------------------------------------------------------------

In [None]:
# Summarise the transaction history of each customer into a single row.
# Named aggregation produces the final column names directly, so no separate
# rename step is needed.
customer_transactions = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),            # total spend
        TransactionCount=('TransactionID', 'count'), # number of transactions
        ProductName=('ProductName', ','.join),       # comma-separated products purchased
        Category=('Category', ','.join),             # comma-separated categories purchased
    )
    .reset_index()
)

In [None]:
# Build one indicator column per product category for every transaction row,
# then add the indicators up per customer to obtain purchase counts by category.
categories_one_hot = pd.get_dummies(
    merged_data.loc[:, ['CustomerID', 'Category']],
    columns=['Category'],
)
categories_agg = categories_one_hot.groupby(by='CustomerID').sum()


In [None]:
# Join each customer's static attributes with their spend summary and their
# per-category purchase counts. Both joins use pandas' default inner join, so
# only customers present in all three tables end up in the profile table.
profiles_with_spend = pd.merge(customers, customer_transactions, on='CustomerID')
customer_profiles = pd.merge(profiles_with_spend, categories_agg, on='CustomerID')



# -----------------------------------------------------------------
# Step 2: Feature Engineering
# -----------------------------------------------------------------

In [None]:
 #Convert 'Region' into numerical format (e.g., one-hot encoding)
# get_dummies replaces the 'Region' column with Region_* indicator columns and
# the result is bound back to the same name. Without the guard, running this
# cell a second time raises a KeyError because 'Region' no longer exists; with
# it, a re-run is a harmless no-op.
if 'Region' in customer_profiles.columns:
    customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])

In [None]:
# Preview the first two rows to sanity-check the columns built so far.
customer_profiles.head(2)

Unnamed: 0,CustomerID,CustomerName,SignupDate,TotalSpend,TransactionCount,ProductName,Category,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,2022-07-10,3354.52,5,"SoundWave Cookbook,HomeSense Wall Art,SoundWav...","Books,Home Decor,Electronics,Electronics,Elect...",1,0,3,1,False,False,False,True
1,C0002,Elizabeth Lutz,2022-02-13,1862.74,4,"BookWorld Cookware Set,BookWorld Rug,ComfortLi...","Home Decor,Home Decor,Clothing,Clothing",0,2,0,2,True,False,False,False


In [None]:
# Features to include in similarity calculation: the one-hot region flags,
# the two spend aggregates and the per-category purchase counts.

# Region indicator columns are discovered from the frame instead of being
# hard-coded, so a dataset with a different set of regions does not raise a
# KeyError. get_dummies emits them in sorted order, matching the old list.
region_columns = [col for col in customer_profiles.columns if str(col).startswith('Region_')]

# categories_agg is indexed by CustomerID (it comes from a groupby), so every
# one of its columns is a category count. Slicing with [1:] would silently
# drop the first category from the similarity features.
category_columns = list(categories_agg.columns)

features = region_columns + ['TotalSpend', 'TransactionCount'] + category_columns


In [None]:
# Min-max scale every similarity feature (MinMaxScaler's default range is
# [0, 1]) and write the scaled values back into the profile table.
scaler = MinMaxScaler()
unscaled_features = customer_profiles[features]
customer_profiles[features] = scaler.fit(unscaled_features).transform(unscaled_features)


In [None]:
# Feature matrix: one row per customer (indexed by CustomerID), one column
# per similarity feature.
feature_matrix = customer_profiles.set_index('CustomerID')[features]

# -----------------------------------------------------------------
# Step 3: Similarity Calculation
# -----------------------------------------------------------------

In [None]:
# Cosine similarity between every pair of customer feature vectors, wrapped
# in a DataFrame labelled by CustomerID on both axes for lookup by id.
customer_ids = customer_profiles['CustomerID']
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(data=similarity_matrix, index=customer_ids, columns=customer_ids)


# -----------------------------------------------------------------
# Step 4: Recommendation
# -----------------------------------------------------------------

In [None]:
# Get top 3 most similar customers for each of the first 20 customers.
# lookalike_map maps CustomerID -> list of (lookalike CustomerID, similarity score).
lookalike_map = {}
for customer_id in customers['CustomerID'][:20]:
    if customer_id not in similarity_df.index:
        # The profile table is built with inner joins, so a customer with no
        # transactions has no row in the similarity matrix. Record an empty
        # result instead of failing with a KeyError.
        lookalike_map[customer_id] = []
        continue

    # Exclude the customer by label, not by position: when another customer
    # has an identical (or, through floating-point rounding, marginally
    # higher) score, the self-match is not guaranteed to sort first, and a
    # positional slice could drop a real lookalike and keep the customer itself.
    similar_customers = similarity_df.loc[customer_id].drop(labels=customer_id).nlargest(3)

    # Store plain Python floats so that str() of the list (used when writing
    # the CSV) shows bare numbers rather than NumPy scalar reprs.
    lookalike_map[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

In [None]:
# Create Lookalike.csv: one row per customer, with that customer's
# (lookalike, score) pairs stored as the string form of the list.
lookalike_rows = [(customer_id, str(pairs)) for customer_id, pairs in lookalike_map.items()]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)


# -----------------------------------------------------------------
# Step 5: Output Recommendations
# -----------------------------------------------------------------

In [None]:
# Print a small text report: a header line per customer followed by one
# indented line per lookalike, with the score rounded to two decimals.
for customer, lookalikes in lookalike_map.items():
    report_lines = [f"Customer {customer}:"]
    report_lines.extend(
        f"   Similar Customer: {lookalike}, Similarity Score: {score:.2f}"
        for lookalike, score in lookalikes
    )
    print("\n".join(report_lines))

Customer C0001:
   Similar Customer: C0091, Similarity Score: 0.98
   Similar Customer: C0191, Similarity Score: 0.98
   Similar Customer: C0120, Similarity Score: 0.98
Customer C0002:
   Similar Customer: C0134, Similarity Score: 0.99
   Similar Customer: C0178, Similarity Score: 0.98
   Similar Customer: C0159, Similarity Score: 0.98
Customer C0003:
   Similar Customer: C0031, Similarity Score: 1.00
   Similar Customer: C0158, Similarity Score: 1.00
   Similar Customer: C0129, Similarity Score: 0.99
Customer C0004:
   Similar Customer: C0113, Similarity Score: 0.99
   Similar Customer: C0163, Similarity Score: 0.98
   Similar Customer: C0012, Similarity Score: 0.98
Customer C0005:
   Similar Customer: C0007, Similarity Score: 1.00
   Similar Customer: C0146, Similarity Score: 1.00
   Similar Customer: C0186, Similarity Score: 0.99
Customer C0006:
   Similar Customer: C0187, Similarity Score: 0.99
   Similar Customer: C0158, Similarity Score: 0.97
   Similar Customer: C0133, Similarit