In [1]:
# Import necessary libraries
import pandas as pd  
import numpy as np 
    
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics.pairwise import cosine_similarity  

In [2]:
# Load the three raw datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# its default is the folder this notebook was originally run from.
DATA_DIR = "C:\\Users\\pc\\OneDrive\\Desktop"
customers = pd.read_csv(DATA_DIR + "\\Customers.csv")
transactions = pd.read_csv(DATA_DIR + "\\Transactions.csv")
products = pd.read_csv(DATA_DIR + "\\Products.csv")

In [3]:
# Parse the date columns into datetime64 so date arithmetic works later on.
# assign() returns a new frame, so re-running this cell is harmless.
customers = customers.assign(SignupDate=pd.to_datetime(customers['SignupDate']))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

In [4]:
# Attach each transaction's customer attributes (left join keeps every transaction)
transactions_customers = pd.merge(
    transactions,
    customers,
    how='left',
    on='CustomerID',
)

In [5]:
# Attach product attributes to every transaction row (left join on ProductID)
df = pd.merge(
    transactions_customers,
    products,
    how='left',
    on='ProductID',
)

In [6]:
# One indicator column per region (Region_<name>), row-aligned with `customers`
customer_region = pd.get_dummies(data=customers['Region'], prefix='Region')

In [7]:
# Tenure = days between signup and the reference date.
# The reference date is the latest transaction in the data, not today's date.
current_date = transactions['TransactionDate'].max()
customers = customers.assign(
    TenureDays=(current_date - customers['SignupDate']).dt.days
)

In [8]:
# Total spend per customer: sum of TotalValue over all of their transactions
customer_spending = (
    df.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpending')
    .reset_index()
)

In [9]:
# Average order value per customer: mean of TotalValue over their transactions
customer_avg_order = (
    df.groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgOrderValue')
    .reset_index()
)

In [10]:
# Purchase frequency per customer: count of distinct TransactionID values
customer_frequency = (
    df.groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('NumTransactions')
    .reset_index()
)

In [11]:
# Favourite category = the category with the largest total quantity per customer.
# Step 1: total quantity per (customer, category) pair.
category_quantity = df.groupby(['CustomerID', 'Category'])['Quantity'].sum().reset_index()
# Step 2: row label of the highest-quantity category for each customer.
top_category_rows = category_quantity.groupby('CustomerID')['Quantity'].idxmax()
# Step 3: keep only those rows and the two columns needed downstream.
favorite_category = category_quantity.loc[top_category_rows, ['CustomerID', 'Category']]

In [12]:
# Turn the favourite category into FavCat_<name> indicator columns.
favorite_category_encoded = pd.get_dummies(favorite_category['Category'], prefix='FavCat')
# Both frames share the same row labels, so an index join puts the indicators
# next to the matching CustomerID. The raw Category column is dropped here.
favorite_category = favorite_category[['CustomerID']].join(favorite_category_encoded)

In [13]:
# Recency = days between a customer's latest purchase and the reference date
latest_purchase_date = df.groupby('CustomerID')['TransactionDate'].max()
last_purchase = (
    (current_date - latest_purchase_date).dt.days
    .rename('RecencyDays')
    .reset_index()
)

In [14]:
# Seed the feature table with one row per customer (ID column only)
customer_features = customers.loc[:, ['CustomerID']]

In [15]:
# Add the region indicator columns.
# This is joined on the row index, not on CustomerID: both frames were derived
# from `customers`, so their indexes line up.
customer_features = pd.merge(
    customer_features,
    customer_region,
    left_index=True,
    right_index=True,
)

In [16]:
# Add each customer's tenure in days
tenure_by_customer = customers[['CustomerID', 'TenureDays']]
customer_features = pd.merge(
    customer_features,
    tenure_by_customer,
    how='left',
    on='CustomerID',
)

In [17]:
# Add the per-customer purchase metrics: total spending, average order value,
# number of transactions, and recency. Left joins keep customers that have no
# transactions; their metric columns come out as NaN.
customer_features = (
    customer_features
    .merge(customer_spending, on='CustomerID', how='left')
    .merge(customer_avg_order, on='CustomerID', how='left')
    .merge(customer_frequency, on='CustomerID', how='left')
    .merge(last_purchase, on='CustomerID', how='left')
)

In [18]:
# Add the one-hot favourite-category flags (FavCat_* columns)
customer_features = pd.merge(
    customer_features,
    favorite_category,
    how='left',
    on='CustomerID',
)

In [19]:
# Fill the gaps left by the left-merges for customers with no transactions.
#
# RecencyDays is handled separately. A blanket 0 would mean "bought on the most
# recent day in the data", making a customer who never purchased look like the
# freshest buyer. Those customers get the largest recency observed among
# customers who did purchase.
stalest_recency = customer_features['RecencyDays'].max()
customer_features['RecencyDays'] = customer_features['RecencyDays'].fillna(stalest_recency)

# For every other column (spend, order value, transaction count,
# favourite-category flags) a missing value really does mean "none", so 0 is right.
# This also covers RecencyDays if no customer has any transactions at all.
customer_features = customer_features.fillna(0)

In [20]:
# Put the numeric features on a common scale (zero mean, unit variance) so that
# no single column dominates the similarity score. Indicator columns are left alone.
numerical_cols = [
    'TenureDays',
    'TotalSpending',
    'AvgOrderValue',
    'NumTransactions',
    'RecencyDays',
]
scaler = StandardScaler().fit(customer_features[numerical_cols])
customer_features[numerical_cols] = scaler.transform(customer_features[numerical_cols])

In [21]:
# Separate the identifier from the feature columns fed to the similarity step
customer_ids = customer_features['CustomerID']
features = customer_features.drop(columns=['CustomerID'])

In [22]:
# Pairwise cosine similarity: entry [i, j] compares customer row i with row j
similarity_matrix = cosine_similarity(X=features)

In [23]:
# Two-way lookup between a row position in similarity_matrix and a CustomerID
customer_index_to_id = dict(enumerate(customer_ids))
customer_id_to_index = {
    cust_id: position for position, cust_id in enumerate(customer_ids)
}

In [24]:
# Result container: CustomerID -> list of (lookalike CustomerID, score) pairs
lookalikes = dict()
# How many lookalikes to keep per target customer
top_n = 3

In [25]:
# Target customers are the IDs C0001 through C0020 (number zero-padded to four digits)
target_ids = [f"C{number:04d}" for number in range(1, 21)]
is_target = customer_ids.isin(target_ids)
target_customers = customer_ids[is_target]

In [26]:
# Find the top_n most similar customers for each target customer.
# Fills `lookalikes` with: CustomerID -> [(lookalike CustomerID, score), ...]
for cust_id in target_customers:
    idx = customer_id_to_index[cust_id]
    # Pair every *other* customer's row index with its similarity to this one
    # (a customer is never their own lookalike).
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first. sorted() is stable, so equal scores keep
    # ascending row order.
    top_similar = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]
    # Map row indices back to CustomerIDs. Scores are converted to plain Python
    # floats before rounding: round() on a NumPy scalar returns a NumPy scalar,
    # and under NumPy >= 2 its repr is "np.float64(0.1234)", which would end up
    # in the exported CSV when the list is turned into a string.
    lookalikes[cust_id] = [
        (customer_index_to_id[other_idx], round(float(score), 4))
        for other_idx, score in top_similar
    ]

In [27]:
# One record per target customer. The list of (CustomerID, score) tuples is
# stored as its string representation so it fits in a single CSV cell.
lookalike_data = [
    {'CustomerID': cust_id, 'Lookalikes': str(similars)}
    for cust_id, similars in lookalikes.items()
]

In [28]:
# Tabulate the records: one row per target customer
lookalike_df = pd.DataFrame(data=lookalike_data)

In [29]:
# Write the recommendations to Lookalike.csv in the working directory (no index column)
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)