In [2]:
# import libraries

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# Data Preparation : Load Datasets

customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')


In [4]:
# Preprocess the data

customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])     # Convert 'SignupDate' column in the customers DataFrame to datetime objects.

transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])  # This conversion is necessary for accurate date calculations and filtering.


In [5]:
# Group transactions by 'CustomerID' & aggregate the data.
# Calculate sum, mean, & count of 'TotalValue' for each customer.
# Calculate sum of 'Quantity' for each customer.

customer_transactions = transactions_df.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean', 'count'],
    'Quantity': 'sum'}).reset_index()

customer_transactions.columns = ['CustomerID', 'TotalSpending', 'AverageSpending', 'TransactionCount', 'TotalQuantity']


In [6]:
# Merge customer profiles with customers data

customer_profiles = pd.merge(customers_df, customer_transactions, on='CustomerID', how='left').fillna(0)


In [7]:
# Encode categorical data 

customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])


In [8]:
# Scale numerical features

scaler = StandardScaler()
numerical_features = ['TotalSpending', 'AverageSpending', 'TransactionCount', 'TotalQuantity']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])


In [9]:
# Compute the cosine similarity between customer profiles

similarities = cosine_similarity(customer_profiles[numerical_features + list(customer_profiles.columns[5:])])
similarities_df = pd.DataFrame(similarities, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])


In [10]:
# Generate recommendations

lookalike = {}
for cust_id in customer_profiles['CustomerID']:
    similar_customers = similarities_df[cust_id].sort_values(ascending=False).index[1:4]
    scores = similarities_df[cust_id].sort_values(ascending=False)[1:4]
    lookalike[cust_id] = list(zip(similar_customers, scores))


In [11]:
# Create Lookalike.csv file

lookalike_df = pd.DataFrame.from_dict(lookalike, orient='index')

lookalike_df.to_csv('Lookalike.csv', header=False)


In [12]:
# Display the top 3 lookalikes for first 20 customers

for cust_id in customer_profiles['CustomerID'][:20]:
    print(f"CustomerID: {cust_id}, Lookalikes: {lookalike[cust_id]}")

CustomerID: C0001, Lookalikes: [('C0137', 0.9775593421705276), ('C0107', 0.9660981110817161), ('C0191', 0.932692486075793)]
CustomerID: C0002, Lookalikes: [('C0142', 0.9748396492131094), ('C0043', 0.9543209037226013), ('C0088', 0.929764143050988)]
CustomerID: C0003, Lookalikes: [('C0133', 0.8482394430385068), ('C0190', 0.8193027937100822), ('C0052', 0.7641516214061519)]
CustomerID: C0004, Lookalikes: [('C0113', 0.9919831325606134), ('C0165', 0.9867413425381364), ('C0102', 0.9754527861886699)]
CustomerID: C0005, Lookalikes: [('C0159', 0.9959183324911365), ('C0123', 0.9890116006667962), ('C0186', 0.9848296329665001)]
CustomerID: C0006, Lookalikes: [('C0168', 0.9769246041520829), ('C0187', 0.8983433828756263), ('C0158', 0.8903363665399748)]
CustomerID: C0007, Lookalikes: [('C0140', 0.9856485193635803), ('C0080', 0.9677590962178623), ('C0110', 0.964727389608303)]
CustomerID: C0008, Lookalikes: [('C0109', 0.9210320015533267), ('C0084', 0.9170504836555459), ('C0194', 0.9129184183036483)]
Cus