In [1]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the three raw input tables from the working directory
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Enrich every transaction row with its customer and product attributes
merged_data = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(merged_data, products, on="ProductID")

# Collapse to one row per customer: total spend and total units bought
customer_summary = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Quantity=("Quantity", "sum"),
    )
    .reset_index()
)

# Join the totals back onto the customer table and one-hot encode Region.
# NOTE: merge defaults to an inner join, so only customers that appear in
# customer_summary are kept. Other customer columns (e.g. SignupDate) are
# carried along unchanged; only Region is transformed here.
customer_profiles = pd.get_dummies(
    customers.merge(customer_summary, on="CustomerID"),
    columns=["Region"],
    drop_first=True,
)


In [3]:
# Put the transactional totals on a common scale (zero mean, unit variance)
# before they are compared; all other columns are copied over untouched.
numeric_cols = ["TotalValue", "Quantity"]
scaler = StandardScaler()
customer_profiles_scaled = customer_profiles.copy()
customer_profiles_scaled[numeric_cols] = scaler.fit_transform(
    customer_profiles.loc[:, numeric_cols]
)


In [4]:
# Pairwise cosine similarity between customers.
# Only the scaled columns listed in numeric_cols are compared; the one-hot
# Region columns present in customer_profiles_scaled are not part of the input.
similarity_matrix = cosine_similarity(customer_profiles_scaled.loc[:, numeric_cols])

# Label both axes with CustomerID so scores can be looked up by ID
customer_ids = customer_profiles["CustomerID"]
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [5]:
# Function to get the top similar customers (3 by default) for a given customer
def get_top_3_similar(customer_id, similarity=None, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        ID of the customer to look up; must be a column of ``similarity``.
    similarity : pandas.DataFrame, optional
        Similarity table whose index and columns are customer IDs.
        Defaults to the notebook-level ``similarity_df``.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    list[tuple]
        ``(other_customer_id, score)`` pairs ordered from most to least
        similar, with ``score`` rounded to 3 decimals as a plain ``float``.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in the similarity table.
    """
    if similarity is None:
        similarity = similarity_df
    if customer_id not in similarity.columns:
        raise KeyError(f"{customer_id!r} is not present in the similarity table")

    scores = similarity[customer_id]

    # Exclude the customer by label, not by position: when another customer
    # ties with (or, through float rounding, edges above) the self-similarity,
    # the customer is not guaranteed to sort first, so slicing off position 0
    # could return the customer as its own lookalike.
    candidates = scores.drop(labels=customer_id, errors="ignore")

    # A stable sort keeps the ordering of tied scores reproducible.
    top_matches = candidates.sort_values(ascending=False, kind="mergesort").head(top_n)

    # float() strips the NumPy scalar type so str() of the resulting list is
    # rendered the same way regardless of the installed NumPy version.
    return [(cust_id, round(float(score), 3)) for cust_id, score in top_matches.items()]

# Generate lookalike recommendations for the first 20 customers
first_20_customers = customers["CustomerID"].iloc[:20]

# The IDs come from the raw customer table, but the similarity table only
# contains customers that survived the inner merges upstream. A customer that
# is missing there gets an empty recommendation list instead of aborting the
# whole run with a KeyError.
lookalike_data = {
    customer_id: (
        get_top_3_similar(customer_id)
        if customer_id in similarity_df.columns
        else []
    )
    for customer_id in first_20_customers
}


In [6]:
# Flatten the {customer -> recommendations} mapping into one record per
# customer; the recommendation list is stored as its string representation.
output_rows = []
for cust_id, recommendations in lookalike_data.items():
    output_rows.append({"CustomerID": cust_id, "Lookalikes": str(recommendations)})
lookalike_output = pd.DataFrame(output_rows)

# Persist the table without the positional index column
lookalike_output.to_csv("Firstname_Lastname_Lookalike.csv", index=False)

# Preview the first five rows of what was written
print(lookalike_output.head(5))


  CustomerID                                         Lookalikes
0      C0001   [('C0085', 1.0), ('C0042', 1.0), ('C0089', 1.0)]
1      C0002   [('C0157', 1.0), ('C0166', 1.0), ('C0029', 1.0)]
2      C0003  [('C0111', 0.994), ('C0160', 0.99), ('C0147', ...
3      C0004  [('C0162', 1.0), ('C0165', 1.0), ('C0090', 0.9...
4      C0005   [('C0080', 1.0), ('C0167', 1.0), ('C0177', 1.0)]
