In [1]:
# Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')


# Merge datasets
transactions_products = pd.merge(transactions, products, on="ProductID")
customer_data = pd.merge(transactions_products, customers, on="CustomerID")

# Aggregate customer profiles
customer_profiles = customer_data.groupby("CustomerID").agg({
    "TotalValue": "sum",             # Total spending by customer
    "Quantity": "sum",               # Total quantity purchased by customer
    "Price_y": "mean",               # Average price of products bought
    "Category": lambda x: x.mode()[0],  # Most common product category
}).reset_index()

# Rename columns for clarity
customer_profiles.rename(columns={"Price_y": "AvgPrice"}, inplace=True)

# Add region information from customers dataset
customer_profiles = pd.merge(customer_profiles, customers[["CustomerID", "Region"]], on="CustomerID")

# One-hot encode categorical features (e.g., Region, Category)
customer_profiles_encoded = pd.get_dummies(customer_profiles, columns=["Region", "Category"], drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ["TotalValue", "Quantity", "AvgPrice"]
customer_profiles_encoded[numerical_features] = scaler.fit_transform(customer_profiles_encoded[numerical_features])

# Compute pairwise similarity
similarity_matrix = cosine_similarity(customer_profiles_encoded.drop("CustomerID", axis=1))

# Create a DataFrame for similarity
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles["CustomerID"], columns=customer_profiles["CustomerID"])

# Recommend top 3 similar customers for each of the first 20 customers
top_20_customers = customer_profiles["CustomerID"][:20]
lookalike_map = {}

for customer_id in top_20_customers:
    # Sort similarity scores in descending order, exclude the customer itself
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).drop(customer_id).head(3)
    lookalike_map[customer_id] = list(similar_customers.items())

# Save results to Lookalike.csv
lookalike_results = []

for cust_id, recommendations in lookalike_map.items():
    for rec_cust_id, score in recommendations:
        lookalike_results.append({"CustomerID": cust_id, "SimilarCustomerID": rec_cust_id, "SimilarityScore": score})

lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations have been saved to Lookalike.csv!")
# Print the lookalike recommendations
for cust_id, recommendations in lookalike_map.items():
    print(f"CustomerID: {cust_id}")
    for rec_cust_id, score in recommendations:
        print(f"  SimilarCustomerID: {rec_cust_id}, SimilarityScore: {score:.4f}")
    print("-" * 50)

Lookalike recommendations have been saved to Lookalike.csv!
CustomerID: C0001
  SimilarCustomerID: C0181, SimilarityScore: 0.9347
  SimilarCustomerID: C0120, SimilarityScore: 0.8975
  SimilarCustomerID: C0184, SimilarityScore: 0.8641
--------------------------------------------------
CustomerID: C0002
  SimilarCustomerID: C0088, SimilarityScore: 0.9848
  SimilarCustomerID: C0077, SimilarityScore: 0.9007
  SimilarCustomerID: C0144, SimilarityScore: 0.8962
--------------------------------------------------
CustomerID: C0003
  SimilarCustomerID: C0031, SimilarityScore: 0.8639
  SimilarCustomerID: C0025, SimilarityScore: 0.8622
  SimilarCustomerID: C0052, SimilarityScore: 0.8448
--------------------------------------------------
CustomerID: C0004
  SimilarCustomerID: C0165, SimilarityScore: 0.9826
  SimilarCustomerID: C0153, SimilarityScore: 0.9128
  SimilarCustomerID: C0169, SimilarityScore: 0.9081
--------------------------------------------------
CustomerID: C0005
  SimilarCustomerID: C