In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the customer and product lookup tables (paths are relative to the
# current working directory).
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")

# Build the unified view: start from the raw transactions and left-join the
# customer columns (on CustomerID) and then the product columns (on ProductID),
# so every transaction row is kept even if a lookup has no match.
transactions = (
    pd.read_csv("Transactions.csv")
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

# Collapse the merged transactions into one profile row per customer.
profile_aggregations = {
    "TotalValue": "sum",  # total spending
    "Quantity": "sum",  # total items purchased
    "Category": list,  # every purchased category, duplicates kept
    "Region": "first",  # customer region, taken from the group's "first" value
}
customer_profiles = (
    transactions.groupby("CustomerID").agg(profile_aggregations).reset_index()
)

# One-hot encode the customer region. sparse_output=False is requested so the
# result can be wrapped directly in a DataFrame below.
encoder = OneHotEncoder(sparse_output=False)
region_encoded = encoder.fit_transform(customer_profiles[["Region"]])
region_features = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customer_profiles.index,
)

# Multi-hot encode the purchased categories: one 0/1 column per category,
# set to 1 when the customer bought from it at least once.
# The lists are exploded to one row per purchase, encoded, and collapsed back
# to one row per customer with max(). Compared with joining on "," and
# splitting again, this does not break on category names that contain a comma
# and does not raise when a category is missing (NaN after the left merge);
# a missing category simply contributes no indicator.
category_encoded = (
    pd.get_dummies(customer_profiles["Category"].explode(), dtype=int)
    .groupby(level=0)
    .max()
)

# Standardise the two numeric columns so neither dominates the similarity
# purely because of its scale.
numeric_columns = ["TotalValue", "Quantity"]
scaler = StandardScaler()
numeric_features = pd.DataFrame(
    scaler.fit_transform(customer_profiles[numeric_columns]),
    columns=numeric_columns,
    index=customer_profiles.index,
)

# Combine all features; every frame shares customer_profiles' index, so the
# rows line up one-to-one with customer_profiles.
features = pd.concat(
    [numeric_features, region_features, category_encoded],
    axis=1,
)

# Pairwise cosine similarity between customer feature vectors; entry [i, j]
# compares row i with row j of `features`.
similarity_matrix = cosine_similarity(features)

# How many customers to report on, and how many lookalikes to keep for each.
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

top_customers = customer_profiles["CustomerID"][:N_TARGET_CUSTOMERS]
lookalike_map = {}

# `top_customers` is the leading slice of `customer_profiles`, so the
# enumerate position is also that customer's row in `similarity_matrix`.
for idx, cust_id in enumerate(top_customers):
    # Rank all customers by descending similarity. The stable sort makes ties
    # deterministic (lowest row position first).
    ranked = np.argsort(-similarity_matrix[idx], kind="stable")
    # Drop the customer's own row explicitly rather than assuming it is ranked
    # first: another customer with an identical feature vector (or float
    # round-off) can tie with or outrank the self-similarity, which would
    # otherwise list the customer as their own lookalike.
    similar_indices = ranked[ranked != idx][:N_LOOKALIKES]
    similar_customers = customer_profiles.iloc[similar_indices]["CustomerID"].values
    similarity_scores = similarity_matrix[idx][similar_indices]
    # Map: customer id -> [(lookalike id, similarity score), ...], best first.
    lookalike_map[cust_id] = list(zip(similar_customers, similarity_scores))

# Saving the results to the file - Lookalike.csv


def _to_builtin(value):
    """Return NumPy scalars as plain Python values; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value


# The "Lookalikes" column holds a list of dicts per customer, which to_csv
# writes as the list's string form. NumPy scalars are converted to built-in
# types first so the text contains plain numbers (e.g. 0.93) instead of
# wrappers such as np.float64(0.93), which NumPy >= 2 emits for scalar reprs.
lookalike_df = pd.DataFrame(
    {
        "CustomerID": list(lookalike_map),
        "Lookalikes": [
            [
                {"CustomerID": _to_builtin(cid), "Score": float(score)}
                for cid, score in matches
            ]
            for matches in lookalike_map.values()
        ],
    }
)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created successfully!")


Lookalike.csv has been created successfully!
