In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Read the three input tables (customers, products, transactions) from the
# working directory, in that order
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Parse the date columns into pandas datetimes so date arithmetic works later
for frame, date_column in (
    (customers, "SignupDate"),
    (transactions, "TransactionDate"),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [4]:
# Join every transaction with its customer row, then with its product row
# (default inner joins: rows without a match on either key are dropped)
transactions_with_customers = transactions.merge(customers, on="CustomerID")
merged_data = transactions_with_customers.merge(products, on="ProductID")

In [5]:
# Summarise each customer's purchase history into one row.
# Keys are the output column names; values are (source column, aggregation).
aggregation_spec = {
    "TotalSpent": ("TotalValue", "sum"),
    "AvgSpent": ("TotalValue", "mean"),
    "TotalQuantity": ("Quantity", "sum"),
    "NumTransactions": ("TransactionID", "nunique"),
}
per_customer = merged_data.groupby("CustomerID")
customer_transactions = per_customer.agg(**aggregation_spec).reset_index()

In [6]:
# Build one profile row per customer: the customer table plus transaction aggregates
transaction_columns = ["TotalSpent", "AvgSpent", "TotalQuantity", "NumTransactions"]
customer_profiles = customers.merge(customer_transactions, on="CustomerID", how="left")

# The left join leaves NaN aggregates for customers with no transactions; zero
# only those columns. A frame-wide fillna(0) would also write 0 into any missing
# Region or SignupDate value, corrupting the categorical / datetime columns.
customer_profiles[transaction_columns] = customer_profiles[transaction_columns].fillna(0)

# Account age in whole days.
# NOTE(review): this uses the wall clock, so the raw day counts differ between
# runs; pin a reference date if this column must be reproducible.
customer_profiles["DaysSinceSignup"] = (pd.Timestamp.now() - customer_profiles["SignupDate"]).dt.days

In [7]:
# Columns fed to the lookalike model, split by how they are treated downstream:
# categorical profile columns and numeric behaviour columns
profile_features = [
    "Region",
]
numeric_features = [
    "TotalSpent",
    "AvgSpent",
    "TotalQuantity",
    "NumTransactions",
    "DaysSinceSignup",
]

In [8]:
# Column-wise preprocessing: standard-scale the numeric columns and
# one-hot encode the categorical ones
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(), profile_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

In [9]:
# Wrap the preprocessor in a single-step pipeline, fit it on the customer
# profiles and turn each profile row into a feature vector
pipeline_steps = [("preprocessor", preprocessor)]
feature_pipeline = Pipeline(pipeline_steps)
customer_feature_matrix = feature_pipeline.fit_transform(customer_profiles)

In [10]:
# Pairwise cosine similarity between the feature vectors of all customers
similarity_matrix = cosine_similarity(X=customer_feature_matrix)

# Dictionary that collects the lookalike results, keyed by customer ID
lookalikes = dict()

In [11]:
# Find the top 3 most similar customers for each of the first 20 customers
# (rows 0-19 of customer_profiles, expected to be C0001 - C0020)
N_TARGET_CUSTOMERS = 20
TOP_N = 3

customer_ids = customer_profiles["CustomerID"].to_numpy()

# Never index past the end of the table if there are fewer than 20 customers
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids[i]
    similarity_scores = similarity_matrix[i]

    # Rank everyone from most to least similar, then drop the customer's own row
    # by index. Skipping "position 0" is not safe: if another customer has an
    # identical feature vector (or rounding puts them ahead), the customer
    # themself is not guaranteed to sort first.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    top_indices = ranked_indices[ranked_indices != i][:TOP_N]

    # Store plain Python floats so the scores stringify as "0.953" rather than
    # "np.float64(0.953)" when the results are written out
    top_customers = [
        (customer_ids[idx], round(float(similarity_scores[idx]), 3))
        for idx in top_indices
    ]

    lookalikes[customer_id] = top_customers


In [15]:
# Convert the lookalike results into a DataFrame with one row per target customer.
# Each list of (customer ID, score) pairs is serialised as a string. Scores are
# coerced to built-in floats first, because NumPy >= 2 renders its scalars as
# "np.float64(...)" inside a list repr, which would end up verbatim in the CSV.
lookalike_df = pd.DataFrame(
    {
        "CustomerID": list(lookalikes.keys()),
        "Lookalikes": [
            str([(match_id, float(score)) for match_id, score in matches])
            for matches in lookalikes.values()
        ],
    }
)

# Save the lookalike results to Lookalike.csv
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model completed. Results saved to 'Lookalike.csv'")

Lookalike Model completed. Results saved to 'Lookalike.csv'
