In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
# Read the three raw CSV inputs from the working directory, in the order
# customers -> transactions -> products, one DataFrame per file.
customers_data, transactions_data, products_data = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Transactions.csv", "Products.csv")
)

In [10]:
# Enrich every transaction row with its customer's and its product's columns.
# Left joins keep all transactions even when a CustomerID / ProductID has no
# match in the lookup tables (those columns are then NaN).
merged_data = (
    transactions_data
    .merge(customers_data, on="CustomerID", how="left")
    .merge(products_data, on="ProductID", how="left")
)

In [11]:
# Collapse the transaction-level rows into one row per customer:
#   TotalValue -> sum and mean
#   Quantity   -> sum
#   Category   -> most frequent value (first mode; None when there is no mode)
#   Region / SignupDate -> first value in the group
aggregation_spec = {
    "TotalValue": ["sum", "mean"],
    "Quantity": "sum",
    "Category": lambda cats: None if cats.mode().empty else cats.mode()[0],
    "Region": "first",
    "SignupDate": "first",
}
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(aggregation_spec)
    .reset_index()
)


In [12]:
# Replace the column labels produced by the aggregation with flat, descriptive
# names. The assignment is positional, so the order here must match the order
# of the aggregated columns.
flat_column_names = [
    "CustomerID",
    "TotalSpending",
    "AvgSpending",
    "TotalQuantity",
    "PreferredCategory",
    "Region",
    "SignupDate",
]
customer_features.columns = flat_column_names

In [13]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each, so a column with k categories becomes k-1 indicator columns.
categorical_columns = ["PreferredCategory", "Region"]
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)

In [14]:
# Turn the signup date into a numeric feature: whole days elapsed since the
# earliest signup date in the data. The raw date column is removed afterwards.
signup_dates = pd.to_datetime(customer_features["SignupDate"])
customer_features["DaysSinceSignup"] = (signup_dates - signup_dates.min()).dt.days
customer_features = customer_features.drop(columns=["SignupDate"])

In [15]:
# Rescale the continuous features with StandardScaler so that no single
# feature dominates the similarity computation because of its units.
numerical_features = ["TotalSpending", "AvgSpending", "TotalQuantity", "DaysSinceSignup"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values

In [16]:
# Pairwise cosine similarity between all customers.
# customer_ids keeps the row order, so position i in the matrix corresponds to
# the i-th entry of customer_ids.
customer_ids = customer_features["CustomerID"]
feature_matrix = customer_features.drop(columns="CustomerID").to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)

In [17]:
# Find top 3 lookalikes for the first 20 customers.
# lookalike_map: CustomerID -> list of (lookalike CustomerID, similarity score),
# ordered from most to least similar.
#
# Work on a plain array of IDs so that the positional indices coming out of
# argsort are used positionally (indexing the Series with [] would be a
# label lookup and only works while its index is a default RangeIndex).
customer_id_values = customer_ids.to_numpy()

lookalike_map = {}
for idx, customer_id in enumerate(customer_id_values[:20]):
    similarities = similarity_matrix[idx]
    ranked_indices = np.argsort(similarities)[::-1]  # most similar first
    # Remove the customer itself by position instead of assuming it is ranked
    # first: with an exact duplicate profile (similarity 1.0) or floating-point
    # rounding, another customer can sort ahead of it, and skipping "the first
    # entry" would then keep the customer as its own lookalike.
    similar_indices = ranked_indices[ranked_indices != idx][:3]
    # Store scores as built-in floats so that the later str() of this list is
    # the same under every NumPy version (NumPy >= 2 renders numpy scalars as
    # "np.float64(...)").
    lookalike_map[customer_id] = [
        (customer_id_values[i], float(similarities[i])) for i in similar_indices
    ]

# Flatten the mapping into a two-column table (one row per customer, the
# lookalike list stored as its string representation) and write it to disk.
lookalike_rows = [
    (source_id, str(matches)) for source_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed. Results saved to Lookalike.csv.")


Lookalike model completed. Results saved to Lookalike.csv.


In [18]:
# Read the exported file back in to check what was actually written to disk.
data = pd.read_csv(filepath_or_buffer="Lookalike.csv")

In [19]:
# Preview the first five rows of the reloaded table.
data.head(n=5)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0184', 0.9261277984486864), ('C0112', 0.88..."
1,C0002,"[('C0106', 0.9184655240349032), ('C0029', 0.91..."
2,C0003,"[('C0052', 0.9125768238525747), ('C0195', 0.86..."
3,C0004,"[('C0165', 0.9814788386801848), ('C0169', 0.94..."
4,C0005,"[('C0007', 0.9015156687755806), ('C0112', 0.87..."
