In [2]:
import pandas as pd


# Load the three raw input tables (paths are relative to the working directory).
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


# Re-load the three source tables so this cell starts from freshly read data.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Parse the date columns so they support the datetime arithmetic below.
customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])

# One row per transaction, enriched with customer and product attributes.
# Left joins keep every transaction even when a lookup key has no match.
merged_data = transactions.merge(customers, on="CustomerID", how="left").merge(
    products, on="ProductID", how="left"
)

# Most recent transaction date for each customer.
customer_last_transaction = (
    merged_data.groupby("CustomerID")["TransactionDate"].max().reset_index()
)

# Default (inner) join: only customers that appear in the transactions remain.
# Tenure = whole days between signup and the customer's latest transaction.
customer_features = customers.merge(customer_last_transaction, on="CustomerID").assign(
    Tenure=lambda frame: (frame["TransactionDate"] - frame["SignupDate"]).dt.days
)

In [10]:



def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or None if there is none.

    ``Series.mode()`` ignores nulls by default and returns an empty Series when
    every value is null, so indexing its result with ``[0]`` would raise. When
    several values tie, the first entry returned by ``mode()`` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Per-customer purchase behaviour, one row per CustomerID.
# NOTE(review): "Price_y" is presumably the products-side Price column, renamed by
# pandas' default merge suffixes because the transactions also carry a Price
# column -- confirm against the CSV schemas.
transaction_agg = merged_data.groupby("CustomerID").agg(
    TotalTransactions=("TransactionID", "count"),
    TotalSpend=("TotalValue", "sum"),
    AvgTransactionValue=("TotalValue", "mean"),
    FavoriteCategory=("Category", _most_frequent),
    AvgProductPrice=("Price_y", "mean"),
    UniqueProducts=("ProductID", "nunique")
).reset_index()

# Drop any aggregate columns left over from a previous run of this cell before
# merging; otherwise re-running would produce "_x"/"_y" duplicates and break the
# column lookups in later cells. On a first run nothing is dropped.
agg_columns = [col for col in transaction_agg.columns if col != "CustomerID"]
customer_features = pd.merge(
    customer_features.drop(columns=agg_columns, errors="ignore"),
    transaction_agg,
    on="CustomerID",
)

In [12]:

# One-hot encode the categorical Region column into dense indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_region = encoder.fit_transform(customer_features[["Region"]])
encoded_region_df = pd.DataFrame(
    encoded_region,
    columns=encoder.get_feature_names_out(["Region"]),
    # Reuse the source index: pd.concat(axis=1) aligns rows on index labels, so a
    # fresh RangeIndex here would misalign (and NaN-pad) rows whenever
    # customer_features does not carry the default 0..n-1 index.
    index=customer_features.index,
)

# Standardise the numeric behaviour features so no single scale dominates the
# cosine similarity computed later.
numerical_features = customer_features[["Tenure", "TotalSpend", "AvgTransactionValue", "AvgProductPrice", "UniqueProducts"]]
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(numerical_features)
scaled_numerical_df = pd.DataFrame(
    scaled_numerical,
    columns=numerical_features.columns,
    index=numerical_features.index,
)

# Final model matrix: CustomerID followed by the encoded and scaled features.
final_features = pd.concat(
    [customer_features["CustomerID"], encoded_region_df, scaled_numerical_df],
    axis=1,
)

In [14]:

# Number of lookalikes to keep for each target customer.
TOP_N_LOOKALIKES = 3

# Pairwise cosine similarity between all customers' feature vectors; row/column
# positions follow the row order of final_features.
similarity_matrix = cosine_similarity(final_features.drop(columns="CustomerID"))

# Pull the IDs out once. Looking them up positionally in a plain list avoids
# building a full row Series via final_features.iloc[...] for every candidate
# of every target customer.
customer_ids = final_features["CustomerID"].tolist()
customer_id_to_index = {cust_id: idx for idx, cust_id in enumerate(customer_ids)}


lookalike_map = {}
# Target customers are the IDs "C0001" through "C0020".
target_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]

for cust_id in target_customers:
    idx = customer_id_to_index.get(cust_id)
    if idx is None:
        # Customer is absent from the feature table; nothing to rank.
        continue

    # Rank every customer by similarity to the target, most similar first
    # (the sort is stable, so ties keep their original row order).
    ranked = sorted(
        enumerate(similarity_matrix[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Skip the target itself, then keep the best matches as (CustomerID, score).
    top_matches = [
        (customer_ids[position], round(score, 3))
        for position, score in ranked
        if customer_ids[position] != cust_id
    ][:TOP_N_LOOKALIKES]
    lookalike_map[cust_id] = top_matches

In [16]:

import csv

# Write one row per target customer: its ID plus all of its matches serialised
# into a single cell as "(ID, score), (ID, score), ...".
with open("FirstName_LastName_Lookalike.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["CustomerID", "Lookalikes"])
    writer.writerows(
        [cust_id, ", ".join(f"({other_id}, {score})" for other_id, score in matches)]
        for cust_id, matches in lookalike_map.items()
    )