In [1]:
#!pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [4]:
# One-hot encode the "Region" column
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customers[["Region"]]).toarray()
# Build the indicator frame on customers' own index: pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index would misalign rows (and introduce NaN)
# whenever `customers` is not on a default RangeIndex.
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customers.index,
)
# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate Region_* columns.
customers = pd.concat(
    [customers.drop(columns=region_df.columns, errors="ignore"), region_df],
    axis=1,
)


In [5]:
# Aggregate transaction data: sum Quantity and TotalValue for every
# (CustomerID, ProductID) pair, then move ProductID into the columns so each
# customer becomes a single row; pairs that never occur are filled with 0.
transaction_summary = (
    transactions
    .groupby(["CustomerID", "ProductID"])[["Quantity", "TotalValue"]]
    .sum()
    .unstack(fill_value=0)
)

In [6]:
# Flatten the multi-level column index into "<level0>_<level1>" labels,
# then turn the row index back into an ordinary column.
flat_labels = []
for col in transaction_summary.columns:
    flat_labels.append("{}_{}".format(col[0], col[1]))
transaction_summary.columns = flat_labels
transaction_summary = transaction_summary.reset_index()


In [7]:
# Merge customer profiles with transaction data.
# Both tables are keyed on CustomerID so the join lines rows up per customer.
customer_profiles = customers.set_index(keys="CustomerID")
transaction_summary = transaction_summary.set_index(keys="CustomerID")

# Left join keeps every row of customer_profiles; any missing value in the
# joined result is then replaced with 0.
full_data = customer_profiles.join(other=transaction_summary, how="left")
full_data = full_data.fillna(value=0)


In [8]:
# Pairwise cosine similarity across every row of the feature table
def calculate_cosine_similarity(full_data):
    """Return the pairwise cosine-similarity matrix for the rows of `full_data`.

    Only the columns selected by ``select_dtypes(include=[np.number])`` take
    part in the computation; every other column is ignored. The result is
    whatever ``cosine_similarity`` returns for that numeric sub-frame.
    """
    numeric_features = full_data.select_dtypes(include=[np.number])
    return cosine_similarity(numeric_features)


In [9]:
# Compute lookalikes for the first `n_customers` customers (20 by default)
def compute_top_lookalikes(full_data, top_n=3, n_customers=20):
    """Map each of the leading customers to its most similar other customers.

    Parameters
    ----------
    full_data : DataFrame
        Feature table passed to ``calculate_cosine_similarity``; its index
        supplies the customer ids used as keys and as lookalike ids.
    top_n : int, default 3
        Number of lookalikes kept per customer.
    n_customers : int or None, default 20
        How many rows, taken from the start of ``full_data.index``, get a
        lookalike list. ``None`` processes every row.

    Returns
    -------
    dict
        ``{customer_id: [(other_id, score), ...]}`` with each list sorted by
        descending similarity score and truncated to ``top_n`` entries.
    """
    similarity_matrix = calculate_cosine_similarity(full_data)
    lookalike_map = {}
    customer_ids = full_data.index
    target_ids = customer_ids if n_customers is None else customer_ids[:n_customers]

    for i, cust_id in enumerate(target_ids):
        # Pair every *other* customer with its similarity to customer i;
        # the i != j filter keeps a customer from matching itself.
        similarities = [
            (customer_ids[j], similarity_matrix[i, j])
            for j in range(len(customer_ids))
            if i != j
        ]
        # Sort by similarity score and get the top N lookalikes
        top_lookalikes = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]
        lookalike_map[cust_id] = top_lookalikes

    return lookalike_map

In [10]:
# Generate the lookalike map, keeping three lookalikes per customer
# (top_n=3 is also the function's default).
lookalike_map = compute_top_lookalikes(full_data, top_n=3)

In [11]:
# Convert to the required format: one single-key dict per customer,
# {customer_id: [(other_id, score rounded to 4 decimals), ...]}
lookalike_list = []
for cust_id, lookalikes in lookalike_map.items():
    # Cast each score to a built-in float before rounding. round() on a NumPy
    # scalar returns a NumPy scalar, whose repr under NumPy >= 2 is
    # "np.float64(...)"; that repr would leak into the CSV when these dicts
    # are written out as strings.
    lookalike_str = {cust_id: [(other_id, round(float(score), 4)) for other_id, score in lookalikes]}
    lookalike_list.append(lookalike_str)

In [12]:
# Save to lookalike.csv: a single 'lookalikeMap' column, one row per customer,
# written without the DataFrame index.
lookalike_df = pd.DataFrame(data={'lookalikeMap': lookalike_list})
lookalike_df.to_csv(path_or_buf='lookalike.csv', index=False)