In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Convert 'SignupDate' and 'TransactionDate' to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Merge datasets: Transactions with Customers and Products.
# The result keeps one row per transaction (left joins on the transaction table).
merged_data = transactions.merge(customers, on="CustomerID", how="left")
merged_data = merged_data.merge(products, on="ProductID", how="left")

# Feature Engineering
# 1. Days since Signup
merged_data['DaysSinceSignup'] = (pd.to_datetime('today') - merged_data['SignupDate']).dt.days

# 2. One-hot encode Region and Product Category.
# Each column gets its own encoder: re-fitting a single shared encoder overwrites
# `categories_`, so the Region column names would be read from the Category fit.
region_encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False -> dense matrix
category_encoder = OneHotEncoder(sparse_output=False)
region_encoded = region_encoder.fit_transform(merged_data[['Region']])
category_encoded = category_encoder.fit_transform(merged_data[['Category']])

# Column names come from the encoder that actually produced each matrix
region_columns = [f"Region_{region}" for region in region_encoder.categories_[0]]
category_columns = [f"Category_{category}" for category in category_encoder.categories_[0]]

# Create DataFrames for the encoded features, sharing merged_data's index so the
# column-wise concat below aligns row-for-row.
region_encoded_df = pd.DataFrame(region_encoded, columns=region_columns, index=merged_data.index)
category_encoded_df = pd.DataFrame(category_encoded, columns=category_columns, index=merged_data.index)

# Concatenate all features into one dataframe
merged_data_encoded = pd.concat([merged_data, region_encoded_df, category_encoded_df], axis=1)

# Select the features for similarity calculation
# NOTE(review): DaysSinceSignup is left unscaled next to 0/1 indicator columns, so it
# likely dominates the cosine similarity -- consider normalising it.
features = ['DaysSinceSignup'] + region_columns + category_columns
merged_data_features = merged_data_encoded[features]

# 3. Calculate Cosine Similarity Matrix (rows/columns are transactions, not customers)
cosine_sim = cosine_similarity(merged_data_features)

# Create a function to get top 3 lookalikes for a given customer
def get_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    ``merged_data`` holds one row per transaction and ``cosine_sim`` is the
    row-by-row similarity matrix over those rows. The customer's first
    transaction row is used as the reference row. The customer's own rows are
    excluded and the remaining rows are collapsed to one score per other
    customer (the best-matching row), so a result never contains the queried
    customer or the same lookalike twice.

    Args:
        customer_id: Value to look up in ``merged_data['CustomerID']``.
        top_n: Maximum number of lookalikes to return.

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples ordered by
        descending score; ties are ordered by CustomerID. Empty when the
        customer has no rows in ``merged_data``.
    """
    is_own_row = (merged_data['CustomerID'] == customer_id).to_numpy()
    own_positions = np.flatnonzero(is_own_row)
    if own_positions.size == 0:
        # Customer has no transactions, so there is no feature row to compare.
        return []

    # Similarity of the reference row to every row, labelled by the owning customer.
    row_scores = pd.Series(
        cosine_sim[own_positions[0]],
        index=merged_data['CustomerID'].to_numpy(),
    )

    # Drop the customer's own transactions explicitly instead of assuming the
    # self-match sorts first (ties at 1.0 make that position unreliable).
    other_scores = row_scores[~is_own_row]

    # One entry per customer: keep the highest score among their rows.
    # groupby sorts the keys, and the stable sort keeps that order for ties.
    best_per_customer = other_scores.groupby(level=0).max()
    ranked = best_per_customer.sort_values(ascending=False, kind="stable")

    return list(ranked.head(top_n).items())

# Create the lookalike mapping for the first 20 customers
lookalike_mapping = {
    customer_id: get_lookalikes(customer_id)
    for customer_id in customers['CustomerID'][:20]
}

# Prepare the Lookalike CSV content
# Format: [(Lookalike1_ID, Score), (Lookalike2_ID, Score), (Lookalike3_ID, Score)]
# Scores are cast to builtin float before stringifying: under NumPy 2.x the repr of a
# NumPy scalar is "np.float64(...)", which would otherwise leak into the CSV text.
lookalike_list = [
    [
        customer_id,
        str([(lookalike_id, float(score)) for lookalike_id, score in lookalikes]),
    ]
    for customer_id, lookalikes in lookalike_mapping.items()
]

# Save the lookalike mapping to a CSV file
lookalike_df = pd.DataFrame(lookalike_list, columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Lookalike.csv", index=False)

# Output the Lookalike CSV file location
print("Lookalike.csv file has been generated.")


Lookalike.csv file has been generated.
