In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
# Each table is fetched as a CSV from a Google Drive direct-download link.
customers_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
products_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
transactions_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

customers = pd.read_csv(customers_url)
products = pd.read_csv(products_url)
transactions = pd.read_csv(transactions_url)

# Quick look at the data
# One print per table: the heading, then the first five rows on the next line.
print("Customers Data:", customers.head(), sep="\n")
print("\nProducts Data:", products.head(), sep="\n")
print("\nTransactions Data:", transactions.head(), sep="\n")

# Merge data for comprehensive analysis
# Attach the customer profile and then the product details to every transaction.
# Both joins use the pandas default (inner), so a transaction is kept only when
# its CustomerID and ProductID are both found in the lookup tables.
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

# Data Cleaning
# Parse the date columns into datetime values.
for date_column in ('TransactionDate', 'SignupDate'):
    merged_data[date_column] = pd.to_datetime(merged_data[date_column])

# Feature Engineering for Lookalike Model
# One row per customer summarising their purchases. Customers with no rows in
# merged_data (i.e. no matched transactions) do not appear here.
customer_features = merged_data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    unique_products=('ProductID', 'nunique'),
).reset_index()

# Merge with customer profile information
customer_features = pd.merge(customer_features, customers, on="CustomerID")

# Encode categorical data
# drop_first=True leaves one region as the implicit baseline.
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Standardize features
# The profile merge above brings in every column of `customers`, not only Region.
# Any remaining text column (presumably the raw SignupDate strings, which were
# parsed on merged_data only) would make StandardScaler fail with
# "could not convert string to float". Keep numeric/boolean columns only and cast
# them to float so the scaler always receives a homogeneous numeric matrix.
feature_matrix = (
    customer_features
    .drop(columns=['CustomerID', 'CustomerName'])
    .select_dtypes(include=['number', 'bool'])
    .astype(float)
)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)

# Compute similarity
# Square customer-by-customer matrix, labelled by CustomerID on both axes so a
# customer's scores can be looked up by ID.
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Generate top 3 lookalikes for each customer
# Only the first 20 rows of customer_features are scored (intended to be
# C0001 - C0020; this relies on customer_features being ordered by CustomerID).
lookalike_results = {}
for customer_id in customer_features['CustomerID'][:20]:
    # Exclude the customer's own entry by label rather than by position: with
    # tied scores (duplicate feature rows, floating-point rounding) or an
    # all-zero feature row, "self" is not guaranteed to sort first, so slicing
    # [1:4] could return the customer as their own lookalike.
    other_customers = similarity_df[customer_id].drop(labels=customer_id)
    similar_customers = other_customers.nlargest(3)
    lookalike_results[customer_id] = list(similar_customers.items())

# Save results to CSV
# Flatten to one row per (customer, lookalike) pair.
lookalike_output = []
for cust_id, lookalikes in lookalike_results.items():
    for similar_cust, score in lookalikes:
        lookalike_output.append({
            'CustomerID': cust_id,
            'SimilarCustomerID': similar_cust,
            'SimilarityScore': score,
        })

lookalike_df = pd.DataFrame(lookalike_output)
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Print sample results
print(lookalike_df.head())

# Visualizations
# 1. Heatmap of Similarity Matrix
# Plot the top-left 20x20 corner of the matrix, labelling both axes with the
# matching first 20 customer IDs.
heatmap_ids = customer_features['CustomerID'][:20]
heatmap_values = similarity_matrix[:20, :20]

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(
    heatmap_values,
    cmap='viridis',
    xticklabels=heatmap_ids,
    yticklabels=heatmap_ids,
)
heatmap_ax.set_title("Customer Similarity Heatmap (First 20 Customers)")
heatmap_ax.set_xlabel("Customer ID")
heatmap_ax.set_ylabel("Customer ID")
plt.show()