In [5]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Read the three source tables (customers, products, transactions) from CSV
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Parse the date columns so they can be used in datetime arithmetic
customers_df['SignupDate'] = customers_df['SignupDate'].pipe(pd.to_datetime)
transactions_df['TransactionDate'] = transactions_df['TransactionDate'].pipe(pd.to_datetime)

def create_customer_features(customers_df, transactions_df, products_df, reference_date=None):
    """
    Create feature matrix for customers based on their profile and transaction history.

    Only customers that have at least one transaction whose ProductID is
    present in ``products_df`` get a row, because the transaction/product
    merge is an inner join and every other table is left-joined onto it.

    Args:
        customers_df: Customer table with ``CustomerID``, ``Region`` and a
            datetime ``SignupDate`` column.
        transactions_df: Transaction table with ``TransactionID``,
            ``CustomerID``, ``ProductID``, ``Quantity``, ``TotalValue`` and a
            datetime ``TransactionDate`` column.
        products_df: Product table with ``ProductID`` and ``Category``.
        reference_date: Point in time the "days since ..." features are
            measured from. Anything accepted by ``pd.Timestamp``; defaults to
            the current time, which matches the previous behaviour.

    Returns:
        DataFrame indexed by ``CustomerID`` with, in order: transaction
        counts and spend metrics, one ``pct_spend_<category>`` column per
        category, ``days_since_last_purchase``, ``customer_age_days``, one
        ``region_<name>`` dummy column per region, and ``days_since_signup``.
    """
    # Resolve the reference instant once so every "days since" feature is
    # measured from the same moment and callers can make runs reproducible.
    if reference_date is None:
        reference_date = pd.Timestamp.now()
    else:
        reference_date = pd.Timestamp(reference_date)

    # Merge transactions with products (inner join on ProductID)
    trans_prod = transactions_df.merge(products_df, on='ProductID')
    purchases = trans_prod.groupby('CustomerID')

    # Basic transaction metrics. Named aggregation ties each output name to
    # its source column instead of relying on positional column renaming.
    transaction_metrics = purchases.agg(
        transaction_count=('TransactionID', 'count'),       # Number of transactions
        total_items=('Quantity', 'sum'),                    # Total items bought
        avg_items_per_transaction=('Quantity', 'mean'),     # Average items per transaction
        total_spend=('TotalValue', 'sum'),                  # Total spend
        avg_order_value=('TotalValue', 'mean'),             # Average order value
    )

    # Category preferences (share of spend in each category)
    category_spend = trans_prod.pivot_table(
        index='CustomerID',
        columns='Category',
        values='TotalValue',
        aggfunc='sum',
        fill_value=0
    )

    # A customer whose total spend is zero would produce 0/0 = NaN shares;
    # mask the zero totals and report a 0 share for every category instead.
    spend_totals = category_spend.sum(axis=1)
    category_spend_pct = category_spend.div(
        spend_totals.where(spend_totals != 0), axis=0
    ).fillna(0)
    category_spend_pct.columns = [
        f'pct_spend_{str(col).lower()}' for col in category_spend_pct.columns
    ]

    # Combine all features
    customer_features = transaction_metrics.join(category_spend_pct)

    # Customers missing from the pivot (e.g. all their products lack a
    # category) would otherwise carry NaN shares after the left join.
    pct_columns = list(category_spend_pct.columns)
    if pct_columns:
        customer_features[pct_columns] = customer_features[pct_columns].fillna(0)

    # Recency and purchase-span features. Note that 'customer_age_days' is
    # the span between a customer's first and last transaction.
    last_transaction = purchases['TransactionDate'].max()
    first_transaction = purchases['TransactionDate'].min()
    customer_features['days_since_last_purchase'] = (reference_date - last_transaction).dt.days
    customer_features['customer_age_days'] = (last_transaction - first_transaction).dt.days

    # Add region as dummy variables
    customer_profiles = customers_df.set_index('CustomerID')
    region_dummies = pd.get_dummies(customer_profiles['Region'], prefix='region')
    customer_features = customer_features.join(region_dummies)

    # Add signup date features (aligned to the feature index on assignment)
    customer_features['days_since_signup'] = (
        reference_date - customer_profiles['SignupDate']
    ).dt.days

    return customer_features

def get_lookalikes(customer_id, scaled_features_df, n_recommendations=3):
    """
    Find similar customers based on cosine similarity.

    Args:
        customer_id: Index label of the customer to find lookalikes for.
        scaled_features_df: Feature matrix indexed by customer ID, one row
            per customer.
        n_recommendations: Maximum number of lookalikes to return.

    Returns:
        List of ``(customer_id, similarity_score)`` tuples ordered from most
        to least similar. The queried customer is never included; fewer than
        ``n_recommendations`` tuples are returned when there are not enough
        other customers.

    Raises:
        KeyError: If ``customer_id`` is not in ``scaled_features_df.index``.
    """
    # Similarity of the query row against every row in the matrix
    customer_vector = scaled_features_df.loc[customer_id].values.reshape(1, -1)
    similarities = cosine_similarity(customer_vector, scaled_features_df)[0]
    scores = pd.Series(similarities, index=scaled_features_df.index)

    # Exclude the query customer by label rather than assuming they sort
    # first: with tied scores (identical or collinear feature rows) the
    # customer is not guaranteed to occupy position 0 of the ranking.
    top_matches = scores.drop(customer_id).nlargest(n_recommendations)

    return list(zip(top_matches.index, top_matches.to_numpy()))

# Build the per-customer feature table from the three raw tables
customer_features = create_customer_features(customers_df, transactions_df, products_df)

# Standardise every feature column, then re-attach the customer index and
# column labels that the scaler's plain array output drops
scaler = StandardScaler().fit(customer_features)
scaled_features = scaler.transform(customer_features)
scaled_features_df = pd.DataFrame(
    data=scaled_features,
    columns=customer_features.columns,
    index=customer_features.index,
)

# Generate lookalikes for customers C0001..C0020; IDs that are absent from
# the feature matrix are skipped
lookalike_results = {
    customer_key: [
        {'customer_id': match_id, 'similarity_score': float(match_score)}
        for match_id, match_score in get_lookalikes(customer_key, scaled_features_df)
    ]
    for customer_key in (f'C{number:04d}' for number in range(1, 21))
    if customer_key in scaled_features_df.index
}

# One row per source customer; the lookalikes are packed into a single
# "id,score;id,score;..." string with scores rounded to four decimals
results = [
    {
        'customer_id': source_id,
        'lookalikes': ';'.join(
            f"{rec['customer_id']},{rec['similarity_score']:.4f}"
            for rec in matches
        ),
    }
    for source_id, matches in lookalike_results.items()
]

# Create and save Lookalike.csv
lookalike_df = pd.DataFrame(results)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Assemble the verification report line by line and emit it in one call
report_lines = ["Lookalike recommendations for first 20 customers:"]
for cust_id, recs in lookalike_results.items():
    report_lines.append(f"\nCustomer {cust_id}:")
    report_lines.extend(
        f"  {i}. Customer {rec['customer_id']} (Similarity: {rec['similarity_score']:.4f})"
        for i, rec in enumerate(recs, 1)
    )
print("\n".join(report_lines))

Lookalike recommendations for first 20 customers:

Customer C0001:
  1. Customer C0192 (Similarity: 0.7786)
  2. Customer C0120 (Similarity: 0.7604)
  3. Customer C0181 (Similarity: 0.7597)

Customer C0002:
  1. Customer C0159 (Similarity: 0.9344)
  2. Customer C0106 (Similarity: 0.9081)
  3. Customer C0134 (Similarity: 0.8854)

Customer C0003:
  1. Customer C0091 (Similarity: 0.7624)
  2. Customer C0163 (Similarity: 0.6903)
  3. Customer C0148 (Similarity: 0.6791)

Customer C0004:
  1. Customer C0113 (Similarity: 0.9241)
  2. Customer C0165 (Similarity: 0.8709)
  3. Customer C0188 (Similarity: 0.8068)

Customer C0005:
  1. Customer C0007 (Similarity: 0.9033)
  2. Customer C0140 (Similarity: 0.7592)
  3. Customer C0088 (Similarity: 0.7052)

Customer C0006:
  1. Customer C0187 (Similarity: 0.7058)
  2. Customer C0169 (Similarity: 0.6731)
  3. Customer C0171 (Similarity: 0.6473)

Customer C0007:
  1. Customer C0005 (Similarity: 0.9033)
  2. Customer C0140 (Similarity: 0.7719)
  3. Custom