In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

# Step 1: Load the datasets
# The three CSV files are read in a fixed order from the current working
# directory and bound to the module-level frames used by the functions below.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Step 2: Feature Engineering
def create_customer_features():
    """Build one scaled, one-hot-encoded feature row per customer.

    Reads the module-level ``customers_df``, ``products_df`` and
    ``transactions_df`` frames. As a side effect, the ``TransactionDate``
    and ``SignupDate`` columns of those frames are converted to datetime
    in place (values that cannot be parsed become ``NaT``).

    Returns:
        A DataFrame with ``CustomerID``, six standardized numeric columns
        (spend, transaction and quantity aggregates plus the number of days
        between first and last purchase) and one-hot columns for ``Region``
        and the customer's most frequent product ``Category``.
    """
    # Parse the date columns on the shared frames
    for frame, date_column in (
        (transactions_df, 'TransactionDate'),
        (customers_df, 'SignupDate'),
    ):
        frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce')

    def most_frequent(values):
        # First mode of the group, or None when the group has no mode
        modes = values.mode()
        return None if modes.empty else modes.iloc[0]

    # Consolidated view: transactions joined to their customer and product
    consolidated = transactions_df.merge(customers_df, on='CustomerID')
    consolidated = consolidated.merge(products_df, on='ProductID')

    # Per-customer metrics, named directly instead of renaming afterwards
    per_customer = consolidated.groupby('CustomerID').agg(
        total_spend=('TotalValue', 'sum'),
        avg_transaction=('TotalValue', 'mean'),
        transaction_count=('TotalValue', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        favorite_category=('Category', most_frequent),
        first_purchase=('TransactionDate', 'min'),
        last_purchase=('TransactionDate', 'max'),
    ).reset_index()

    # Customer lifetime in days; the two date columns are only needed here,
    # so they are popped off the frame while computing it
    first_seen = pd.to_datetime(per_customer.pop('first_purchase'))
    last_seen = pd.to_datetime(per_customer.pop('last_purchase'))
    per_customer['customer_lifetime'] = (last_seen - first_seen).dt.days.fillna(0)

    # Attach the region, then one-hot encode both categorical columns
    region_lookup = customers_df[['CustomerID', 'Region']]
    customer_features = pd.get_dummies(
        per_customer.merge(region_lookup, on='CustomerID'),
        columns=['Region', 'favorite_category'],
    )

    # Standardize the numeric columns so no single metric dominates distances
    numerical_cols = [
        'total_spend', 'avg_transaction', 'transaction_count',
        'total_quantity', 'avg_quantity', 'customer_lifetime',
    ]
    customer_features[numerical_cols] = StandardScaler().fit_transform(
        customer_features[numerical_cols]
    )

    return customer_features

# Step 3: Lookalike Recommendation
def find_lookalikes(customer_features, customer_id, n_recommendations=3):
    """Return the customers most similar to ``customer_id`` by cosine similarity.

    Args:
        customer_features: DataFrame with a ``CustomerID`` column; every other
            column is treated as a numeric feature.
        customer_id: ID of the reference customer.
        n_recommendations: Maximum number of lookalikes to return.

    Returns:
        A DataFrame with columns ``similar_customer_id`` and
        ``similarity_score``, ordered from most to least similar. The frame
        is empty when ``customer_id`` is not present in ``customer_features``,
        when there is no other customer to compare against, or when
        ``n_recommendations`` is not positive.
    """
    is_target = customer_features['CustomerID'] == customer_id
    target = customer_features.loc[is_target].drop('CustomerID', axis=1)
    others = customer_features.loc[~is_target]

    # Guard the degenerate cases explicitly: an unknown customer would
    # otherwise fail with an opaque IndexError, and a count of 0 would turn
    # the slice below into "everything".
    if target.empty or others.empty or n_recommendations <= 0:
        return pd.DataFrame({
            'similar_customer_id': pd.Series(dtype=object),
            'similarity_score': pd.Series(dtype=float),
        })

    # Cast to float so one-hot (bool) columns and scaled columns form one
    # homogeneous numeric matrix; only the first matching row is the target.
    target_matrix = target.to_numpy(dtype=float)[:1]
    other_matrix = others.drop('CustomerID', axis=1).to_numpy(dtype=float)

    # Cosine similarity = 1 - cosine distance
    scores = 1 - cdist(target_matrix, other_matrix, metric='cosine')[0]

    # A NaN score (e.g. from an all-zero feature vector) must rank last;
    # a plain argsort would place NaN at the end, i.e. treat it as the best.
    ranking_key = np.where(np.isnan(scores), -np.inf, scores)
    top = np.argsort(-ranking_key, kind='stable')[:n_recommendations]

    return pd.DataFrame({
        'similar_customer_id': others['CustomerID'].to_numpy()[top],
        'similarity_score': scores[top],
    })

# Step 4: Generate Recommendations for the First 20 Customers
def generate_lookalike_recommendations(n_customers=20, n_recommendations=3):
    """Build lookalike recommendations for the first ``n_customers`` customers.

    Args:
        n_customers: How many customers, taken from the top of the
            module-level ``customers_df``, to generate recommendations for.
        n_recommendations: Number of lookalikes requested per customer.

    Returns:
        A DataFrame with columns ``similar_customer_id``,
        ``similarity_score`` and ``reference_customer_id``, holding the
        per-customer results stacked in customer order. Customers that have
        no row in the engineered feature table are skipped. The frame is
        empty when no recommendation could be produced.
    """
    # Create customer features
    customer_features = create_customer_features()

    # Only customers with a feature row can be compared; the others (e.g.
    # customers without any transaction) would make the lookup fail.
    known_ids = set(customer_features['CustomerID'])

    # Select the first customers in file order
    target_customers = customers_df['CustomerID'].iloc[:n_customers]

    recommendations = []
    for customer_id in target_customers:
        if customer_id not in known_ids:
            continue
        similar_customers = find_lookalikes(
            customer_features, customer_id, n_recommendations=n_recommendations
        )
        similar_customers['reference_customer_id'] = customer_id
        recommendations.append(similar_customers)

    # pd.concat raises ValueError on an empty list, so return an empty
    # frame with the expected columns instead.
    if not recommendations:
        return pd.DataFrame(
            columns=['similar_customer_id', 'similarity_score', 'reference_customer_id']
        )

    # Combine all recommendations into one DataFrame
    return pd.concat(recommendations, ignore_index=True)

if __name__ == "__main__":
    # Script entry point: compute the recommendations, persist them, and
    # show a short preview.
    output_path = './FirstName_LastName_Lookalike.csv'

    lookalike_recommendations = generate_lookalike_recommendations()

    # Write the full result without the positional index column
    lookalike_recommendations.to_csv(output_path, index=False)

    # Header line followed by the first 10 rows
    preview = lookalike_recommendations.head(10)
    print("Top 10 Recommendations:", preview, sep="\n")


Top 10 Recommendations:
  similar_customer_id  similarity_score reference_customer_id
0               C0048          0.941520                 C0001
1               C0181          0.849851                 C0001
2               C0190          0.722234                 C0001
3               C0088          0.800991                 C0002
4               C0056          0.789995                 C0002
5               C0029          0.781433                 C0002
6               C0151          0.718017                 C0003
7               C0176          0.705473                 C0003
8               C0110          0.667138                 C0003
9               C0165          0.970806                 C0004
