In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

class LookalikeModel:
    """
    Recommends similar ("lookalike") customers from purchase behaviour.

    Typical usage is ``preprocess_data()`` -> ``calculate_similarity()`` ->
    ``get_lookalikes(customer_id)``. The later steps run the earlier ones on
    demand if they have not been executed yet.
    """

    def __init__(self, customers, transactions, products):
        """
        Store the input tables; nothing is computed here.

        The code below reads these columns:
          customers:    'CustomerID', 'SignupDate'
          transactions: 'CustomerID', 'ProductID', 'TotalValue', 'TransactionDate'
          products:     'ProductID', 'Category'
        """
        self.customers_df = customers
        self.transactions_df = transactions
        self.products_df = products
        self.feature_matrix = None      # standardized features, one row per customer row
        self.similarity_matrix = None   # pairwise cosine similarities between those rows
        self.customer_indices = None    # CustomerID -> row position in the matrices

    def preprocess_data(self):
        """
        Prepares features from customer and transaction data.

        Builds one feature row per row of ``customers_df`` (spend totals,
        transaction counts, days since signup and spend per product category),
        standardizes the columns and stores the result in ``feature_matrix``.
        Also fills ``customer_indices``.

        Note: the 'SignupDate' and 'TransactionDate' columns of the stored
        frames are converted to datetime in place.
        """
        # Convert dates to datetime
        self.customers_df['SignupDate'] = pd.to_datetime(self.customers_df['SignupDate'])
        self.transactions_df['TransactionDate'] = pd.to_datetime(self.transactions_df['TransactionDate'])

        customer_ids = self.customers_df['CustomerID']

        # Aggregate every customer's transactions in a single pass instead of
        # re-filtering the whole transaction table once per customer.
        per_customer = self.transactions_df.groupby('CustomerID').agg(
            total_spend=('TotalValue', 'sum'),
            avg_transaction_value=('TotalValue', 'mean'),
            num_transactions=('TotalValue', 'size'),
            num_unique_products=('ProductID', 'nunique'),
        )
        # Align to the customer table; customers without transactions become
        # NaN rows here and are turned into zeros by the fillna(0) below.
        feature_df = per_customer.reindex(pd.Index(customer_ids)).reset_index(drop=True)

        # One reference time for all customers so their ages are comparable.
        now = pd.Timestamp.now()
        feature_df['days_since_signup'] = (now - self.customers_df['SignupDate']).dt.days.to_numpy()

        # Category spend features: merge with the product table once, then
        # look up each (customer, category) total.
        customer_products = self.transactions_df.merge(self.products_df, on='ProductID')
        category_spend = (
            customer_products.groupby(['CustomerID', 'Category'])['TotalValue'].sum().to_dict()
        )
        for category in self.products_df['Category'].unique():
            feature_df[f'spend_{category}'] = [
                category_spend.get((cust_id, category), 0) for cust_id in customer_ids
            ]

        # Standardize features
        scaler = StandardScaler()
        self.feature_matrix = scaler.fit_transform(feature_df.fillna(0))
        self.customer_indices = {cust_id: idx for idx, cust_id in enumerate(customer_ids)}

    def calculate_similarity(self):
        """
        Compute pairwise cosine similarity for customers.

        Runs ``preprocess_data()`` first if no feature matrix exists yet.
        """
        if self.feature_matrix is None:
            self.preprocess_data()
        self.similarity_matrix = cosine_similarity(self.feature_matrix)

    def get_lookalikes(self, customer_id, n=3):
        """
        Get the top N lookalike customers for a given customer.

        Returns a list of at most ``n`` ``(CustomerID, similarity)`` tuples,
        most similar first, never containing ``customer_id`` itself. Returns
        an empty list when ``customer_id`` is unknown.
        """
        if self.similarity_matrix is None:
            self.calculate_similarity()

        if customer_id not in self.customer_indices:
            return []

        customer_idx = self.customer_indices[customer_id]
        customer_similarities = self.similarity_matrix[customer_idx]

        # Rank by descending similarity; the stable sort keeps ties in row order.
        ranked = np.argsort(-customer_similarities, kind='stable')
        # Exclude the customer by index rather than by rank: with an all-zero
        # feature row or tied scores the customer is not guaranteed to be the
        # first entry of the ranking.
        ranked = ranked[ranked != customer_idx][:max(n, 0)]

        id_column = self.customers_df['CustomerID']
        return [(id_column.iloc[idx], customer_similarities[idx]) for idx in ranked]

def generate_lookalike_csv(customers, transactions, products, num_customers=20,
                           n_lookalikes=3, output_path='Prem_Kumar_Tiwari_Lookalike.csv'):
    """
    Generate a CSV file containing lookalike recommendations for the first customers.

    Parameters:
        customers, transactions, products: input tables handed to LookalikeModel.
        num_customers: how many leading rows of ``customers`` to produce
            recommendations for (default 20).
        n_lookalikes: number of lookalikes per customer (default 3). Each one
            adds a ``lookalike_<k>`` and a ``score_<k>`` column.
        output_path: where the CSV is written; pass None to skip writing.

    Returns:
        DataFrame with one row per customer: ``cust_id`` followed by the
        lookalike/score column pairs. Missing lookalikes are stored as None.
    """
    model = LookalikeModel(customers, transactions, products)
    model.preprocess_data()
    model.calculate_similarity()

    results = []
    # .iloc makes the "first N rows" selection explicitly positional.
    for customer_id in customers['CustomerID'].iloc[:num_customers]:
        lookalikes = model.get_lookalikes(customer_id, n=n_lookalikes)
        row = {'cust_id': customer_id}
        for rank in range(n_lookalikes):
            # Pad with None when fewer lookalikes than requested are available.
            if rank < len(lookalikes):
                match_id, score = lookalikes[rank]
            else:
                match_id, score = None, None
            row[f'lookalike_{rank + 1}'] = match_id
            row[f'score_{rank + 1}'] = score
        results.append(row)

    lookalike_df = pd.DataFrame(results)
    if output_path is not None:
        lookalike_df.to_csv(output_path, index=False)
    return lookalike_df

if __name__ == "__main__":
    # Read the three input tables; the file names are relative to the working directory.
    customers_df, transactions_df, products_df = (
        pd.read_csv(csv_name)
        for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
    )

    # Build the recommendations (the call also writes the CSV) and show a preview.
    lookalike_results = generate_lookalike_csv(customers_df, transactions_df, products_df)
    print(
        "Lookalike recommendations saved to 'Prem_Kumar_Tiwari_Lookalike.csv'",
        "\nFirst few recommendations:",
        lookalike_results.head(),
        sep="\n",
    )


Lookalike recommendations saved to 'Prem_Kumar_Tiwari_Lookalike.csv'

First few recommendations:
  cust_id lookalike_1   score_1 lookalike_2   score_2 lookalike_3   score_3
0   C0001       C0091  0.896267       C0120  0.793542       C0069  0.791536
1   C0002       C0134  0.903862       C0029  0.891076       C0103  0.874419
2   C0003       C0085  0.859725       C0031  0.812542       C0026  0.798385
3   C0004       C0075  0.962785       C0113  0.851435       C0090  0.851212
4   C0005       C0197  0.940097       C0007  0.920369       C0166  0.891699
