In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import json

class LookalikeModel:
    """Recommend similar ("lookalike") customers based on purchase history.

    Typical use::

        model = LookalikeModel()
        model.load_data()
        model.build_customer_profiles()
        recommendations = model.generate_lookalike_recommendations()
        model.save_recommendations_to_csv(recommendations)

    The similarity between two customers is a weighted sum of three
    component scores, each in ``[0, 1]``; the weights are the class
    constants below and can be overridden in a subclass.
    """

    # Weights of the three similarity components (they sum to 1.0).
    SPENDING_WEIGHT = 0.3
    FREQUENCY_WEIGHT = 0.2
    CATEGORY_WEIGHT = 0.5

    def __init__(self):
        # Maps CustomerID -> profile dict, filled by build_customer_profiles().
        self.customer_profiles = {}
        # Not used by any method of this class; kept so the set of instance
        # attributes stays the same for existing callers.
        self.scaler = StandardScaler()

    def load_data(self, customers_path='Customers.csv',
                  products_path='Products.csv',
                  transactions_path='Transactions.csv'):
        """Read the three input CSV files and parse their date columns.

        Args:
            customers_path: CSV with a ``SignupDate`` column (the profile
                builder additionally reads ``CustomerID`` and ``Region``).
            products_path: CSV read as-is (the profile builder reads
                ``ProductID`` and ``Category``).
            transactions_path: CSV with a ``TransactionDate`` column (the
                profile builder additionally reads ``CustomerID``,
                ``ProductID``, ``Quantity`` and ``TotalValue``).

        The defaults are the file names in the current working directory.
        Results are stored on ``self.customers_df``, ``self.products_df``
        and ``self.transactions_df``.
        """
        self.customers_df = pd.read_csv(customers_path)
        self.products_df = pd.read_csv(products_path)
        self.transactions_df = pd.read_csv(transactions_path)

        # Convert date columns
        self.customers_df['SignupDate'] = pd.to_datetime(self.customers_df['SignupDate'])
        self.transactions_df['TransactionDate'] = pd.to_datetime(self.transactions_df['TransactionDate'])

    def build_customer_profiles(self):
        """Aggregate the loaded tables into one profile dict per customer.

        Each profile stored in ``self.customer_profiles[customer_id]`` has
        the keys ``region``, ``signup_date``, ``total_spent``,
        ``transaction_count``, ``avg_transaction_value``, ``products``
        (ProductID -> total quantity) and ``product_categories``
        (Category -> total quantity).

        Transactions whose customer is not in the customers table are
        skipped, and products that are not in the product catalogue are
        left out of ``product_categories`` (they still appear in
        ``products``), so inconsistent input data no longer aborts the run.
        """
        # Initialize profiles with basic customer info
        for _, customer in self.customers_df.iterrows():
            self.customer_profiles[customer['CustomerID']] = {
                'region': customer['Region'],
                'signup_date': customer['SignupDate'],
                'total_spent': 0,
                'transaction_count': 0,
                'avg_transaction_value': 0,
                'product_categories': {},
                'products': {}
            }

        # Process transactions
        for _, transaction in self.transactions_df.iterrows():
            profile = self.customer_profiles.get(transaction['CustomerID'])
            if profile is None:
                # Transaction for a customer we have no record of.
                continue

            profile['total_spent'] += transaction['TotalValue']
            profile['transaction_count'] += 1

            purchased = profile['products']
            product_id = transaction['ProductID']
            purchased[product_id] = purchased.get(product_id, 0) + transaction['Quantity']

        # Build the ProductID -> Category lookup once instead of filtering
        # the products table for every purchased product. keep='first'
        # mirrors taking the first matching catalogue row.
        catalogue = self.products_df.drop_duplicates(subset='ProductID', keep='first')
        category_by_product = dict(zip(catalogue['ProductID'], catalogue['Category']))

        for profile in self.customer_profiles.values():
            if profile['transaction_count'] > 0:
                profile['avg_transaction_value'] = profile['total_spent'] / profile['transaction_count']

            category_counts = {}
            for product_id, quantity in profile['products'].items():
                if product_id not in category_by_product:
                    # Product missing from the catalogue: category unknown.
                    continue
                category = category_by_product[product_id]
                category_counts[category] = category_counts.get(category, 0) + quantity

            profile['product_categories'] = category_counts

    @staticmethod
    def _relative_closeness(value1, value2):
        """Return ``1 - |a - b| / max(|a|, |b|)`` clamped to ``[0, 1]``.

        Two zeros count as identical (1.0). The clamp only matters when the
        values have opposite signs; for non-negative inputs the raw ratio is
        already within range.
        """
        largest = max(abs(value1), abs(value2))
        if largest == 0:
            return 1.0
        return max(0.0, 1 - abs(value1 - value2) / largest)

    def calculate_similarity(self, profile1, profile2):
        """Calculate the similarity score between two customer profiles.

        Components:
            1. Spending pattern (``SPENDING_WEIGHT``): relative closeness of
               the average transaction values; 0 if either customer has no
               transactions.
            2. Transaction frequency (``FREQUENCY_WEIGHT``): relative
               closeness of the transaction counts (two customers with no
               transactions score 1 here).
            3. Category preference (``CATEGORY_WEIGHT``): number of
               categories both customers bought from, divided by the number
               of categories either profile lists; 0 if there are none.

        Args:
            profile1: Profile dict as produced by build_customer_profiles().
            profile2: Profile dict as produced by build_customer_profiles().

        Returns:
            The weighted score as a plain ``float`` rounded to 4 decimals.
        """
        # 1. Spending pattern similarity
        if profile1['transaction_count'] == 0 or profile2['transaction_count'] == 0:
            spending_score = 0
        else:
            spending_score = self._relative_closeness(
                profile1['avg_transaction_value'],
                profile2['avg_transaction_value'],
            )

        # 2. Transaction frequency similarity
        freq_score = self._relative_closeness(
            profile1['transaction_count'],
            profile2['transaction_count'],
        )

        # 3. Category preference similarity
        categories1 = profile1['product_categories']
        categories2 = profile2['product_categories']
        all_categories = set(categories1) | set(categories2)
        if not all_categories:
            category_score = 0
        else:
            # A category only counts as shared when both bought a positive quantity.
            bought1 = {name for name, quantity in categories1.items() if quantity > 0}
            bought2 = {name for name, quantity in categories2.items() if quantity > 0}
            category_score = len(bought1 & bought2) / len(all_categories)

        # Calculate weighted similarity score
        similarity_score = (
            spending_score * self.SPENDING_WEIGHT +
            freq_score * self.FREQUENCY_WEIGHT +
            category_score * self.CATEGORY_WEIGHT
        )

        # float() turns a possible NumPy scalar into a built-in float.
        return float(round(similarity_score, 4))

    def find_lookalikes(self, customer_id, n_recommendations=3):
        """Find the top ``n_recommendations`` customers most similar to one customer.

        Args:
            customer_id: Key of the target customer in ``self.customer_profiles``.
            n_recommendations: Maximum number of lookalikes to return; zero
                or a negative number yields an empty list.

        Returns:
            A list of ``{'customer_id': ..., 'similarity_score': ...}`` dicts,
            best match first. Ties keep the iteration order of
            ``self.customer_profiles``.

        Raises:
            KeyError: If ``customer_id`` has no profile.
        """
        target_profile = self.customer_profiles[customer_id]
        if n_recommendations <= 0:
            # A negative slice bound would otherwise return nearly everyone.
            return []

        similarities = [
            (other_id, self.calculate_similarity(target_profile, other_profile))
            for other_id, other_profile in self.customer_profiles.items()
            if other_id != customer_id
        ]

        # Sort by similarity score and get top n
        top_n = sorted(similarities, key=lambda x: x[1], reverse=True)[:n_recommendations]
        return [{'customer_id': cid, 'similarity_score': score} for cid, score in top_n]

    def generate_lookalike_recommendations(self, n_customers=20):
        """Generate lookalike recommendations for the first customers by sorted ID.

        Args:
            n_customers: How many customers (in sorted CustomerID order) to
                process; ``None`` processes every customer. Defaults to 20.

        Returns:
            Dict mapping each processed CustomerID to its find_lookalikes() result.
        """
        customer_ids = sorted(self.customer_profiles)
        if n_customers is not None:
            customer_ids = customer_ids[:max(n_customers, 0)]

        return {cust_id: self.find_lookalikes(cust_id) for cust_id in customer_ids}

    def save_recommendations_to_csv(self, recommendations, output_file='Lookalike.csv'):
        """Save recommendations to a CSV file.

        Args:
            recommendations: Mapping of CustomerID to a list of lookalike
                dicts, as returned by generate_lookalike_recommendations().
            output_file: Destination passed to ``DataFrame.to_csv``.

        The file has the columns ``customer_id`` and ``lookalikes``; the
        latter holds the lookalike list serialized as JSON. The header row
        is written even when ``recommendations`` is empty.
        """
        rows = [
            {'customer_id': cust_id, 'lookalikes': json.dumps(lookalikes)}
            for cust_id, lookalikes in recommendations.items()
        ]

        # Explicit columns keep the header stable for an empty result set.
        pd.DataFrame(rows, columns=['customer_id', 'lookalikes']).to_csv(output_file, index=False)

def main():
    """Run the full pipeline: load, profile, recommend, save, then print a preview."""
    model = LookalikeModel()

    print("Loading data...")
    model.load_data()

    print("Building customer profiles...")
    model.build_customer_profiles()

    print("Generating lookalike recommendations...")
    recommendations = model.generate_lookalike_recommendations()

    print("Saving recommendations to CSV...")
    model.save_recommendations_to_csv(recommendations)

    print("Process complete!, saved into Lookalike.csv")

    # Preview: show the recommendations of the first five customers only.
    print("\nSample recommendations:")
    preview_limit = 5
    for position, (cust_id, recs) in enumerate(recommendations.items()):
        if position >= preview_limit:
            break
        print(f"\nCustomer {cust_id}:")
        for rec in recs:
            print(f"  - Similar to {rec['customer_id']} (score: {rec['similarity_score']})")

# Run the pipeline only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Loading data...
Building customer profiles...
Generating lookalike recommendations...
Saving recommendations to CSV...
Process complete!, saved into Lookalike.csv

Sample recommendations:

Customer C0001:
  - Similar to C0152 (score: 0.9972)
  - Similar to C0174 (score: 0.9586)
  - Similar to C0035 (score: 0.9493)

Customer C0002:
  - Similar to C0134 (score: 0.9207)
  - Similar to C0062 (score: 0.9056)
  - Similar to C0133 (score: 0.8937)

Customer C0003:
  - Similar to C0166 (score: 0.9648)
  - Similar to C0026 (score: 0.9304)
  - Similar to C0195 (score: 0.9295)

Customer C0004:
  - Similar to C0012 (score: 0.9437)
  - Similar to C0001 (score: 0.9243)
  - Similar to C0065 (score: 0.922)

Customer C0005:
  - Similar to C0197 (score: 0.9844)
  - Similar to C0007 (score: 0.9366)
  - Similar to C0095 (score: 0.9174)
