In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [7]:
class CustomerLookalikeModel:
    """Recommend "lookalike" customers via cosine similarity of purchase behaviour.

    Intended call order:
    ``load_data()`` -> ``create_customer_features()`` ->
    ``prepare_feature_matrix()`` -> ``find_lookalikes(customer_id)``.
    """

    def __init__(self):
        # Scaler is fitted later, in prepare_feature_matrix().
        self.scaler = StandardScaler()
        self.feature_matrix = None  # scaled array, one row per customer
        self.customer_ids = None    # CustomerID values aligned with feature_matrix rows
        self.features = None        # per-customer feature DataFrame

    def load_data(self, customers_path='Customers.csv',
                  products_path='Products.csv',
                  transactions_path='Transactions.csv'):
        """Load the three input tables and parse their date columns.

        Args:
            customers_path: CSV with at least ``CustomerID`` and ``SignupDate``.
            products_path: CSV with at least ``ProductID``, ``Category`` and
                ``Price``.
            transactions_path: CSV with at least ``TransactionID``,
                ``CustomerID``, ``ProductID``, ``TransactionDate``,
                ``Quantity`` and ``TotalValue``.

        The defaults reproduce the original hard-coded file names, which are
        resolved relative to the current working directory.
        """
        self.customers_df = pd.read_csv(customers_path)
        self.products_df = pd.read_csv(products_path)
        self.transactions_df = pd.read_csv(transactions_path)

        # Convert dates
        self.customers_df['SignupDate'] = pd.to_datetime(self.customers_df['SignupDate'])
        self.transactions_df['TransactionDate'] = pd.to_datetime(
            self.transactions_df['TransactionDate']
        )

    def _purchase_counts(self, product_column, make_label):
        """Count each customer's transactions per value of a product attribute.

        Args:
            product_column: Column of ``self.products_df`` to break purchases
                down by (e.g. ``'Category'``).
            make_label: Callable mapping an attribute value to the name of the
                resulting feature column.

        Returns:
            DataFrame indexed by ``CustomerID`` with one count column per
            attribute value; customer/value pairs with no purchases hold 0.
        """
        counts = (
            self.transactions_df
            .merge(self.products_df[['ProductID', product_column]], on='ProductID')
            # observed=False keeps every level of a categorical attribute, so
            # the set of feature columns does not depend on which levels
            # happen to have been purchased (and on the pandas default).
            .groupby(['CustomerID', product_column], observed=False)
            .agg({'TransactionID': 'count'})
            .reset_index()
            .pivot(index='CustomerID', columns=product_column, values='TransactionID')
            .fillna(0)
        )
        counts.columns = [make_label(value) for value in counts.columns]
        return counts

    def create_customer_features(self):
        """Build one feature row per customer from all loaded tables.

        Combines the customer table with per-customer transaction aggregates,
        purchase counts per product category (``purchases_<category>``) and
        purchase counts per price tier (``price_tier_low/medium/high``).

        Side effect: adds a ``price_tier`` column to ``self.products_df``.

        Returns:
            The feature DataFrame, also stored on ``self.features``. Missing
            numeric values (customers without transactions, or a standard
            deviation over a single transaction) are 0; non-numeric columns
            keep their missing markers.
        """
        # Transaction-based features. Named aggregation binds every output
        # column to its source column and function, instead of renaming a
        # flattened MultiIndex by position.
        transaction_features = (
            self.transactions_df
            .groupby('CustomerID')
            .agg(
                total_transactions=('TransactionID', 'count'),
                total_spend=('TotalValue', 'sum'),
                avg_transaction_value=('TotalValue', 'mean'),
                std_transaction_value=('TotalValue', 'std'),
                total_items=('Quantity', 'sum'),
                avg_items_per_transaction=('Quantity', 'mean'),
                std_items_per_transaction=('Quantity', 'std'),
                first_purchase_date=('TransactionDate', 'min'),
                last_purchase_date=('TransactionDate', 'max'),
            )
            .reset_index()
        )

        # Calculate customer lifetime (days between first and last purchase)
        for date_column in ('first_purchase_date', 'last_purchase_date'):
            transaction_features[date_column] = pd.to_datetime(transaction_features[date_column])
        transaction_features['customer_lifetime_days'] = (
            transaction_features['last_purchase_date']
            - transaction_features['first_purchase_date']
        ).dt.days

        # Category preferences
        category_preferences = self._purchase_counts(
            'Category', lambda category: f'purchases_{str(category).lower()}'
        )

        # Price tier preferences: products are split into three equal-sized
        # price bands by quantile.
        self.products_df['price_tier'] = pd.qcut(
            self.products_df['Price'], q=3, labels=['low', 'medium', 'high']
        )
        price_preferences = self._purchase_counts(
            'price_tier', lambda tier: f'price_tier_{tier}'
        )

        # Merge all features; left joins keep customers without transactions.
        features = (
            self.customers_df
            .merge(transaction_features, on='CustomerID', how='left')
            .merge(category_preferences, on='CustomerID', how='left')
            .merge(price_preferences, on='CustomerID', how='left')
        )

        # Fill NaN values in numeric columns only. A blanket fillna(0) would
        # also write the integer 0 into date and text columns, turning them
        # into mixed-type object columns.
        numeric_columns = features.select_dtypes(include='number').columns
        features[numeric_columns] = features[numeric_columns].fillna(0)

        self.features = features
        return self.features

    def prepare_feature_matrix(self):
        """Select the numeric features and standardise them.

        Fits ``self.scaler`` on the selected columns of ``self.features`` and
        records the matching ``CustomerID`` order in ``self.customer_ids``.

        Returns:
            The scaled feature matrix, also stored on ``self.feature_matrix``.
        """
        # Fixed transaction aggregates plus every dynamically named
        # category / price-tier count column.
        numerical_features = [
            'total_transactions', 'total_spend', 'avg_transaction_value',
            'std_transaction_value', 'total_items', 'avg_items_per_transaction',
            'std_items_per_transaction', 'customer_lifetime_days'
        ] + [col for col in self.features.columns if col.startswith(('purchases_', 'price_tier_'))]

        # Store customer IDs so matrix rows can be mapped back to customers
        self.customer_ids = self.features['CustomerID'].values

        # Create and scale feature matrix
        feature_matrix = self.features[numerical_features].copy()
        self.feature_matrix = self.scaler.fit_transform(feature_matrix)

        return self.feature_matrix

    def find_lookalikes(self, customer_id, n_recommendations=3):
        """Find the customers most similar to ``customer_id``.

        Args:
            customer_id: ID to look up in ``self.customer_ids``.
            n_recommendations: Maximum number of lookalikes to return; values
                of 0 or less yield an empty list.

        Returns:
            List of ``(customer_id, similarity_score)`` tuples ordered from
            most to least similar. The queried customer is never included.
            Fewer than ``n_recommendations`` entries are returned when there
            are not enough other customers.

        Raises:
            IndexError: If ``customer_id`` is not among the prepared customers
                (including when ``prepare_feature_matrix()`` has not run yet).
        """
        # Get customer index
        matches = np.flatnonzero(self.customer_ids == customer_id)
        if matches.size == 0:
            raise IndexError(
                f"Customer ID {customer_id!r} is not in the prepared feature matrix"
            )
        customer_idx = matches[0]

        # Calculate similarity scores against every customer
        similarity_scores = cosine_similarity(
            self.feature_matrix[customer_idx].reshape(1, -1),
            self.feature_matrix
        )[0]

        # Rank by descending similarity, then remove the query customer by
        # index. Dropping "whatever ranks first" is not safe: a customer with
        # an identical feature row ties with the query, and an all-zero row
        # has similarity 0 with itself, so the query is not always first.
        ranked = np.argsort(similarity_scores)[::-1]
        similar_indices = ranked[ranked != customer_idx][:max(n_recommendations, 0)]

        return [
            (self.customer_ids[idx], similarity_scores[idx])
            for idx in similar_indices
        ]

In [8]:
# Initialize the model and build its scaled feature matrix.
# Each step depends on state set by the previous one, so the calls must run
# in this order.
model = CustomerLookalikeModel()
model.load_data()  # reads Customers.csv, Products.csv and Transactions.csv (relative paths)
model.create_customer_features()  # populates model.features
model.prepare_feature_matrix()  # last expression: the scaled matrix is echoed as the cell output

array([[ 0.        , -0.05188436, -0.05478053, ...,  0.24704743,
        -0.46291005,  0.21161827],
       [-0.45129368, -0.86271433, -0.9039848 , ...,  0.24704743,
        -0.46291005, -0.53090197],
       [-0.45129368, -0.393842  , -0.01157526, ...,  0.24704743,
         0.3086067 , -1.27342221],
       ...,
       [-1.35388105, -1.36869358, -0.90303305, ..., -0.53723012,
        -1.2344268 , -0.53090197],
       [-0.45129368, -0.79937112, -0.78342303, ...,  0.24704743,
        -0.46291005, -0.53090197],
       [ 0.        ,  0.71127787,  1.1072471 , ..., -0.53723012,
         0.3086067 ,  0.21161827]])

In [9]:
# Generate recommendations for the first 20 customers.
# Each entry of `results` is one output row:
# customer_id, then lookalike_<k> / score_<k> for k = 1..N_LOOKALIKES.
N_CUSTOMERS = 20   # number of leading customers to score
N_LOOKALIKES = 3   # lookalikes reported per customer

first_20_customers = model.customers_df['CustomerID'].iloc[:N_CUSTOMERS]

results = []
for customer_id in first_20_customers:
    lookalikes = model.find_lookalikes(customer_id, n_recommendations=N_LOOKALIKES)

    # Build the row from however many lookalikes came back rather than
    # indexing positions 0..2 directly, which raises IndexError when fewer
    # than three other customers exist. Missing ranks simply have no key.
    row = {'customer_id': customer_id}
    for rank, (lookalike_id, score) in enumerate(lookalikes, start=1):
        row[f'lookalike_{rank}'] = lookalike_id
        row[f'score_{rank}'] = round(score, 4)
    results.append(row)


In [10]:
# Turn the collected rows into a table and write it out as the Lookalike.csv file
lookalike_df = pd.DataFrame(data=results)
lookalike_df.to_csv(
    path_or_buf='Dasappagari_Sreenivas_Lookalike.csv',
    index=False,  # leave the row index out of the file
)