In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from scipy.sparse.linalg import svds

2025-01-28 15:47:44.557892: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738059464.577375   96407 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738059464.582167   96407 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-28 15:47:44.599505: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class LookalikeModel:
    def __init__(self):
        self.transactions_df = pd.read_csv('Transactions.csv')
        self.products_df = pd.read_csv('Products.csv')
        self.customers_df = pd.read_csv('Customers.csv')
        self.scaler = StandardScaler()
        
        # Split transactions into train and test sets (80-20 split)
        cutoff_date = pd.Timestamp('2024-10-01')  # Using Oct 1, 2024 as cutoff
        self.train_transactions = self.transactions_df[
            pd.to_datetime(self.transactions_df['TransactionDate']) < cutoff_date
        ]
        self.test_transactions = self.transactions_df[
            pd.to_datetime(self.transactions_df['TransactionDate']) >= cutoff_date
        ]
        
    def create_customer_profiles(self, transactions_df=None):
        """
        Enhanced customer profile creation with better features and weights
        """
        if transactions_df is None:
            transactions_df = self.train_transactions
            
        # Merge transactions with products
        trans_prod = transactions_df.merge(self.products_df, on='ProductID')
        
        # Convert datetime columns
        trans_prod['TransactionDate'] = pd.to_datetime(trans_prod['TransactionDate'])
        self.customers_df['SignupDate'] = pd.to_datetime(self.customers_df['SignupDate'])
        
        # Calculate time-based features
        current_date = pd.Timestamp('2024-12-31')
        trans_prod['days_since_signup'] = (trans_prod['TransactionDate'] - 
                                         pd.to_datetime(self.customers_df.set_index('CustomerID')
                                         .loc[trans_prod['CustomerID'], 'SignupDate'].values)).dt.days
        
        # Calculate price segments and purchase behavior
        trans_prod['avg_price'] = trans_prod['TotalValue'] / trans_prod['Quantity']
        trans_prod['price_segment'] = pd.qcut(trans_prod['avg_price'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])
        
        # Enhanced RFM with time weights
        customer_rfm = trans_prod.groupby('CustomerID').agg({
            'TransactionDate': lambda x: np.exp(-0.01 * (current_date - x.max()).days),  # Exponential decay for recency
            'TransactionID': 'count',
            'TotalValue': 'sum',
            'days_since_signup': 'max'  # Customer age
        }).rename(columns={
            'TransactionDate': 'recency_score',
            'TransactionID': 'frequency',
            'TotalValue': 'monetary',
            'days_since_signup': 'customer_age'
        })
        
        # Normalize monetary value by customer age
        customer_rfm['monetary_per_day'] = customer_rfm['monetary'] / customer_rfm['customer_age'].clip(lower=1)
        
        # Purchase patterns with price segments
        purchase_patterns = trans_prod.groupby('CustomerID').agg({
            'Quantity': ['sum', 'mean', 'std'],
            'TotalValue': ['mean', 'std'],
            'avg_price': ['mean', 'std', 'min', 'max']
        })
        purchase_patterns.columns = [f"{col[0]}_{col[1]}" for col in purchase_patterns.columns]
        
        # Price segment preferences
        price_segments = pd.get_dummies(trans_prod[['CustomerID', 'price_segment']]
                                      .set_index('CustomerID')).groupby('CustomerID').mean()
        
        # Category preferences with value weights
        category_totals = trans_prod.groupby('Category')['TotalValue'].sum()
        category_weights = 1 / np.log1p(category_totals)  # Inverse log weighting
        
        # Create category features
        category_features = []
        for category in trans_prod['Category'].unique():
            # Total value per category
            cat_value = trans_prod[trans_prod['Category'] == category].groupby('CustomerID')['TotalValue'].sum()
            cat_value = cat_value * category_weights[category]
            cat_value.name = f'weighted_value_{category}'
            category_features.append(cat_value)
            
            # Quantity per category
            cat_qty = trans_prod[trans_prod['Category'] == category].groupby('CustomerID')['Quantity'].sum()
            cat_qty.name = f'quantity_{category}'
            category_features.append(cat_qty)
        
        category_df = pd.concat(category_features, axis=1).fillna(0)
        
        # Customer demographics with region similarity
        region_dummies = pd.get_dummies(
            self.customers_df[['CustomerID', 'Region']], 
            columns=['Region'],
            prefix='region'
        ).set_index('CustomerID')
        
        # Combine all features
        customer_profiles = (
            customer_rfm
            .join(purchase_patterns)
            .join(price_segments)
            .join(category_df)
            .join(region_dummies)
        ).fillna(0)
        
        # Feature scaling
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(customer_profiles)
        
        return pd.DataFrame(
            scaled_features,
            index=customer_profiles.index,
            columns=customer_profiles.columns
        )
    
    def create_collaborative_filtering_matrix(self):
        print("Creating user-item matrix...")
        user_item_matrix = pd.pivot_table(
            self.transactions_df,
            values='Quantity',
            index='CustomerID',
            columns='ProductID',
            aggfunc='sum',
            fill_value=0
        )
        
        print(f"User-item matrix shape: {user_item_matrix.shape}")
        
        # Convert to float type
        user_item_matrix = user_item_matrix.astype(np.float32)
        
        # Calculate number of components
        k = min(50, min(user_item_matrix.shape) - 1)
        print(f"Computing SVD with {k} components...")
        
        # Apply SVD
        U, sigma, Vt = svds(user_item_matrix.values, k=k)
        sigma = np.diag(sigma)
        
        # Transform user profiles to latent space
        user_latent_features = np.dot(U, np.sqrt(sigma))
        return pd.DataFrame(user_latent_features, index=user_item_matrix.index)
    
    def get_lookalikes(self, customer_profiles, target_customers, n_recommendations=3):
        """
        Enhanced lookalike model with multiple similarity measures
        """
        # Split features into different aspects
        cols = customer_profiles.columns
        rfm_cols = [col for col in cols if any(x in col.lower() for x in ['recency', 'frequency', 'monetary'])]
        price_cols = [col for col in cols if any(x in col.lower() for x in ['price', 'value'])]
        category_cols = [col for col in cols if 'TotalValue_' in col]
        region_cols = [col for col in cols if 'region_' in col]
        
        # Calculate similarity matrices for different aspects
        rfm_sim = cosine_similarity(customer_profiles[rfm_cols])
        price_sim = cosine_similarity(customer_profiles[price_cols])
        category_sim = cosine_similarity(customer_profiles[category_cols])
        region_sim = cosine_similarity(customer_profiles[region_cols])
        
        # Combine similarities with weights
        similarity_matrix = (
            0.3 * rfm_sim +
            0.3 * price_sim +
            0.3 * category_sim +
            0.1 * region_sim
        )
        
        similarity_df = pd.DataFrame(
            similarity_matrix,
            index=customer_profiles.index,
            columns=customer_profiles.index
        )
        
        # Get recommendations
        results = {}
        for customer in target_customers:
            if customer not in similarity_df.index:
                continue
            
            # Get similarity scores excluding self
            scores = similarity_df[customer].sort_values(ascending=False)
            scores = scores[scores.index != customer]
            
            # Get top N recommendations
            top_similar = scores[:n_recommendations]
            results[customer] = [(cust, round(score, 3)) for cust, score in top_similar.items()]
        
        return results

    def save_recommendations(self, recommendations):
        output_rows = []
        for customer, recs in recommendations.items():
            recs_str = ';'.join([f"{cust}:{score}" for cust, score in recs])
            output_rows.append({
                'CustomerID': customer,
                'Lookalikes': recs_str
            })
        
        output_df = pd.DataFrame(output_rows)
        output_df.to_csv('Lookalike.csv', index=False)

    def evaluate_recommendations(self, recommendations, test_transactions):
        """
        Evaluate recommendation accuracy using test data
        Returns precision@k, recall@k, and F1@k for different k values
        """
        # Get actual customer interactions from test data
        actual_interactions = {}
        for customer in recommendations:
            customer_transactions = test_transactions[
                test_transactions['CustomerID'] == customer
            ]
            # Get unique customers who bought the same products
            similar_customers = set()
            for product in customer_transactions['ProductID'].unique():
                product_customers = set(
                    test_transactions[
                        test_transactions['ProductID'] == product
                    ]['CustomerID']
                )
                similar_customers.update(product_customers)
            similar_customers.discard(customer)  # Remove self
            actual_interactions[customer] = similar_customers
            
        # Calculate metrics for different k values
        k_values = [3, 5, 10]
        metrics = {}
        
        for k in k_values:
            total_precision = 0
            total_recall = 0
            total_f1 = 0
            valid_customers = 0
            
            for customer, actual in actual_interactions.items():
                if customer not in recommendations or not actual:
                    continue
                    
                # Get top k recommended customers
                recommended = set([rec[0] for rec in recommendations[customer][:k]])
                
                # Calculate metrics
                if len(recommended) > 0:
                    precision = len(recommended.intersection(actual)) / len(recommended)
                    recall = len(recommended.intersection(actual)) / len(actual)
                    
                    # Calculate F1 score
                    if precision + recall > 0:
                        f1 = 2 * (precision * recall) / (precision + recall)
                    else:
                        f1 = 0.0
                    
                    total_precision += precision
                    total_recall += recall
                    total_f1 += f1
                    valid_customers += 1
            
            # Calculate averages
            if valid_customers > 0:
                metrics[f'precision@{k}'] = total_precision / valid_customers
                metrics[f'recall@{k}'] = total_recall / valid_customers
                metrics[f'f1@{k}'] = total_f1 / valid_customers
            else:
                metrics[f'precision@{k}'] = 0.0
                metrics[f'recall@{k}'] = 0.0
                metrics[f'f1@{k}'] = 0.0
            
        return metrics

    def print_evaluation_results(self, metrics, method_name="Hybrid"):
        """
        Enhanced print function with detailed algorithm names
        """
        if "Traditional" in method_name:
            algo_details = (
                "Traditional Method (Weighted Combination):\n"
                "- RFM Similarity (30%)\n"
                "- Price-based Similarity (30%)\n"
                "- Category Preference Similarity (30%)\n"
                "- Regional Similarity (10%)"
            )
        else:
            algo_details = (
                "Hybrid ML Method:\n"
                "- Traditional Features (40%)\n"
                "- Matrix Factorization/SVD (30%)\n"
                "- Neural Network Embeddings (20%)\n"
                "- K-Means Clustering (10%)"
            )
        
        print(f"\nEvaluation Results for {algo_details}")
        print("=" * 50)
        
        k_values = [3, 5, 10]
        for k in k_values:
            print(f"\nFor k = {k}:")
            print("-" * 20)
            print(f"Precision: {metrics[f'precision@{k}']:.3f}")
            print(f"Recall   : {metrics[f'recall@{k}']:.3f}")
            print(f"F1 Score : {metrics[f'f1@{k}']:.3f}")

    def create_hybrid_recommendations(self, customer_profiles, target_customers, n_recommendations=3):
        """
        Create recommendations using multiple ML approaches and combine them
        """
        results = {}
        
        # 1. Matrix Factorization using SVD
        user_item_matrix = pd.pivot_table(
            self.train_transactions,
            values='TotalValue',
            index='CustomerID',
            columns='ProductID',
            aggfunc='sum',
            fill_value=0
        )
        
        # Normalize the matrix
        matrix = user_item_matrix.values
        user_ratings_mean = np.mean(matrix, axis=1)
        matrix_normalized = matrix - user_ratings_mean.reshape(-1, 1)
        
        # SVD
        U, sigma, Vt = svds(matrix_normalized, k=20)  # Reduced dimensions
        sigma = np.diag(sigma)
        user_features = np.dot(U, np.sqrt(sigma))
        
        # 2. Neural Network Embeddings
        input_dim = customer_profiles.shape[1]
        inputs = tf.keras.Input(shape=(input_dim,))
        x = tf.keras.layers.Dense(64, activation='relu')(inputs)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(32, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.2)(x)
        outputs = tf.keras.layers.Dense(16, activation='relu')(x)
        
        embedding_model = tf.keras.Model(inputs=inputs, outputs=outputs)
        customer_embeddings = embedding_model.predict(customer_profiles.values)
        
        # 3. Clustering
        kmeans = KMeans(n_clusters=10, random_state=42)
        clusters = kmeans.fit_predict(customer_profiles)
        cluster_df = pd.DataFrame({'cluster': clusters}, index=customer_profiles.index)
        
        # Combine features for final similarity calculation
        combined_features = np.hstack([
            customer_profiles.values * 0.4,  # Traditional features
            user_features * 0.3,            # Matrix factorization features
            customer_embeddings * 0.2,      # Neural embeddings
            np.eye(len(clusters))[clusters] * 0.1  # One-hot encoded clusters
        ])
        
        # Calculate final similarity
        similarity_matrix = cosine_similarity(combined_features)
        similarity_df = pd.DataFrame(
            similarity_matrix,
            index=customer_profiles.index,
            columns=customer_profiles.index
        )
        
        # Get recommendations
        for customer in target_customers:
            if customer not in similarity_df.index:
                continue
            
            # Get similarity scores excluding self
            scores = similarity_df[customer].sort_values(ascending=False)
            scores = scores[scores.index != customer]
            
            # Get top N recommendations
            top_similar = scores[:n_recommendations]
            results[customer] = [(cust, round(score, 3)) for cust, score in top_similar.items()]
        
        return results

In [3]:
def main():
    model = LookalikeModel()
    customer_profiles = model.create_customer_profiles()
    target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]
    
    print("\nGenerating Recommendations...")
    trad_recommendations = model.get_lookalikes(customer_profiles, target_customers)
    trad_metrics = model.evaluate_recommendations(trad_recommendations, model.test_transactions)
    model.print_evaluation_results(trad_metrics, "Traditional")
    
    hybrid_recommendations = model.create_hybrid_recommendations(customer_profiles, target_customers)
    hybrid_metrics = model.evaluate_recommendations(hybrid_recommendations, model.test_transactions)
    model.print_evaluation_results(hybrid_metrics, "Hybrid")
    
    # Print example recommendations
    print("\nExample Recommendations:")
    print("=" * 50)
    for customer in target_customers[:3]:
        print(f"\nCustomer {customer}:")
        print("Cosine Similarity with Feature Weights:")
        if customer in trad_recommendations:
            for rec_customer, score in trad_recommendations[customer]:
                print(f"  - {rec_customer} (similarity: {score:.3f})")
        print("\nEnsemble of ML Models:")
        if customer in hybrid_recommendations:
            for rec_customer, score in hybrid_recommendations[customer]:
                print(f"  - {rec_customer} (similarity: {score:.3f})")

if __name__ == "__main__":
    main()


Generating Recommendations...

Evaluation Results for Traditional Method (Weighted Combination):
- RFM Similarity (30%)
- Price-based Similarity (30%)
- Category Preference Similarity (30%)
- Regional Similarity (10%)

For k = 3:
--------------------
Precision: 0.051
Recall   : 0.049
F1 Score : 0.046

For k = 5:
--------------------
Precision: 0.051
Recall   : 0.049
F1 Score : 0.046

For k = 10:
--------------------
Precision: 0.051
Recall   : 0.049
F1 Score : 0.046
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


W0000 00:00:1738059468.633384   96407 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...



Evaluation Results for Hybrid ML Method:
- Traditional Features (40%)
- Matrix Factorization/SVD (30%)
- Neural Network Embeddings (20%)
- K-Means Clustering (10%)

For k = 3:
--------------------
Precision: 0.051
Recall   : 0.026
F1 Score : 0.035

For k = 5:
--------------------
Precision: 0.051
Recall   : 0.026
F1 Score : 0.035

For k = 10:
--------------------
Precision: 0.051
Recall   : 0.026
F1 Score : 0.035

Example Recommendations:

Customer C0001:
Cosine Similarity with Feature Weights:
  - C0181 (similarity: 0.811)
  - C0069 (similarity: 0.740)
  - C0077 (similarity: 0.717)

Ensemble of ML Models:
  - C0051 (similarity: 0.919)
  - C0184 (similarity: 0.818)
  - C0194 (similarity: 0.739)

Customer C0002:
Cosine Similarity with Feature Weights:
  - C0097 (similarity: 0.814)
  - C0128 (similarity: 0.798)
  - C0076 (similarity: 0.736)

Ensemble of ML Models:
  - C0138 (similarity: 0.848)
  - C0117 (similarity: 0.777)
  - C0079 (similarity: 0.776)

Customer C0003:
Cosine Similarity