In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_and_prepare_data():
    customers_df = pd.read_csv('Downloads/Customers.csv')
    products_df = pd.read_csv('Downloads/Products.csv')
    transactions_df = pd.read_csv('Downloads/Transactions.csv')
    
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
    
    return customers_df, products_df, transactions_df

In [3]:
def create_customer_features(customers_df, transactions_df, products_df):
    # Customer transaction features
    customer_stats = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',  # Number of transactions
        'TotalValue': ['sum', 'mean', 'std'],  # Transaction value metrics
        'Quantity': ['sum', 'mean']  # Purchase quantity metrics
    }).fillna(0)
    
    customer_stats.columns = ['transaction_count', 'total_spend', 'avg_transaction', 
                            'std_transaction', 'total_items', 'avg_items']
    
    latest_date = customers_df['SignupDate'].max()
    customers_df['days_since_signup'] = (latest_date - customers_df['SignupDate']).dt.days
    
    region_dummies = pd.get_dummies(customers_df['Region'], prefix='region')
    
    features_df = pd.concat([
        customer_stats,
        customers_df.set_index('CustomerID')[['days_since_signup']],
        region_dummies
    ], axis=1).fillna(0)
    
    return features_df

In [4]:
def calculate_similarity_scores(features_df):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_df)
    
    similarity_matrix = cosine_similarity(features_scaled)
    
    return pd.DataFrame(
        similarity_matrix, 
        index=features_df.index, 
        columns=features_df.index
    )

In [5]:
def get_top_lookalikes(customer_id, similarity_df, n=3):
    customer_similarities = similarity_df[customer_id].sort_values(ascending=False)
    
    customer_similarities = customer_similarities[customer_similarities.index != customer_id]
    top_similar = customer_similarities.head(n)
    return top_similar

In [6]:
def create_lookalike_recommendations(start_id=1, end_id=20):
    customers_df, products_df, transactions_df = load_and_prepare_data()
    features_df = create_customer_features(customers_df, transactions_df, products_df)
    similarity_df = calculate_similarity_scores(features_df)
    
    recommendations = {}
    for i in range(start_id, end_id + 1):
        customer_id = f'C{i:04d}'
        if customer_id in similarity_df.index:
            top_similar = get_top_lookalikes(customer_id, similarity_df)
            recommendations[customer_id] = [
                {
                    'similar_customer': similar_id,
                    'similarity_score': round(score, 4)
                }
                for similar_id, score in top_similar.items()
            ]
    
    return recommendations

In [7]:
def save_recommendations_to_csv(recommendations):
    customer_ids = []
    similar_customers = []
    similarity_scores = []
    
    for customer_id, similars in recommendations.items():
        for rec in similars:
            customer_ids.append(customer_id)
            similar_customers.append(rec['similar_customer'])
            similarity_scores.append(rec['similarity_score'])
    
    results_df = pd.DataFrame({
        'CustomerID': customer_ids,
        'SimilarCustomerID': similar_customers,
        'SimilarityScore': similarity_scores
    })
    results_df.to_csv('Lookalike.csv', index=False)
    return results_df

In [8]:
recommendations = create_lookalike_recommendations(1, 20)
results_df = save_recommendations_to_csv(recommendations)

print("\nSample Recommendations:")
print(results_df.head(15))


Sample Recommendations:
   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0106           0.9901
1       C0001             C0153           0.9838
2       C0001             C0011           0.9805
3       C0002             C0199           0.9773
4       C0002             C0025           0.9496
5       C0002             C0166           0.9480
6       C0003             C0142           0.9847
7       C0003             C0052           0.9701
8       C0003             C0190           0.9665
9       C0004             C0145           0.9897
10      C0004             C0102           0.9872
11      C0004             C0175           0.9835
12      C0005             C0131           0.9894
13      C0005             C0007           0.9819
14      C0005             C0132           0.9684
