In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df_customers = pd.read_csv('Customers.csv')
df_transactions = pd.read_csv('Transactions.csv')
df_products = pd.read_csv('Products.csv')

# Merge data
merged = df_transactions.merge(df_products, on='ProductID').merge(df_customers, on='CustomerID')

# Feature Engineering
# 1. RFM Features
rfm = merged.groupby('CustomerID').agg(
    Recency=('TransactionDate', lambda x: (pd.to_datetime('today') - pd.to_datetime(x).max()).days),
    Frequency=('TransactionID', 'count'),
    Monetary=('TotalValue', 'sum')
).reset_index()

# 2. Product Preferences
category_spend = pd.pivot_table(
    merged,
    index='CustomerID',
    columns='Category',
    values='TotalValue',
    aggfunc='sum',
    fill_value=0
)

# 3. Customer Profile Features
profile_features = df_customers.copy()
profile_features['Tenure'] = (pd.to_datetime('today') - pd.to_datetime(profile_features['SignupDate'])).dt.days
profile_features = pd.get_dummies(profile_features[['CustomerID', 'Region', 'Tenure']], columns=['Region'])

# Combine all features
features = rfm.merge(category_spend, on='CustomerID').merge(profile_features, on='CustomerID')
features = features.set_index('CustomerID')

# Handle missing values
features = features.fillna(0)

# Scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [5]:
# Compute similarity matrix
similarity_matrix = cosine_similarity(scaled_features)

# Create similarity DataFrame
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=features.index,
    columns=features.index
)

# Generate recommendations
results = {}
target_customers = [f'C{str(i+1).zfill(4)}' for i in range(20)]

for cust_id in target_customers:
    # Get similarities for current customer
    similarities = similarity_df.loc[cust_id].sort_values(ascending=False)
    
    # Remove self and get top 3
    top_matches = similarities.drop(cust_id).head(3).reset_index()
    top_matches.columns = ['SimilarCustomer', 'Score']
    
    # Store results as list of tuples
    results[cust_id] = list(top_matches.itertuples(index=False, name=None))

In [8]:
# Generate recommendations
results = {}
target_customers = [f'C{str(i+1).zfill(4)}' for i in range(20)]

for cust_id in target_customers:
    similarities = similarity_df.loc[cust_id].sort_values(ascending=False)
    top_matches = similarities.drop(cust_id).head(3).reset_index()
    results[cust_id] = list(zip(top_matches['CustomerID'], top_matches[cust_id]))

# Create final output
output = pd.DataFrame({
    'CustomerID': results.keys(),
    'SimilarCustomers': [
        ";".join([f"{cust},{score:.4f}" for cust, score in matches]) 
        for matches in results.values()
    ]
})

# Expand into separate columns
output[['Match1', 'Match2', 'Match3']] = pd.DataFrame(
    output['SimilarCustomers'].apply(
        lambda x: [tuple(item.split(',')) for item in x.split(';')]
    ).tolist()
)

output.to_csv('Lookalike.csv', index=False)

In [10]:
# Validate recommendations
def print_recommendations(customer_id):
    try:
        # Get customer features
        customer_features = features.loc[customer_id]
        
        print(f"\nRecommendations for {customer_id}:")
        print("Core Features:")
        print(f"Frequency: {customer_features['Frequency']}")
        print(f"Monetary: ${customer_features['Monetary']:.2f}")
        print(f"Recency: {customer_features['Recency']} days")
        
        # Find category columns dynamically
        category_cols = [col for col in features.columns if col in df_products['Category'].unique()]
        top_categories = customer_features[category_cols].sort_values(ascending=False).head(3)
        print("\nTop Categories:")
        for category, value in top_categories.items():
            print(f"{category}: ${value:.2f}")
        
        # Print matches
        print("\nTop Matches:")
        for match, score in results.get(customer_id, []):
            try:
                match_features = features.loc[match]
                print(f"\n→ {match} (Score: {score:.3f})")
                print(f"Frequency: {match_features['Frequency']}")
                print(f"Monetary: ${match_features['Monetary']:.2f}")
                print(f"Recency: {match_features['Recency']} days")
                
                # Compare categories
                match_top_cats = match_features[category_cols].sort_values(ascending=False).head(3)
                print("Top Categories:")
                for category, value in match_top_cats.items():
                    print(f"{category}: ${value:.2f}")
                    
            except KeyError:
                print(f"→ {match} not found in features data")
                
    except KeyError:
        print(f"Customer {customer_id} not found in the dataset")

# Example validation for C0001
print_recommendations('C0001')


Recommendations for C0001:
Core Features:
Frequency: 5
Monetary: $3354.52
Recency: 86 days

Top Categories:
Electronics: $2827.30
Home Decor: $412.62
Books: $114.60

Top Matches:

→ C0091 (Score: 0.881)
Frequency: 6
Monetary: $3137.66
Recency: 167 days
Top Categories:
Electronics: $2489.62
Home Decor: $382.76
Clothing: $265.28

→ C0184 (Score: 0.848)
Frequency: 7
Monetary: $3393.18
Recency: 141 days
Top Categories:
Electronics: $1898.74
Clothing: $580.34
Books: $459.57

→ C0120 (Score: 0.846)
Frequency: 3
Monetary: $2470.02
Recency: 172 days
Top Categories:
Electronics: $2470.02
Books: $0.00
Clothing: $0.00
