Task 2: Lookalike Model
- Build a Lookalike Model that takes a user's information as input and recommends 3 similar customers based on their profile and transaction history. The model should:
  - Use both customer and product information.
  - Assign a similarity score to each recommended customer.

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Read the three raw input tables (customers, products, transactions)
# from CSV files in the current working directory.
customers_df, products_df, transactions_df = [
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
]


In [9]:
# Parse the date columns so they support datetime arithmetic later on.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [10]:
def create_customer_features(customers=None, products=None, transactions=None,
                             reference_date=None):
    """Create one feature row per customer from profile and transaction data.

    Parameters
    ----------
    customers : pandas.DataFrame, optional
        Must provide ``CustomerID``, ``Region`` and ``SignupDate``.
        Defaults to the notebook-level ``customers_df``.
    products : pandas.DataFrame, optional
        Must provide ``ProductID`` and ``Category``.
        Defaults to the notebook-level ``products_df``.
    transactions : pandas.DataFrame, optional
        Must provide ``TransactionID``, ``CustomerID``, ``ProductID``,
        ``Quantity``, ``TotalValue`` and ``Price``.
        Defaults to the notebook-level ``transactions_df``.
    reference_date : datetime-like, optional
        Date against which ``account_age`` (in days) is measured. When omitted,
        the latest signup/transaction date found in the data is used, so the
        result does not depend on the day the notebook happens to be run.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``CustomerID``, nine transaction aggregates, one
        purchase-count column per product category, ``account_age``, and one
        ``region_*`` indicator per region. Customers without transactions get
        0 for every transaction-derived feature.

    Notes
    -----
    The input frames are never modified.
    """
    # Fall back to the notebook-level frames so zero-argument calls keep working.
    customers = customers_df if customers is None else customers
    products = products_df if products is None else products
    transactions = transactions_df if transactions is None else transactions

    signup_dates = pd.to_datetime(customers['SignupDate'])

    if reference_date is None:
        # Use a data-derived snapshot date instead of the wall clock so that
        # re-running the notebook reproduces the same features.
        observed = [signup_dates.max()]
        if 'TransactionDate' in transactions.columns:
            observed.append(pd.to_datetime(transactions['TransactionDate']).max())
        observed = [stamp for stamp in observed if pd.notna(stamp)]
        reference_date = max(observed) if observed else pd.Timestamp.now()
    reference_date = pd.Timestamp(reference_date).normalize()

    # Transaction-based features. Named aggregation ties every output column
    # to its source column and statistic, so no positional renaming is needed.
    transaction_features = transactions.groupby('CustomerID').agg(
        total_transactions=('TransactionID', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        total_spend=('TotalValue', 'sum'),
        avg_spend=('TotalValue', 'mean'),
        std_spend=('TotalValue', 'std'),  # NaN for a single purchase; zero-filled below
        avg_price=('Price', 'mean'),
        min_price=('Price', 'min'),
        max_price=('Price', 'max'),
    ).round(2)

    # Product category preferences: number of purchases per category.
    category_preferences = (
        transactions
        .merge(products[['ProductID', 'Category']], on='ProductID')
        .groupby(['CustomerID', 'Category'])['TransactionID']
        .count()
        .unstack(fill_value=0)
    )

    # Account age in days, computed into a new Series so the caller's
    # customer table is left untouched.
    account_age = (reference_date - signup_dates).dt.days.rename('account_age')

    # One-hot encode region as floats so every feature column is numeric.
    region_dummies = pd.get_dummies(customers['Region'], prefix='region', dtype=float)

    # Profile-level features keyed by CustomerID; all three pieces share the
    # customers index, so the column-wise concat lines them up row for row.
    demographics = pd.concat(
        [customers[['CustomerID']], account_age, region_dummies],
        axis=1,
    )

    # Combine all features, joining on the customer key rather than on
    # positional index alignment.
    customer_profiles = (
        customers[['CustomerID']]
        .merge(transaction_features, left_on='CustomerID', right_index=True, how='left')
        .merge(category_preferences, left_on='CustomerID', right_index=True, how='left')
        .merge(demographics, on='CustomerID', how='left')
    )

    # Customers with no transactions have NaN aggregates; treat them as zero.
    return customer_profiles.fillna(0)

In [11]:
def calculate_similarity_scores(customer_profiles):
    """Build a customer-by-customer cosine similarity table.

    Every column of ``customer_profiles`` except ``CustomerID`` is treated as
    a feature. The features are standardised with ``StandardScaler`` and then
    compared pairwise with cosine similarity.

    Returns a square DataFrame whose rows and columns are both labelled with
    the ``CustomerID`` values, in the order they appear in the input.
    """
    ids = customer_profiles['CustomerID']
    feature_matrix = customer_profiles.drop(columns='CustomerID')

    # Standardise first so large-magnitude features (e.g. total spend)
    # do not dominate the angle between customers.
    standardised = StandardScaler().fit_transform(feature_matrix)
    pairwise = cosine_similarity(standardised)

    return pd.DataFrame(pairwise, index=ids, columns=ids)

In [12]:
def get_top_lookalikes(customer_id, similarity_df, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    The scores are read from the ``customer_id`` column of ``similarity_df``,
    ranked from highest to lowest, and the customer's own entry is excluded.

    Returns a DataFrame with columns ``similar_customer_id`` and
    ``similarity_score``, ordered best match first.
    """
    ranked = similarity_df[customer_id].sort_values(ascending=False)

    # Mask out the customer's own entry before taking the first n rows.
    is_other_customer = ranked.index != customer_id
    nearest = ranked[is_other_customer].iloc[:n]

    return pd.DataFrame({
        'similar_customer_id': nearest.index,
        'similarity_score': nearest.to_numpy(),
    })

In [17]:
# Build the feature table, then the pairwise similarity matrix
print("Creating customer profiles...")
customer_profiles = create_customer_features()
print("Calculating similarity scores...")
similarity_df = calculate_similarity_scores(customer_profiles)

# Look up the top matches for customers C0001 through C0020
print("Generating recommendations...")
target_customer_ids = [f'C{i:04d}' for i in range(1, 21)]
recommendations = {}
for customer_id in target_customer_ids:
    recommendations[customer_id] = get_top_lookalikes(customer_id, similarity_df)


Creating customer profiles...
Calculating similarity scores...
Generating recommendations...
Creating customer profiles...
Calculating similarity scores...
Generating recommendations...


In [18]:
# Assemble the rows of Lookalike.csv: one record per customer, with the
# recommendations serialised as a string of (customer_id, score) tuples
# whose scores are rounded to 4 decimal places.
output_data = [
    {
        'customer_id': customer_id,
        'similar_customers': str(list(zip(
            similar_customers['similar_customer_id'],
            similar_customers['similarity_score'].round(4),
        ))),
    }
    for customer_id, similar_customers in recommendations.items()
]

In [19]:
# Write the recommendations to disk without the DataFrame index
lookalike_table = pd.DataFrame(output_data)
lookalike_table.to_csv('Lookalike.csv', index=False)

# Show the recommendations for customers C0001 through C0005
print("\nSample recommendations for first 5 customers:")
for customer_id in [f'C{i:04d}' for i in range(1, 6)]:
    print(f"\nCustomer {customer_id}:")
    print(recommendations[customer_id])



Sample recommendations for first 5 customers:

Customer C0001:
  similar_customer_id  similarity_score
0               C0192          0.746643
1               C0112          0.743828
2               C0118          0.719128

Customer C0002:
  similar_customer_id  similarity_score
0               C0106          0.822611
1               C0134          0.811088
2               C0043          0.802897

Customer C0003:
  similar_customer_id  similarity_score
0               C0039          0.769776
1               C0190          0.732129
2               C0195          0.728942

Customer C0004:
  similar_customer_id  similarity_score
0               C0113          0.827317
1               C0104          0.743263
2               C0102          0.657385

Customer C0005:
  similar_customer_id  similarity_score
0               C0007          0.864986
1               C0186          0.819754
2               C0128          0.738318


In [25]:
# Download the CSV to the local machine when running inside Google Colab.
# `google.colab` only exists in the Colab runtime, so the import is guarded
# to keep the notebook runnable (Restart & Run All) in any other environment.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('Lookalike.csv')
else:
    print("Not running in Google Colab; 'Lookalike.csv' is saved in the working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
# Read the export back from the same relative path it was written to
# ('Lookalike.csv') instead of the Colab-only absolute path '/content/...',
# so this check also works outside Colab.
df = pd.read_csv('Lookalike.csv')

In [27]:
# Display the reloaded frame as this cell's output.
df

Unnamed: 0,customer_id,similar_customers
0,C0001,"[('C0192', 0.7466), ('C0112', 0.7438), ('C0118..."
1,C0002,"[('C0106', 0.8226), ('C0134', 0.8111), ('C0043..."
2,C0003,"[('C0039', 0.7698), ('C0190', 0.7321), ('C0195..."
3,C0004,"[('C0113', 0.8273), ('C0104', 0.7433), ('C0102..."
4,C0005,"[('C0007', 0.865), ('C0186', 0.8198), ('C0128'..."
5,C0006,"[('C0187', 0.757), ('C0168', 0.717), ('C0171',..."
6,C0007,"[('C0005', 0.865), ('C0040', 0.7834), ('C0115'..."
7,C0008,"[('C0098', 0.7917), ('C0194', 0.7869), ('C0065..."
8,C0009,"[('C0103', 0.7356), ('C0198', 0.72), ('C0119',..."
9,C0010,"[('C0062', 0.8317), ('C0111', 0.7511), ('C0061..."
