**TASK 2: Lookalike Model**

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from datetime import datetime

In [2]:
# Function to load and merge datasets
def load_and_merge_data(customers_path='Customers.csv',
                        products_path='Products.csv',
                        transactions_path='Transactions.csv'):
    """Load the customer, product and transaction CSVs and merge them.

    Parameters
    ----------
    customers_path, products_path, transactions_path : str or path-like
        Locations of the three CSV files. The defaults are the file names
        the notebook originally hard-coded, resolved against the current
        working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(transactions, customers)`` where ``transactions`` is the
        transaction table enriched with the matching customer and product
        columns, and ``customers`` is the customer table exactly as read.
    """
    # Load datasets
    customers = pd.read_csv(customers_path)
    products = pd.read_csv(products_path)
    transactions = pd.read_csv(transactions_path)

    # Left merges keep every transaction row; a CustomerID/ProductID with no
    # match in the lookup table simply gets NaN in the added columns.
    transactions = (
        transactions
        .merge(customers, on='CustomerID', how='left')
        .merge(products, on='ProductID', how='left')
    )

    return transactions, customers

In [3]:
# Function to perform feature engineering and generate customer profiles
def create_customer_profiles(transactions, customers, reference_date=None):
    """Aggregate transaction data into one feature row per customer.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transaction rows. The columns read here are 'CustomerID',
        'TransactionID', 'TotalValue', 'Quantity', 'TransactionDate'
        and 'Category'.
    customers : pandas.DataFrame
        Customer rows; 'CustomerID', 'Region' and 'SignupDate' are read.
    reference_date : datetime-like or str, optional
        The instant against which 'DaysSinceLastPurchase' is measured.
        Defaults to the current time (the original behaviour). Pass a fixed
        date to make the profiles reproducible from run to run.

    Returns
    -------
    pandas.DataFrame
        One row per CustomerID present in ``transactions`` with spending
        aggregates, one 'Category_<name>_Spending' column per category,
        'SignupDate', and one-hot 'Region_*' columns (first level dropped).
    """
    # Resolve the reference instant once so every customer is measured
    # against the same moment (instead of a fresh now() per group).
    as_of = pd.Timestamp(datetime.now() if reference_date is None else reference_date)

    # Parse the dates a single time, vectorised, rather than once per group.
    days_since_purchase = (as_of - pd.to_datetime(transactions['TransactionDate'])).dt.days
    # assign() returns a copy, so the caller's frame is left untouched.
    enriched = transactions.assign(_DaysSincePurchase=days_since_purchase)

    # Aggregate transaction data for customer profiles
    customer_profiles = enriched.groupby('CustomerID').agg(
        TotalSpending=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        AvgTransactionValue=('TotalValue', 'mean'),
        AvgQuantity=('Quantity', 'mean'),
        # The smallest day gap corresponds to the most recent purchase.
        DaysSinceLastPurchase=('_DaysSincePurchase', 'min'),
    ).reset_index()

    # Add product category preferences as additional features:
    # spending per (customer, category), pivoted wide with 0 for categories
    # a customer never bought.
    category_pref = (
        transactions.groupby(['CustomerID', 'Category'])
        .agg(Spending=('TotalValue', 'sum'))
        .unstack(fill_value=0)
    )
    # Columns are ('Spending', <category>) tuples after unstack; flatten them.
    category_pref.columns = [f'Category_{col[1]}_Spending' for col in category_pref.columns]
    # 'CustomerID' is a column on the left and the index name on the right.
    customer_profiles = customer_profiles.merge(category_pref, on='CustomerID', how='left')

    # Add region and signup date to profiles
    customer_profiles = customer_profiles.merge(
        customers[['CustomerID', 'Region', 'SignupDate']],
        on='CustomerID',
        how='left'
    )

    # One-hot encode categorical features like 'Region'
    customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=True)

    return customer_profiles


In [4]:

# Function to normalize and scale the customer profile features
def normalize_features(customer_profiles):
    """Standardise the profile feature columns.

    The identifier and signup-date columns are removed first; everything
    that remains is passed through a freshly fitted StandardScaler.

    Returns a pair ``(scaled, feature_frame)``: the scaler's output and the
    unscaled feature DataFrame it was fitted on.
    """
    # Columns that are not model features
    non_feature_columns = ['CustomerID', 'SignupDate']
    feature_frame = customer_profiles.drop(columns=non_feature_columns)

    # Fit and apply the scaler in one step
    scaled = StandardScaler().fit_transform(feature_frame)

    return scaled, feature_frame

In [5]:
# Function to perform KMeans clustering
def perform_clustering(normalized_features, num_clusters=5):
    """Segment customers with KMeans.

    Builds a KMeans model with ``num_clusters`` clusters and a fixed
    ``random_state`` of 42, fits it on ``normalized_features`` and returns
    ``(labels, model)`` — the per-row cluster assignment and the fitted model.
    """
    model = KMeans(n_clusters=num_clusters, random_state=42)
    labels = model.fit_predict(normalized_features)
    return labels, model

In [6]:
# Function to calculate similarity scores within clusters
def calculate_similarity(customer_profiles, normalized_features, cluster_labels, top_n=3):
    """Find each customer's most similar peers inside their own cluster.

    Parameters
    ----------
    customer_profiles : pandas.DataFrame
        Must contain a 'CustomerID' column, row-aligned with
        ``normalized_features`` and ``cluster_labels``.
    normalized_features : array-like of shape (n_customers, n_features)
        Feature matrix the cosine similarity is computed on.
    cluster_labels : array-like of length n_customers
        Cluster assignment per row. This is the single source of truth for
        cluster membership; a 'Cluster' column on ``customer_profiles`` is
        not required.
    top_n : int, default 3
        Maximum number of lookalikes to return per customer.

    Returns
    -------
    dict
        ``{customer_id: [(other_id, score), ...]}`` sorted by descending
        cosine similarity, scores rounded to 4 decimals as plain floats.
        A customer in a cluster with fewer than ``top_n + 1`` members gets
        a shorter (possibly empty) list.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    cluster_labels = np.asarray(cluster_labels)
    normalized_features = np.asarray(normalized_features)
    customer_ids = customer_profiles['CustomerID'].to_numpy()

    if not (len(customer_ids) == len(cluster_labels) == len(normalized_features)):
        raise ValueError(
            "customer_profiles, normalized_features and cluster_labels "
            "must have the same number of rows"
        )

    similarity_results = {}

    for cluster in np.unique(cluster_labels):
        # Use one mask for both IDs and features so they cannot drift apart.
        in_cluster = cluster_labels == cluster
        cluster_ids = customer_ids[in_cluster]
        cluster_features = normalized_features[in_cluster]

        # Calculate cosine similarity within the cluster
        similarity_matrix = cosine_similarity(cluster_features)

        for idx, customer_id in enumerate(cluster_ids):
            scores = similarity_matrix[idx]
            # Exclude the customer by position in the cluster, not by assuming
            # it sorts first: a tie at 1.0 with an earlier row (or float
            # round-off) would otherwise list the customer as its own lookalike.
            candidates = [j for j in range(len(cluster_ids)) if j != idx]
            candidates.sort(key=lambda j: scores[j], reverse=True)
            similarity_results[customer_id] = [
                # float() keeps numpy scalar reprs out of the exported CSV.
                (cluster_ids[j], round(float(scores[j]), 4))
                for j in candidates[:top_n]
            ]

    return similarity_results


In [7]:
# Function to generate the Lookalike.csv file
def generate_lookalike_csv(similarity_results, output_path='Shrinath_Asati_Lookalike.csv'):
    """Write the lookalike mapping to a CSV file and return it as a DataFrame.

    Parameters
    ----------
    similarity_results : dict
        Mapping of customer id to its list of ``(other_id, score)`` pairs.
    output_path : str or path-like, default 'Shrinath_Asati_Lookalike.csv'
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'CustomerID' and 'Lookalikes', one row per key of
        ``similarity_results`` in iteration order. The columns are present
        even when ``similarity_results`` is empty.
    """
    lookalike_data = [
        {'CustomerID': customer_id, 'Lookalikes': lookalikes}
        for customer_id, lookalikes in similarity_results.items()
    ]

    # Pin the columns so an empty mapping still yields a well-formed frame
    # (and a CSV with a header) instead of a column-less DataFrame.
    lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalikes'])
    lookalike_df.to_csv(output_path, index=False)

    return lookalike_df

In [8]:
# Main function
def main():
    """Run the lookalike pipeline end to end and print a preview.

    Steps: load and merge the CSVs, build per-customer profiles, scale the
    features, cluster, rank similar customers within each cluster, export
    the result via generate_lookalike_csv, then print the rows for
    customers C0001 through C0020.
    """
    # Data loading and profile construction
    merged_transactions, customer_table = load_and_merge_data()
    profiles = create_customer_profiles(merged_transactions, customer_table)

    # Scaling (the unscaled feature frame is not needed here)
    scaled_features, _unscaled = normalize_features(profiles)

    # Clustering; the fitted model itself is not used afterwards
    labels, _model = perform_clustering(scaled_features)
    profiles['Cluster'] = labels

    # Within-cluster similarity ranking and export
    lookalikes = calculate_similarity(profiles, scaled_features, labels)
    lookalike_table = generate_lookalike_csv(lookalikes)

    # Preview only the first 20 customer ids
    preview_ids = [f'C{i:04d}' for i in range(1, 21)]
    print(lookalike_table[lookalike_table['CustomerID'].isin(preview_ids)])

if __name__ == "__main__":
    main()


    CustomerID                                         Lookalikes
0        C0004  [(C0113, 0.8769), (C0012, 0.8374), (C0104, 0.8...
1        C0006  [(C0169, 0.7878), (C0153, 0.6957), (C0082, 0.6...
2        C0012  [(C0113, 0.903), (C0104, 0.8887), (C0163, 0.86...
3        C0013  [(C0099, 0.9512), (C0188, 0.9026), (C0165, 0.7...
47       C0001  [(C0181, 0.8495), (C0120, 0.7808), (C0091, 0.7...
48       C0002  [(C0159, 0.8754), (C0106, 0.8588), (C0178, 0.8...
49       C0003  [(C0091, 0.8063), (C0120, 0.7556), (C0129, 0.7...
50       C0005  [(C0123, 0.8356), (C0186, 0.821), (C0140, 0.80...
51       C0007  [(C0140, 0.7812), (C0005, 0.7771), (C0123, 0.5...
52       C0011  [(C0107, 0.7482), (C0190, 0.7382), (C0191, 0.7...
105      C0014  [(C0097, 0.8422), (C0128, 0.8252), (C0060, 0.7...
106      C0020  [(C0080, 0.8264), (C0110, 0.8176), (C0144, 0.8...
119      C0009  [(C0198, 0.9508), (C0119, 0.923), (C0121, 0.87...
120      C0010  [(C0111, 0.9709), (C0132, 0.8978), (C0062, 0.7...
121      C