In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
def load_data():
    """Read the three input CSV files.

    The files are addressed by bare relative names, so they are resolved
    against the current working directory.

    Returns:
        tuple: ``(customers, products, transactions)`` DataFrames read from
        ``Customers.csv``, ``Products.csv`` and ``Transactions.csv``
        respectively.
    """
    file_names = ("Customers.csv", "Products.csv", "Transactions.csv")
    # Read in the same order the frames are returned.
    customers, products, transactions = [pd.read_csv(name) for name in file_names]
    return customers, products, transactions

In [13]:
def preprocess_data(customers, products, transactions):
    """Parse the date columns and join transactions with product and customer data.

    The caller's DataFrames are not modified: the date columns are converted
    on new frames produced by ``assign`` rather than overwritten in place, so
    the function can be re-run on the same inputs with the same result.

    Args:
        customers: DataFrame containing ``CustomerID`` and ``SignupDate``.
        products: DataFrame containing ``ProductID``.
        transactions: DataFrame containing ``CustomerID``, ``ProductID`` and
            ``TransactionDate``.

    Returns:
        DataFrame of transactions left-joined with ``products`` on
        ``ProductID`` and then with ``customers`` on ``CustomerID``. Because
        both joins are left joins, every transaction row is kept even when
        it has no matching product or customer.
    """
    # assign() returns a new frame and keeps the column in its original position.
    customers_parsed = customers.assign(
        SignupDate=pd.to_datetime(customers['SignupDate'])
    )
    transactions_parsed = transactions.assign(
        TransactionDate=pd.to_datetime(transactions['TransactionDate'])
    )

    merged = transactions_parsed.merge(products, on='ProductID', how='left')
    merged = merged.merge(customers_parsed, on='CustomerID', how='left')

    return merged

In [15]:
def create_features(merged):
    """Build one feature row per customer from the merged transaction table.

    Per ``CustomerID`` the following are computed: the count of
    ``TransactionID``, the sum and mean of ``TotalValue``, the most frequent
    ``Category`` (``'Unknown'`` when the group has no mode) and the first
    ``Region`` value. The two categorical results are then one-hot encoded
    with the first level of each dropped.

    Args:
        merged: DataFrame containing ``CustomerID``, ``TransactionID``,
            ``TotalValue``, ``Category`` and ``Region``.

    Returns:
        DataFrame with ``CustomerID`` followed by the numeric aggregates and
        the dummy columns for ``region`` and ``most_frequent_category``.
    """
    def _top_category(categories):
        # mode() is empty when the group holds no non-null values.
        modes = categories.mode()
        return 'Unknown' if modes.empty else modes[0]

    # output column -> (source column, aggregation)
    aggregations = {
        'total_transactions': ('TransactionID', 'count'),
        'total_spending': ('TotalValue', 'sum'),
        'avg_spending': ('TotalValue', 'mean'),
        'most_frequent_category': ('Category', _top_category),
        'region': ('Region', 'first'),
    }
    per_customer = merged.groupby('CustomerID').agg(**aggregations).reset_index()

    encoded = pd.get_dummies(
        per_customer,
        columns=['region', 'most_frequent_category'],
        drop_first=True,
    )
    return encoded

In [17]:
def calculate_similarity(features):
    """Compute pairwise cosine similarity between the rows of ``features``.

    The first column is skipped (the caller in this file passes a frame whose
    first column is ``CustomerID``); every remaining column is standardized
    with ``StandardScaler`` before the similarity is taken.

    Args:
        features: DataFrame whose columns from position 1 onward are the
            feature values to compare.

    Returns:
        The matrix returned by ``cosine_similarity`` for the standardized
        rows; entry ``[i, j]`` compares row ``i`` with row ``j``.
    """
    feature_values = features.iloc[:, 1:]  # drop the leading identifier column
    standardized = StandardScaler().fit_transform(feature_values)
    return cosine_similarity(standardized)

In [19]:
def generate_recommendations(similarity_matrix, customer_ids, target_ids, top_n=3):
    """Pick the most similar other customers for each target customer.

    Each target's row in ``similarity_matrix`` is located through its
    position in ``customer_ids`` (not its position in ``target_ids``), so
    ``target_ids`` may be any subset of ``customer_ids`` in any order. The
    target itself is removed by index rather than by assuming it sorts first,
    which keeps it out of the result even when another customer ties with it
    at the top score.

    Args:
        similarity_matrix: Square array-like where entry ``[i, j]`` is the
            similarity between ``customer_ids[i]`` and ``customer_ids[j]``.
        customer_ids: Sequence of IDs labelling the rows/columns of
            ``similarity_matrix``.
        target_ids: IDs to generate recommendations for.
        top_n: Maximum number of lookalikes per target (default 3).

    Returns:
        dict mapping each target ID to a list of ``(customer_id, score)``
        tuples ordered by descending score; ties are ordered by position in
        ``customer_ids``. The list is shorter than ``top_n`` when fewer other
        customers exist.

    Raises:
        KeyError: If a target ID does not appear in ``customer_ids``.
    """
    similarity_matrix = np.asarray(similarity_matrix)
    row_of = {customer_id: row for row, customer_id in enumerate(customer_ids)}
    recommendations = {}

    for target_id in target_ids:
        if target_id not in row_of:
            raise KeyError(f"Target customer {target_id!r} not found in customer_ids")
        target_row = row_of[target_id]
        similarities = similarity_matrix[target_row]

        # Stable sort on the negated scores: descending order, deterministic ties.
        ranked = np.argsort(-similarities, kind="stable")
        # Drop the target explicitly; it is not guaranteed to be ranked first.
        lookalike_rows = ranked[ranked != target_row][:top_n]

        recommendations[target_id] = [
            (customer_ids[row], similarities[row]) for row in lookalike_rows
        ]

    return recommendations

In [21]:
def main():
    """Run the lookalike pipeline end to end and write the result to CSV.

    Steps: load the CSV inputs, merge them, build per-customer features,
    compute the customer-to-customer similarity matrix, generate
    recommendations for the first 20 customer IDs in the feature table, and
    write them to ``Sadvik_Chinthala_Lookalike.csv``.

    Each output row holds a ``CustomerID`` and a ``Recommendations`` list of
    ``{"LookalikeID": ..., "Score": ...}`` dicts. Scores are converted to
    built-in ``float`` before writing: the list is serialized through its
    ``repr``, and NumPy scalar reprs (``np.float64(...)`` on NumPy >= 2)
    would otherwise leak into the CSV text.
    """
    customers, products, transactions = load_data()
    merged = preprocess_data(customers, products, transactions)
    customer_features = create_features(merged)

    customer_ids = customer_features['CustomerID'].tolist()
    similarity_matrix = calculate_similarity(customer_features)

    # First 20 customer IDs in feature-table order (expected to be C0001 - C0020).
    target_ids = customer_ids[:20]
    recommendations = generate_recommendations(similarity_matrix, customer_ids, target_ids)

    lookalike_df = pd.DataFrame({
        "CustomerID": target_ids,
        "Recommendations": [
            [
                # float() keeps the serialized score a plain number.
                {"LookalikeID": lookalike_id, "Score": float(score)}
                for lookalike_id, score in recommendations[customer_id]
            ] for customer_id in target_ids
        ]
    })
    lookalike_df.to_csv("Sadvik_Chinthala_Lookalike.csv", index=False)
    

In [23]:
# Entry point: run the full lookalike pipeline only when this code is executed
# directly; the guard keeps an import of this code from triggering a run.
if __name__ == "__main__":
    main()