In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Step 1: Load the data
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Step 2: Data Preprocessing
# Merge datasets to include product and customer details in transactions
transactions = transactions.merge(products, on='ProductID').merge(customers, on='CustomerID')

# Aggregate transaction data to create features for each customer
customer_features = transactions.groupby('CustomerID').agg(
    TotalTransactions=('TransactionID', 'count'),
    TotalQuantity=('Quantity', 'sum'),
    TotalSpent=('TotalValue', 'sum'),
    AvgTransactionValue=('TotalValue', 'mean'),
    DistinctProducts=('ProductID', 'nunique')
).reset_index()

# Merge with customer profile data
customer_features = customer_features.merge(customers, on='CustomerID')

# Encode categorical variables
customer_features['Region'] = customer_features['Region'].astype('category').cat.codes

# Step 3: Normalize features
numerical_columns = ['TotalTransactions', 'TotalQuantity', 'TotalSpent', 'AvgTransactionValue', 'DistinctProducts', 'Region']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[numerical_columns])

# Step 4: Compute Similarity Matrix
similarity_matrix = cosine_similarity(scaled_features)

# Step 5: Recommend Top 3 Lookalike Customers for Each of the First 20 Customers
lookalike_results = {}
for idx, customer_id in enumerate(customer_features['CustomerID'][:20]):
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(similarity_matrix[idx]))

    # Sort by similarity score in descending order and exclude the customer itself
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = [(customer_features['CustomerID'][i], score) for i, score in similarity_scores if customer_features['CustomerID'][i] != customer_id]

    # Get the top 3 most similar customers
    lookalike_results[customer_id] = similarity_scores[:3]

# Step 6: Save Results to CSV
output = []
for customer_id, recommendations in lookalike_results.items():
    output.append({
        'CustomerID': customer_id,
        'Lookalikes': recommendations
    })

# Create a DataFrame and save
lookalike_df = pd.DataFrame(output)
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Print the results for validation
print(lookalike_df)

   CustomerID                                         Lookalikes
0       C0001  [(C0137, 0.9917546282770221), (C0107, 0.976682...
1       C0002  [(C0142, 0.9789967016690172), (C0043, 0.956983...
2       C0003  [(C0133, 0.9356636533070146), (C0052, 0.876787...
3       C0004  [(C0113, 0.9825009970124268), (C0102, 0.963342...
4       C0005  [(C0159, 0.9975056636669825), (C0186, 0.985127...
5       C0006  [(C0168, 0.974173291850688), (C0158, 0.9336368...
6       C0007  [(C0135, 0.9853800852512515), (C0140, 0.978872...
7       C0008  [(C0194, 0.9704984907472344), (C0024, 0.958848...
8       C0009  [(C0060, 0.9755107996536551), (C0199, 0.974904...
9       C0010  [(C0121, 0.9553284052315265), (C0199, 0.938264...
10      C0011  [(C0107, 0.9882025194085697), (C0048, 0.988012...
11      C0012  [(C0102, 0.9812343678594674), (C0155, 0.965043...
12      C0013  [(C0099, 0.9844376604649439), (C0188, 0.983575...
13      C0014  [(C0198, 0.9920357736215644), (C0060, 0.979709...
14      C0015  [(C0131, 0