# In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three source tables from the working directory
customers, products, transactions = (
    pd.read_csv(path)
    for path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach customer attributes, then product attributes, to every transaction row
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

# Feature Engineering
# Each per-customer aggregate is built as its own two-column frame
# (CustomerID + one feature) and joined onto the customer table at the end.

# Total spend: sum of TotalValue over all rows belonging to the customer
customer_spending = (
    data.groupby('CustomerID')['TotalValue']
    .sum()
    .reset_index()
    .rename(columns={'TotalValue': 'TotalSpending'})
)

# Transaction count: number of distinct TransactionID values per customer
customer_transactions = (
    data.groupby('CustomerID')['TransactionID']
    .nunique()
    .reset_index()
    .rename(columns={'TransactionID': 'TransactionCount'})
)

# Average value: mean of TotalValue over the customer's rows
customer_avg_transaction = (
    data.groupby('CustomerID')['TotalValue']
    .mean()
    .reset_index()
    .rename(columns={'TotalValue': 'AvgTransactionValue'})
)

# Preferred category: the category with the largest summed Quantity.
# idxmax picks the first row holding the maximum, so ties go to the
# category that appears first in the grouped frame.
customer_category = data.groupby(['CustomerID', 'Category'])['Quantity'].sum().reset_index()
customer_category = customer_category.loc[
    customer_category.groupby('CustomerID')['Quantity'].idxmax()
].rename(columns={'Category': 'PreferredCategory'})

# Join every feature onto the customer table. merge() defaults to an inner
# join, so only customers present in every feature frame are kept.
customer_features = (
    customers
    .merge(customer_spending, on='CustomerID')
    .merge(customer_transactions, on='CustomerID')
    .merge(customer_avg_transaction, on='CustomerID')
    .merge(customer_category[['CustomerID', 'PreferredCategory']], on='CustomerID')
)

# Encode categorical variables
# Region and PreferredCategory are one-hot encoded into dense indicator columns.
# The same encoder object is refit for each column, so once this section has
# run it only holds the categories learned from PreferredCategory.
encoder = OneHotEncoder(sparse_output=False)
encoded_region = pd.DataFrame(
    encoder.fit_transform(customer_features[['Region']]),
    columns=encoder.get_feature_names_out(['Region']),
    # Reuse the source index: concat(axis=1) aligns rows by index label, and a
    # fresh 0..n-1 index would misalign if customer_features were ever
    # filtered or re-indexed before this point.
    index=customer_features.index,
)
encoded_category = pd.DataFrame(
    encoder.fit_transform(customer_features[['PreferredCategory']]),
    columns=encoder.get_feature_names_out(['PreferredCategory']),
    index=customer_features.index,
)

# Combine encoded features with numerical features, then remove the raw
# text/date columns that cannot be fed to the scaler.
customer_features = pd.concat([customer_features, encoded_region, encoded_category], axis=1)
customer_features = customer_features.drop(
    columns=['CustomerName', 'SignupDate', 'Region', 'PreferredCategory']
)

# Normalize features
# Every column except the CustomerID label is standardized, including the
# one-hot indicator columns.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Pairwise cosine similarity between the scaled feature vectors of all customers
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so a score can be looked up by customer pair
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Function to get top N similar customers
def get_top_n_similar(customers_df, target_customer_id, n=3):
    """Return the ``n`` customers most similar to ``target_customer_id``.

    Args:
        customers_df: Similarity DataFrame whose row labels and column labels
            are both customer IDs; cell ``[a, b]`` is the score between them.
        target_customer_id: ID of the customer to find lookalikes for.
        n: Maximum number of lookalikes to return.

    Returns:
        A ``(customer_ids, scores)`` tuple of two equal-length lists ordered
        by descending score. Both lists are empty when the target is not a
        label on both axes, so callers can always unpack two values.
    """
    # The score lookup selects a column and then drops the target's own row,
    # so the ID has to be present on both axes.
    if (
        target_customer_id not in customers_df.index
        or target_customer_id not in customers_df.columns
    ):
        return [], []

    # Exclude the customer's similarity with themselves before ranking.
    similarity_scores = customers_df[target_customer_id].drop(target_customer_id)
    top_n_customers = similarity_scores.nlargest(n)
    return list(top_n_customers.index), list(top_n_customers.values)

# Generate Lookalike.csv
# For each of the first 20 customers, store the (customer_id, score) pairs
# returned by get_top_n_similar, keyed by the customer they belong to.
lookalike_dict = {}
for customer_id in customer_features['CustomerID'].iloc[:20]:
    similar_customers, scores = get_top_n_similar(similarity_df, customer_id)
    lookalike_dict[customer_id] = [*zip(similar_customers, scores)]

# One row per customer, one column per lookalike; each cell holds an
# (id, score) tuple. The dict keys become the CustomerID index column.
lookalike_df = pd.DataFrame.from_dict(
    lookalike_dict,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
)
lookalike_df.to_csv('Lookalike.csv', index_label='CustomerID')
