In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables from CSV files in the working directory
customer_data = pd.read_csv('Customers.csv')
product_data = pd.read_csv('Products.csv')
transaction_data = pd.read_csv('Transactions.csv')

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
merged_data = pd.merge(transaction_data, customer_data, on='CustomerID')
merged_data = pd.merge(merged_data, product_data, on='ProductID')

# Feature Engineering: Create customer-level features
def _most_frequent(values):
    """Return the most frequent non-null value in *values*.

    Returns None when *values* has no non-null entries, instead of raising
    on the empty mode. On ties, the first mode reported by pandas is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),  # Total amount spent by each customer
    total_transactions=('TransactionID', 'count'),  # Number of transactions per customer
    avg_transaction_value=('TotalValue', 'mean'),  # Average transaction value
    most_common_region=('Region', _most_frequent),  # Most frequent region
    preferred_category=('Category', _most_frequent),  # Most preferred product category
).reset_index()

# Convert categorical features to one-hot encoding.
# dtype=float keeps the whole feature table numeric (recent pandas versions
# default to bool dummies), so it can be fed to the similarity computation
# without relying on implicit type coercion. A customer whose category value
# is missing simply gets all-zero indicator columns for that category.
customer_features = pd.get_dummies(
    customer_features,
    columns=['most_common_region', 'preferred_category'],
    dtype=float,
)

# Standardize the numeric spending features in place so that no single
# raw magnitude dominates the similarity computation
numerical_columns = ['total_spent', 'total_transactions', 'avg_transaction_value']
scaler = StandardScaler()
customer_features[numerical_columns] = scaler.fit_transform(customer_features[numerical_columns])

# Pairwise cosine similarity over every feature column except the identifier,
# labelled by CustomerID on both axes for lookup by customer
similarity_matrix = cosine_similarity(customer_features.drop('CustomerID', axis=1))
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Identify top 3 lookalike customers for the first 20 customers
lookalike_results = {}

for customer_id in customer_features['CustomerID'][:20]:  # Focus on the first 20 customers
    # Remove the customer's own entry by label rather than by sorted position:
    # another customer with an identical feature vector also scores 1.0, so
    # "skip the first row after sorting" could keep the customer itself and
    # discard a genuine match.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id)
    similar_customers = candidate_scores.sort_values(ascending=False).head(3)  # Top 3 most similar
    # Cast scores to built-in floats so that the string form of each
    # (customer, score) pair does not depend on the installed NumPy version.
    lookalike_results[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Tabulate the results: one row per customer, with the list of
# (lookalike, score) pairs rendered as a single string
lookalike_df = pd.DataFrame(
    [(customer, str(matches)) for customer, matches in lookalike_results.items()],
    columns=['CustomerID', 'Top3_Lookalikes'],
)

# Persist the table without the row index
lookalike_df.to_csv('Customer_Lookalikes.csv', index=False)

# Show the table in the cell output
print(lookalike_df)

# Final expression of the cell: renders a clickable link to the saved CSV
from IPython.display import FileLink
FileLink(r'Customer_Lookalikes.csv')

   CustomerID                                    Top3_Lookalikes
0       C0001  [('C0190', 0.968215451295126), ('C0048', 0.941...
1       C0002  [('C0088', 0.9663574397998078), ('C0134', 0.94...
2       C0003  [('C0052', 0.9847977904024423), ('C0152', 0.92...
3       C0004  [('C0165', 0.9711437718179058), ('C0155', 0.96...
4       C0005  [('C0186', 0.9787905419345103), ('C0146', 0.95...
5       C0006  [('C0168', 0.9732537429499296), ('C0171', 0.95...
6       C0007  [('C0140', 0.9764156563035169), ('C0115', 0.93...
7       C0008  [('C0109', 0.8700104206236068), ('C0139', 0.81...
8       C0009  [('C0010', 0.9760669630706748), ('C0198', 0.95...
9       C0010  [('C0009', 0.9760669630706748), ('C0111', 0.97...
10      C0011  [('C0137', 0.9611944701177748), ('C0169', 0.92...
11      C0012  [('C0104', 0.9659896786844346), ('C0113', 0.92...
12      C0013  [('C0099', 0.9855644363688847), ('C0108', 0.91...
13      C0014  [('C0060', 0.9763044912298495), ('C0151', 0.90...
14      C0015  [('C0036',