In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load customer and product data
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")

# Sample transactions (replace with actual transaction data if available)
transactions_df = pd.DataFrame({
    'CustomerID': ['C0001', 'C0001', 'C0002', 'C0002', 'C0003'],
    'ProductID': ['P001', 'P002', 'P001', 'P003', 'P004'],
    'Quantity': [2, 1, 1, 3, 1],
    'TotalAmount': [20, 15, 25, 45, 30]
})

# Merge transactions with product details
merged_data = pd.merge(transactions_df, products_df, on="ProductID", how="left")

# Aggregate transaction history by CustomerID (TotalSpent, NumberOfPurchases)
transaction_agg = merged_data.groupby('CustomerID').agg(
    TotalSpent=('TotalAmount', 'sum'),
    NumberOfPurchases=('ProductID', 'count')
).reset_index()

# Merge with customer data. The left join keeps customers that have no
# transactions; their aggregate columns come back as NaN and are filled below.
customer_data = pd.merge(customers_df, transaction_agg, on="CustomerID", how="left")

# Fill missing values.
# Assign the result instead of calling `customer_data[col].fillna(..., inplace=True)`:
# that form mutates a temporary column object, emits a FutureWarning on recent
# pandas and is a no-op under Copy-on-Write, which would let the dropna() below
# discard every customer without transactions.
customer_data = customer_data.fillna({
    'TotalSpent': 0,          # numerical columns: no transactions -> 0
    'NumberOfPurchases': 0,
    'Region': 'Unknown',      # categorical column: explicit placeholder
})

# Turn infinite values into NaN so they are removed together with other gaps
customer_data = customer_data.replace([np.inf, -np.inf], np.nan)

# Drop rows with any remaining NaN values, then rebuild a contiguous 0..n-1
# index so later positional operations (e.g. a column-wise concat with freshly
# built frames) line up row by row.
customer_data = customer_data.dropna().reset_index(drop=True)


In [9]:
# One-hot encode the 'Region' column as a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(customer_data[['Region']])

# Create a DataFrame with the encoded features.
# Reuse customer_data's index: pd.concat(axis=1) aligns on index labels, so a
# default RangeIndex here would misalign rows (and introduce NaN rows) whenever
# customer_data's index is not exactly 0..n-1, e.g. after rows were dropped.
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customer_data.index,
)

# Combine the encoded region features with the identifying customer columns
customer_data_processed = pd.concat(
    [customer_data[['CustomerID', 'SignupDate']], region_df],
    axis=1,
)

# Normalize numerical features (TotalSpent, NumberOfPurchases).
# fit_transform returns a plain array, so the assignment is positional; the
# row order matches because both frames are derived from customer_data.
scaler = StandardScaler()
customer_data_processed[['TotalSpent', 'NumberOfPurchases']] = scaler.fit_transform(
    customer_data[['TotalSpent', 'NumberOfPurchases']]
)


In [10]:
# Number of lookalikes to report per customer
TOP_N_LOOKALIKES = 3

# Use only the features that are relevant for the similarity calculation (exclude non-feature columns)
features = customer_data_processed.drop(columns=["CustomerID", "SignupDate"])

# Compute the cosine similarity matrix for the customers
similarity_matrix = cosine_similarity(features)

# Positional array of IDs: row i of the similarity matrix belongs to customer_ids[i]
customer_ids = customer_data_processed['CustomerID'].to_numpy()

# Store the recommendations in a dictionary: CustomerID -> [(lookalike_id, score), ...]
lookalike_recommendations = {}

# For each customer, get the most similar other customers based on similarity scores
for idx, customer_id in enumerate(customer_ids):
    similarities = similarity_matrix[idx]

    # Rank all customers from most to least similar (stable, so ties keep row order).
    ranked = (-similarities).argsort(kind='stable')

    # Exclude the customer itself by position rather than assuming it sorts last:
    # customers with identical feature vectors tie at the top score, so slicing
    # off "the last element" could drop a real neighbour and keep the customer
    # itself in its own lookalike list.
    similar_customers = ranked[ranked != idx][:TOP_N_LOOKALIKES]

    similar_customers_ids = customer_ids[similar_customers].tolist()
    similarity_scores = similarities[similar_customers].tolist()

    lookalike_recommendations[customer_id] = list(zip(similar_customers_ids, similarity_scores))

# Convert the recommendations to a DataFrame and save to CSV
lookalike_df = pd.DataFrame([
    {'CustomerID': k, 'Lookalikes': v} for k, v in lookalike_recommendations.items()
])

lookalike_df.to_csv('Lookalike.csv', index=False)
