# In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three source tables from the current working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Enrich each transaction row with the buyer's region and the product's
# attributes. Left joins keep transactions whose customer or product has no
# match in the lookup tables.
transactions = transactions.merge(
    customers[['CustomerID', 'Region']], on='CustomerID', how='left'
).merge(
    products[['ProductID', 'ProductName', 'Category', 'Price']],
    on='ProductID',
    how='left',
)

# Per-customer spend profile: summed purchase value and summed quantity.
customer_profile = (
    transactions.groupby('CustomerID')
    .agg(TotalValue=('TotalValue', 'sum'), Quantity=('Quantity', 'sum'))
    .reset_index()
)

# One indicator column per product category for every transaction row ...
category_dummies = pd.get_dummies(transactions['Category'], prefix='Category')

# ... summed per customer, so each Category_* column counts that customer's
# transaction rows in the category.
category_features = (
    pd.concat([transactions[['CustomerID']], category_dummies], axis=1)
    .groupby('CustomerID')
    .sum()
    .reset_index()
)

# Full feature table: spend profile plus category counts (inner join on
# CustomerID, the default for merge).
customer_features = customer_profile.merge(category_features, on='CustomerID')

# Standardise every feature column; CustomerID is an identifier, not a feature.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(
    customer_features.drop(columns='CustomerID')
)

# Pairwise cosine similarity: row/column i corresponds to row i of
# customer_features.
cosine_sim = cosine_similarity(customer_features_scaled)

# Function to get top N lookalike customers based on cosine similarity
def get_top_lookalikes(customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Reads the module-level ``customer_features`` table and the ``cosine_sim``
    matrix, whose row ``i`` holds the similarities of the customer stored in
    row ``i`` of ``customer_features``.

    Args:
        customer_id: Value to look up in ``customer_features['CustomerID']``.
        n: Maximum number of lookalikes to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered by descending
        similarity, with ``score`` as a plain Python ``float``. The queried
        customer is never part of the result. Equal scores keep the row order
        of ``customer_features``.

    Raises:
        IndexError: If ``customer_id`` has no row in ``customer_features``.
    """
    id_column = customer_features['CustomerID']
    ids = id_column.to_numpy()

    # Work with the row *position*, not the index label: cosine_sim is a plain
    # array and only lines up with customer_features positionally.
    matches = np.flatnonzero((id_column == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(
            f"customer {customer_id!r} has no row in customer_features"
        )
    row = int(matches[0])

    scores = np.asarray(cosine_sim[row], dtype=float)

    # Stable descending sort so tied scores keep their original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the customer by position instead of assuming it sorts first: another
    # customer with an identical profile (or float rounding) can tie with or
    # outrank the self-similarity entry.
    order = order[order != row][:max(n, 0)]

    # Plain floats keep str()/CSV output free of "np.float64(...)" wrappers.
    return [(ids[i], float(scores[i])) for i in order]

# Generating Lookalike Mapping for the first 20 rows of Customers.csv
# (presumably C0001 - C0020 -- this relies on the file being ordered by ID).
# Customers without any transaction have no row in customer_features, so they
# cannot be scored; they get an empty lookalike list instead of aborting the
# whole export.
profiled_ids = set(customer_features['CustomerID'])
lookalike_map = {}
for customer_id in customers['CustomerID'][:20]:
    if customer_id not in profiled_ids:
        lookalike_map[customer_id] = []
        continue
    # Cast scores to plain floats: under NumPy >= 2, str() of a list holding
    # np.float64 values renders each one as "np.float64(...)".
    lookalike_map[customer_id] = [
        (other_id, float(score))
        for other_id, score in get_top_lookalikes(customer_id)
    ]

# Converting the lookalike map into a DataFrame for output: one row per
# customer, with the (id, score) list serialised as its string representation.
# Explicit columns keep the header in place even if the map is empty.
lookalike_df = pd.DataFrame(
    [
        {'CustomerID': customer_id, 'Lookalikes': str(lookalikes)}
        for customer_id, lookalikes in lookalike_map.items()
    ],
    columns=['CustomerID', 'Lookalikes'],
)

# Saving the result as 'FirstName_LastName_Lookalike.csv'
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

print("Lookalike model generated and saved to 'FirstName_LastName_Lookalike'")



# Output:
# Lookalike model generated and saved to 'FirstName_LastName_Lookalike'
