In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Read the three input tables from the working directory
# (same order as before: customers, transactions, products).
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Merge datasets to create a comprehensive dataset
# Merge transactions with customers

In [2]:
# Attach customer attributes to every transaction (default inner join on CustomerID).
merged_data = pd.merge(transactions, customers, on='CustomerID')

# Merge with products to get product information

In [3]:
# Attach product attributes to the customer-enriched transactions
# (default inner join on ProductID).
merged_data = pd.merge(merged_data, products, on='ProductID')

# Feature Engineering
# Aggregate each customer's transaction history (total value, total quantity, distinct products)

In [4]:
# One row per customer: summed TotalValue, summed Quantity and the number of
# distinct ProductID values. Output columns keep their source names here.
per_customer = merged_data.groupby('CustomerID')
transaction_summary = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    ProductID=('ProductID', 'nunique'),
).reset_index()

# Rename columns for clarity

In [5]:
# Give the aggregated columns names that describe what they now hold.
# Reassign instead of using inplace=True so the cell yields a fresh frame
# rather than mutating an object shared with earlier cells.
transaction_summary = transaction_summary.rename(columns={
    'TotalValue': 'TotalSpent',
    'Quantity': 'TotalQuantity',
    'ProductID': 'UniqueProducts',
})

# Merge back with customer data to include customer features

In [6]:
# Left-join so customers without any transactions are kept; the join leaves
# their aggregated columns as NaN. Fill only those three numeric columns with
# 0 -- a blanket fillna(0) would also overwrite missing customer attributes
# such as Region with the integer 0.
customer_features = (
    customers
    .merge(transaction_summary, on='CustomerID', how='left')
    .fillna({'TotalSpent': 0, 'TotalQuantity': 0, 'UniqueProducts': 0})
)

# Select relevant features for similarity calculation

In [7]:
# Keep only the columns that feed the similarity computation:
# one categorical (Region) and three numeric activity measures.
features = customer_features.loc[
    :, ['Region', 'TotalSpent', 'TotalQuantity', 'UniqueProducts']
]

# One-hot encode categorical features (Region)

In [8]:
# Replace Region with indicator columns; drop_first=True omits the indicator
# for the first category, so that category has no column of its own.
features = features.pipe(pd.get_dummies, columns=['Region'], drop_first=True)

# Standardize the features

In [10]:
# Fit the scaler on the feature table, then apply it to the same table.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Calculate cosine similarity

In [11]:
# Pairwise cosine similarity between every pair of rows of the scaled features.
similarity_matrix = cosine_similarity(X=scaled_features)

# Create a DataFrame for similarity scores

In [12]:
# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


# Function to get top 3 lookalikes for a given customer

In [13]:
def get_top_lookalikes(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be a column label of the
        similarity matrix.
    n : int, default 3
        Number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Similarity matrix whose index and columns carry the same customer
        labels. When omitted, the notebook-level ``similarity_df`` is used,
        so existing calls keep working unchanged.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by lookalike label, sorted from most to
        least similar, with at most ``n`` entries and never containing
        ``customer_id`` itself.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    if similarity is None:
        # Fall back to the matrix built earlier in the notebook.
        similarity = similarity_df
    if customer_id not in similarity.columns:
        raise KeyError(f"Unknown customer_id: {customer_id!r}")

    scores = similarity[customer_id]
    # A customer is trivially most similar to itself; remove that entry
    # before ranking.
    scores = scores[scores.index != customer_id]
    # Stable sort so tied scores come back in a reproducible order.
    return scores.sort_values(ascending=False, kind='mergesort').head(n)

# Create a dictionary to store the lookalikes

In [14]:
# Start from an empty mapping: customer id -> list of (lookalike id, score).
lookalike_results = dict()

# Get lookalikes for the first 20 customers in Customers.csv row order (expected to be C0001 - C0020)

In [15]:
# For each of the first 20 customers, store its top matches as
# (lookalike id, similarity score) pairs.
for customer_id in customer_features['CustomerID'][:20]:
    top_matches = get_top_lookalikes(customer_id)
    lookalike_results[customer_id] = [
        (match_id, match_score)
        for match_id, match_score in zip(top_matches.index, top_matches.values)
    ]


# Create a list to store the lookalike results

In [18]:
# Flatten the top matches of the first 20 customers into one record per
# (customer, lookalike) pair.
lookalike_list = [
    {
        'CustomerID': customer_id,
        'LookalikeID': lookalike_id,
        'SimilarityScore': score,
    }
    for customer_id in customer_features['CustomerID'][:20]
    for lookalikes in [get_top_lookalikes(customer_id)]
    for lookalike_id, score in zip(lookalikes.index, lookalikes.values)
]

# Tabulate the records: one row per pair.
lookalike_df = pd.DataFrame(lookalike_list)

# Persist the table without the row index.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv


# Save the results to a CSV file

In [19]:

# Write the lookalike table to disk without the row index, then confirm.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
