In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# Read the raw customer and transaction exports from the working directory.
# Per the earlier EDA neither file contains missing values, so no imputation
# or row dropping is done here.
transactions_df = pd.read_csv('Transactions.csv')
customers_df = pd.read_csv('Customers.csv')

# Attach the customer attributes to every transaction row. The join key is
# CustomerID and the default (inner) join is used, so only customers that
# appear in both files survive into the merged frame.
customer_transactions_df = transactions_df.merge(customers_df, on='CustomerID')

In [5]:
# Customer x product interaction matrix: one row per CustomerID, one column
# per ProductID, each cell holding the summed Quantity for that pair
# (0 where the customer has no transaction for the product).
product_matrix = customer_transactions_df.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Z-score every product column so that no single product dominates the
# similarity computation. The result is a plain NumPy array whose rows follow
# the order of product_matrix.index.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
product_matrix_scaled = scaler.fit_transform(product_matrix)

# Customer profile columns kept alongside the purchase features. This is a
# column selection only -- nothing is joined onto the interaction matrix here.
customer_features = customers_df[
    ['CustomerID', 'CustomerName', 'SignupDate', 'Region']
]


In [7]:
# Similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the customers' scaled purchase vectors;
# entry (i, j) compares row i with row j of product_matrix_scaled.
similarity_matrix = cosine_similarity(product_matrix_scaled)

# Label both axes with the customer ids from the interaction matrix so that
# scores can be looked up by id rather than by position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=product_matrix.index,
    columns=product_matrix.index,
)


In [8]:
def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up. It must be one of the column
        labels of the similarity matrix.
    top_n : int, default 3
        Number of lookalikes to keep (passed to ``Series.head``).
    similarity : pandas.DataFrame, optional
        Customer-by-customer similarity matrix labelled with customer ids on
        both axes. When omitted, the notebook-level ``similarity_df`` is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by lookalike customer id, sorted from most
        to least similar, with ``customer_id`` itself removed.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    # Fall back to the notebook-level matrix so existing calls keep working.
    if similarity is None:
        similarity = similarity_df

    # Fail with an explicit message instead of a bare pandas lookup error.
    if customer_id not in similarity.columns:
        raise KeyError(f"{customer_id!r} is not present in the similarity matrix")

    # Column of scores between the given customer and every other customer.
    similarity_scores = similarity[customer_id]

    # Drop the self-match, then rank. A stable sort is requested so that
    # customers with equal scores keep a well-defined relative order.
    similar_customers = (
        similarity_scores
        .drop(customer_id)
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n)
    )

    return similar_customers

# Generate lookalike recommendations for the first 20 customers listed in
# Customers.csv. The selection is positional, so it corresponds to
# C0001 - C0020 only if the file is ordered by CustomerID.
lookalike_dict = {}
for customer_id in customers_df['CustomerID'].head(20):
    # The interaction matrix is built from an inner merge with the
    # transactions, so a customer without any transaction has no entry in
    # similarity_df. Skip those instead of aborting the whole run.
    if customer_id not in similarity_df.columns:
        print(f"Skipping {customer_id}: not present in the similarity matrix")
        continue
    lookalike_dict[customer_id] = get_top_lookalikes(customer_id)

# Flatten {customer: Series(lookalike -> score)} into one record per
# (customer, lookalike) pair so it can be exported as a flat table.
lookalike_list = [
    [customer_id, similar_customer, score]
    for customer_id, lookalikes in lookalike_dict.items()
    for similar_customer, score in lookalikes.items()
]

lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Save to CSV without the positional index so that re-reading the file does
# not produce a spurious unnamed index column.
lookalike_df.to_csv('Lookalike.csv', index=False)


In [12]:
# Read the exported file back from disk to check what was actually written.
# NOTE(review): a CSV written with index=False has no index column; if an
# 'Unnamed: 0' column shows up in this preview, the file on disk is stale --
# re-run the export cell before trusting it.
lookalike_df = pd.read_csv('Lookalike.csv')

# Preview the first 19 rows of the re-loaded table.
lookalike_df.head(n=19)

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0194,0.404928
1,C0001,C0104,0.374002
2,C0001,C0020,0.366609
3,C0002,C0030,0.404617
4,C0002,C0091,0.383778
5,C0002,C0071,0.320158
6,C0003,C0181,0.477572
7,C0003,C0134,0.471016
8,C0003,C0144,0.4238
9,C0004,C0070,0.351901
