Task 2: Lookalike Model

Step 1: Data Preprocessing

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load CSV files into DataFrames
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge customer and transaction data
customer_transactions = pd.merge(
    customers, 
    transactions.groupby('CustomerID').agg({'TotalValue': 'sum', 'Quantity': 'sum'}), 
    on='CustomerID', 
    how='left'
)

# Encode categorical features (e.g., Region)
customer_transactions = pd.get_dummies(customer_transactions, columns=['Region'])

# Display the merged customer transactions data
customer_transactions.head()


Unnamed: 0,CustomerID,CustomerName,SignupDate,TotalValue,Quantity,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,2022-07-10,3354.52,12.0,False,False,False,True
1,C0002,Elizabeth Lutz,2022-02-13,1862.74,10.0,True,False,False,False
2,C0003,Michael Rivera,2024-03-07,2725.38,14.0,False,False,False,True
3,C0004,Kathleen Rodriguez,2022-10-09,5354.88,23.0,False,False,False,True
4,C0005,Laura Weber,2022-08-15,2034.24,7.0,True,False,False,False


Step 2: Similarity Calculation

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files into DataFrames
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Group transactions by CustomerID and aggregate TotalValue and Quantity
transaction_summary = transactions.groupby('CustomerID').agg({'TotalValue': 'sum', 'Quantity': 'sum'}).reset_index()

# Merge customer data with transaction data
customer_transactions = pd.merge(customers, transaction_summary, on='CustomerID', how='left')

# Encode categorical features (e.g., Region) using one-hot encoding
customer_transactions = pd.get_dummies(customer_transactions, columns=['Region'])

# Ensure only numeric columns are included for cosine similarity calculation
numeric_columns = customer_transactions.select_dtypes(include=['float64', 'int64']).columns
customer_transactions_numeric = customer_transactions[numeric_columns]

# Fill NaN values with 0
customer_transactions_numeric = customer_transactions_numeric.fillna(0)

# Compute cosine similarity
similarity_matrix = cosine_similarity(customer_transactions_numeric)

# Get top 3 lookalikes for each of the first 20 customers
lookalike_map = {}

for i in range(20):  # First 20 customers (CustomerID: C0001 - C0020)
    # Get similarity scores for the current customer
    similarity_scores = similarity_matrix[i]
    
    # Get the top 3 most similar customers (exclude the customer itself, similarity score = 1)
    similar_customers = similarity_scores.argsort()[-4:-1]
    lookalike_map[customer_transactions.loc[i, 'CustomerID']] = [
        {'CustomerID': customer_transactions.loc[j, 'CustomerID'], 'Score': similarity_scores[j]} 
        for j in similar_customers
    ]

# Convert the lookalike map to a DataFrame and save as CSV
lookalike_df = pd.DataFrame([(k, v) for k, values in lookalike_map.items() for v in values], columns=['CustomerID', 'Lookalike'])
lookalike_df['Score'] = lookalike_df['Lookalike'].apply(lambda x: x['Score'])
lookalike_df['Lookalike'] = lookalike_df['Lookalike'].apply(lambda x: x['CustomerID'])

lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the lookalike map
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalike,Score
0,C0001,C0093,1.0
1,C0001,C0177,1.0
2,C0001,C0153,1.0
3,C0002,C0034,1.0
4,C0002,C0043,1.0
