In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


step 1: Import Required Libraries

Start by importing the necessary libraries.

In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


Step 2: Load Data

Load the three datasets into Pandas DataFrames.

In [31]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Check the first few rows of each dataset to understand their structure
print(customers.head())
print(products.head())
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

Step 3: Data Preprocessing

Before building the model, clean and merge the datasets.

    Merge Customers.csv and Transactions.csv on CustomerID.
    Merge the resulting DataFrame with Products.csv on ProductID.
    Aggregate transaction data for each customer.

In [32]:
# Merge customers with their transactions
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')

# Merge with products to get product information
merged_data = pd.merge(merged_data, products[['ProductID', 'Category', 'Price']], on='ProductID', how='left')

# Aggregate data for each customer (total spent, number of transactions, etc.)
customer_data = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    avg_spent_per_transaction=('TotalValue', 'mean')
).reset_index()

# Include customer demographic data (like region)
customer_data = pd.merge(customer_data, customers[['CustomerID', 'Region']], on='CustomerID', how='left')


Step 4: Feature Engineering

Now, create features based on transaction history and customer demographics.

    Normalize the numerical features (total spent, number of transactions, etc.).
    One-hot encode categorical features like region

In [33]:
# Normalize numerical features using StandardScaler
scaler = StandardScaler()
customer_data[['total_spent', 'num_transactions', 'avg_spent_per_transaction']] = scaler.fit_transform(
    customer_data[['total_spent', 'num_transactions', 'avg_spent_per_transaction']]
)

# One-hot encode the 'Region' feature
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Check the final processed data
print(customer_data.head())


  CustomerID  total_spent  num_transactions  avg_spent_per_transaction  \
0      C0001    -0.061701         -0.011458                  -0.070263   
1      C0002    -0.877744         -0.467494                  -0.934933   
2      C0003    -0.405857         -0.467494                  -0.026271   
3      C0004     1.032547          1.356650                  -0.076769   
4      C0005    -0.783929         -0.923530                  -0.040028   

   Region_Europe  Region_North America  Region_South America  
0              0                     0                     1  
1              0                     0                     0  
2              0                     0                     1  
3              0                     0                     1  
4              0                     0                     0  


Step 5: Build the Lookalike Model

Now, we'll compute the similarity between customers using Cosine Similarity.

    Cosine similarity measures the angle between two vectors. The closer the angle is to 0, the more similar the customers are.

In [34]:
# Compute the cosine similarity matrix
customer_features = customer_data.drop('CustomerID', axis=1)  # Drop CustomerID for similarity calculation
similarity_matrix = cosine_similarity(customer_features)

# Convert the similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

# Check the similarity matrix for customer C0001
print(similarity_df['C0001'].sort_values(ascending=False).head(5))


CustomerID
C0001    1.000000
C0137    0.999762
C0152    0.999510
C0107    0.964169
C0191    0.945666
Name: C0001, dtype: float64


Step 6: Generate Top 3 Lookalikes for Each Customer

Now, for each customer, we'll find their top 3 most similar customers.

In [35]:
# Create a function to get the top 3 most similar customers
def get_top_3_lookalikes(customer_id, similarity_df):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]  # Exclude the customer itself
    return similar_customers.index.tolist(), similar_customers.values.tolist()

# Prepare the Lookalike.csv data
lookalike_data = []
for customer_id in customer_data['CustomerID'].head(20):  # For C0001 to C0020
    lookalike_ids, scores = get_top_3_lookalikes(customer_id, similarity_df)
    for i, lookalike_id in enumerate(lookalike_ids):
        lookalike_data.append([customer_id, lookalike_id, scores[i]])

# Create a DataFrame for the lookalikes
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)


Step 7: Output the Results

The Lookalike.csv file will contain the top 3 lookalikes for each of the first 20 customers (CustomerID: C0001 - C0020) along with their similarity scores.

CustomerID,LookalikeID,SimilarityScore
C0001,C0137,0.9997616475513844
C0001,C0152,0.9995103764905026
C0001,C0107,0.9641690971679807
C0002,C0043,0.9928719995078378
C0002,C0142,0.9808733720413273
C0002,C0097,0.9755037432206866
C0003,C0133,0.98706169870118
C0003,C0052,0.9754103448062152
C0003,C0112,0.9415781873172182
C0004,C0108,0.9827182639674003
C0004,C0113,0.9785391786723804
C0004,C0165,0.9738306048704476
C0005,C0178,0.9990813407657672
C0005,C0159,0.9989277207729346
C0005,C0123,0.997904546931205
C0006,C0168,0.978264001321209
C0006,C0158,0.9711446209691191
C0006,C0171,0.9387564178572096
C0007,C0140,0.9978130376911358
C0007,C0092,0.9926503090011995
C0007,C0193,0.9911395081011377
C0008,C0139,0.9721988310054218
C0008,C0109,0.9698722815670315
C0008,C0098,0.9307192939474349
C0009,C0060,0.9818423255580481
C0009,C0010,0.9807961382416003
C0009,C0121,0.9798524547998502
C0010,C0199,0.9923026397421746
C0010,C0009,0.9807961382416003
C0010,C0121,0.9758346484079298
C0011,C0107,0.9948966873068459
C0011,C0048,0.9941231099438982
C0011,C0152,0.9433269615596138
C0012,C0155,0.9982299602323986
C0012,C0108,0.9846023642070699
C0012,C0102,0.9836692611849275
C0013,C0087,0.9898279481555508
C0013,C0188,0.9888834548638371
C0013,C0099,0.9861744175216884
C0014,C0198,0.9889759898914686
C0014,C0060,0.9756975141606177
C0014,C0009,0.9615398807230745
C0015,C0144,0.9895384745128599
C0015,C0036,0.9834692674657693
C0015,C0131,0.9772816942628351
C0016,C0183,0.9997857762658623
C0016,C0026,0.8763642657316434
C0016,C0018,0.8656919178748971
C0017,C0124,0.9795659241713703
C0017,C0075,0.9692050858710901
C0017,C0162,0.8444702818939048
C0018,C0079,0.9391808907106641
C0018,C0117,0.9299131166822665
C0018,C0016,0.8656919178748971
C0019,C0172,0.9999897107239059
C0019,C0111,0.9660184815998832
C0019,C0119,0.9535765014314451
C0020,C0042,0.9474090450742292
C0020,C0176,0.9443114466284428
C0020,C0110,0.9283838464103554
