In [6]:
import pandas as pd

# Load the three source tables. Missing values are filled at load time so the
# cell is idempotent (re-running it always starts from the CSV files).
# - Customers: forward-fill, i.e. a missing cell takes the value from the row above.
# - Transactions: every missing cell becomes 0.
# NOTE(review): forward-fill can copy another customer's attributes, and
# fillna(0) also applies to ID/date columns — confirm this is intended.
customers_df = pd.read_csv('Customers.csv').ffill()
transactions_df = pd.read_csv('Transactions.csv').fillna(0)
products_df = pd.read_csv('Products.csv')


In [17]:
# Preview the first rows of the cleaned transactions table
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [18]:
# Preview the first rows of the cleaned customers table
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [8]:
# Aggregate transaction data into one row per customer:
#   total_spend        - sum of TotalValue over the customer's transactions
#   purchase_frequency - number of transactions
#   product_categories - distinct ProductIDs the customer bought
transaction_agg = transactions_df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    purchase_frequency=('TransactionID', 'count'),
    # Sort the distinct IDs so the list order is reproducible: plain
    # list(set(...)) ordering changes between kernel sessions because of
    # string hash randomization. key=str keeps the sort safe if the earlier
    # fillna(0) left a numeric 0 among the string IDs.
    product_categories=('ProductID', lambda x: sorted(set(x), key=str))
).reset_index()

# Merge customer's profile with aggregated transaction data.
# Inner join (the pandas default, spelled out here): customers without any
# transaction are dropped from customer_data.
customer_data = pd.merge(customers_df, transaction_agg, on='CustomerID', how='inner')


In [14]:
# Display the merged customer table (pandas truncates the middle rows in the rendered output)
customer_data

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spend,purchase_frequency,product_categories
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,5,"[P022, P096, P083, P029, P054]"
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,4,"[P095, P004, P019, P071]"
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,4,"[P006, P002, P035, P025]"
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,8,"[P077, P097, P038, P049, P008, P025, P024, P053]"
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,3,"[P012, P039, P025]"
...,...,...,...,...,...,...,...
194,C0196,Laura Watts,Europe,2022-06-07,4982.88,4,"[P079, P018, P020]"
195,C0197,Christina Harvey,Europe,2023-03-21,1928.65,3,"[P084, P027, P013]"
196,C0198,Rebecca Ray,Europe,2022-02-27,931.83,2,"[P064, P073]"
197,C0199,Andrea Jenkins,Europe,2022-12-03,1979.28,4,"[P022, P067, P079, P008]"


In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Numeric features used to compare customers. Only the two aggregated
# transaction columns are used; Region and SignupDate are not part of the
# feature set.
customer_features = customer_data.loc[:, ['total_spend', 'purchase_frequency']]

# Standardize both columns so neither dominates because of its scale
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features)

# Pairwise cosine similarity; row i / column j compares customer i with customer j.
# NOTE(review): with only two standardized features the score depends solely on
# the direction of a customer relative to the mean, so many pairs land near
# +1 or -1 — confirm this is the intended notion of similarity.
similarity_matrix = cosine_similarity(customer_features_scaled)

# Labelled copy of the matrix, indexed by CustomerID on both axes
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)


In [19]:
# Display the customer-by-customer similarity table (pandas truncates rows and columns in the rendered output)
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.953612,0.782426,-0.740746,0.775453,-0.517069,0.619098,-0.366424,0.922045,0.963577,...,0.990353,0.933706,0.625366,-0.410512,-0.713431,-0.766735,0.797223,0.825839,0.943515,-0.980100
C0002,0.953612,1.000000,0.933600,-0.908617,0.929558,-0.235412,0.826788,-0.629527,0.995800,0.999386,...,0.902699,0.998177,0.831266,-0.665972,-0.891281,-0.537913,0.941973,0.957287,0.999490,-0.874878
C0003,0.782426,0.933600,1.000000,-0.997929,0.999938,0.128465,0.973447,-0.866130,0.962485,0.920468,...,0.688587,0.953523,0.975249,-0.889047,-0.994579,-0.200135,0.999709,0.997327,0.944564,-0.643238
C0004,-0.740746,-0.908617,-0.997929,1.000000,-0.998582,-0.191994,-0.986156,0.896489,-0.943037,-0.893421,...,-0.640514,-0.932165,-0.987453,0.916656,0.999208,0.136694,-0.996086,-0.990561,-0.921487,0.592652
C0005,0.775453,0.929558,0.999938,-0.998582,1.000000,0.139485,0.975933,-0.871635,0.959408,0.916065,...,0.680481,0.950114,0.977647,-0.894083,-0.995674,-0.189228,0.999379,0.996453,0.940855,-0.634684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.766735,-0.537913,-0.200135,0.136694,-0.189228,0.945940,0.029459,-0.316363,-0.458473,-0.567129,...,-0.848293,-0.486057,0.021454,-0.270624,0.097172,1.000000,-0.223715,-0.271193,-0.510725,0.878910
C0197,0.797223,0.941973,0.999709,-0.996086,0.999379,0.104502,0.967641,-0.853820,0.968751,0.929629,...,0.705882,0.960515,0.969631,-0.877743,-0.991781,-0.223715,1.000000,0.998799,0.952210,-0.661523
C0198,0.825839,0.957287,0.997327,-0.990561,0.996453,0.055656,0.954118,-0.827293,0.979739,0.946565,...,0.739734,0.972992,0.956485,-0.853217,-0.984322,-0.271193,0.998799,1.000000,0.966030,-0.697467
C0199,0.943515,0.999490,0.944564,-0.921487,0.940855,-0.204263,0.844326,-0.654012,0.998215,0.997757,...,0.888502,0.999595,0.848589,-0.689449,-0.905304,-0.510725,0.952210,0.966030,1.000000,-0.858969


In [12]:
# Build {customer_id: [(lookalike_id, similarity_score), ...]} holding the
# top 3 most similar customers for each of the first 20 customers.
lookalike_map = {}
top_n = 3  # number of lookalikes kept per customer

for index, customer_id in enumerate(customer_data['CustomerID'].iloc[:20]):
    # Row `index` of the matrix holds this customer's similarity to every customer
    similarity_scores = similarity_matrix[index]

    # Rank all customers from most to least similar. A stable sort on the
    # negated scores keeps tied scores in row order, so results are deterministic.
    ranked = np.argsort(-similarity_scores, kind='stable')

    # Exclude the customer by position instead of assuming it sorts first:
    # another customer can tie with (or, through floating-point rounding,
    # exceed) the self-similarity score, which would otherwise keep the
    # customer in its own lookalike list and drop a genuine match.
    similar_customers = ranked[ranked != index][:top_n]

    # Translate row positions back to customer IDs and their scores
    similar_customers_ids = customer_data['CustomerID'].iloc[similar_customers].values
    similar_scores = similarity_scores[similar_customers]

    # zip() copes with fewer than top_n candidates instead of raising IndexError
    lookalike_map[customer_id] = list(zip(similar_customers_ids, similar_scores))


In [13]:

# Flatten the lookalike map into one row per (customer, lookalike) pair
lookalike_list = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)



In [20]:
# Display the lookalike table (one row per customer/lookalike pair)
lookalike_df

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0137,0.999567
1,C0001,C0152,0.997683
2,C0001,C0056,0.993947
3,C0002,C0029,0.999816
4,C0002,C0199,0.99949
5,C0002,C0010,0.999386
6,C0003,C0095,0.999999
7,C0003,C0150,0.99999
8,C0003,C0144,0.999981
9,C0004,C0067,0.999993


In [21]:
# Write the lookalike table to disk without the positional row index
output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)
