In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Read the two input tables from the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Collapse transactions to one row per customer: summed TotalValue and summed Quantity.
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(TotalValue=('TotalValue', 'sum'), Quantity=('Quantity', 'sum'))
    .reset_index()
)

# Inner join on CustomerID: only customers that appear in both tables are kept,
# and the result gets a fresh 0..N-1 index.
customer_data = customers.merge(transaction_summary, on='CustomerID')

# The two numeric columns used as the similarity feature space.
features = customer_data.loc[:, ['TotalValue', 'Quantity']]

# Standardize each feature column before comparing customers.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

# Pairwise cosine similarity; row/column i corresponds to row i of customer_data.
cosine_sim = cosine_similarity(features_scaled)

# Function to extract the top-n (default 3) lookalikes for a given customer:
def get_top_n_similar(customer_id, cosine_sim, n=3, data=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value looked up in the ``CustomerID`` column of the customer table.
    cosine_sim
        Square similarity matrix; row/column ``i`` must correspond to row ``i``
        (by position) of the customer table.
    n : int, default 3
        Number of lookalikes to return. Values <= 0 yield an empty list.
    data : pandas.DataFrame, optional
        Customer table with a ``CustomerID`` column. When omitted, the
        notebook-level ``customer_data`` frame is used.

    Returns
    -------
    list of (CustomerID, score) tuples
        Sorted by descending score; equal scores keep their table order.
        The query customer itself is never part of the result.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in the customer table.
    """
    if data is None:
        data = customer_data

    id_column = data['CustomerID']
    hits = np.flatnonzero((id_column == customer_id).to_numpy())
    if hits.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer table")
    # Use the row *position* (not the index label) so the lookup stays aligned
    # with the similarity matrix even if the frame has a non-default index.
    pos = hits[0]

    scores = np.asarray(cosine_sim[pos])
    # Stable sort on the negated scores: descending order, ties in table order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly. Slicing off the first entry is wrong when
    # another customer ties with (or numerically exceeds) the self-similarity.
    ranked = ranked[ranked != pos][:max(n, 0)]

    ids = id_column.to_numpy()
    return [(ids[j], scores[j]) for j in ranked]

# Examples: print the top 3 similar customers for C0001, C0002 and C0003 in turn.
for example_id in ('C0001', 'C0002', 'C0003'):
    top_similar_customers = get_top_n_similar(example_id, cosine_sim)
    print(top_similar_customers)

[('C0085', 0.9999990504724361), ('C0042', 0.9998215747742084), ('C0089', 0.9997850140987701)]
[('C0157', 0.9999942410168485), ('C0166', 0.999875010843091), ('C0029', 0.9998254255985104)]
[('C0111', 0.9940081095432594), ('C0160', 0.9904545038572361), ('C0147', 0.9876382719212549)]


In [27]:
# Lookalikes for the first 20 customers (C0001 to C0020), keyed by customer id.
target_ids = [f'C{number:04d}' for number in range(1, 21)]
lookalike_map = {cid: get_top_n_similar(cid, cosine_sim) for cid in target_ids}

# Flatten the mapping into one [customer, lookalike, score] row per match.
lookalike_data = [
    [cid, match_id, match_score]
    for cid, matches in lookalike_map.items()
    for match_id, match_score in matches
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

# Persist the table without the positional index column.
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the table (up to the first 70 rows).
print(lookalike_df.head(70))


   CustomerID LookalikeID  SimilarityScore
0       C0001       C0085         0.999999
1       C0001       C0042         0.999822
2       C0001       C0089         0.999785
3       C0002       C0157         0.999994
4       C0002       C0166         0.999875
5       C0002       C0029         0.999825
6       C0003       C0111         0.994008
7       C0003       C0160         0.990455
8       C0003       C0147         0.987638
9       C0004       C0162         1.000000
10      C0004       C0165         0.999959
11      C0004       C0090         0.998641
12      C0005       C0080         0.999982
13      C0005       C0167         0.999975
14      C0005       C0177         0.999928
15      C0006       C0079         0.999966
16      C0006       C0117         0.995670
17      C0006       C0196         0.990317
18      C0007       C0146         0.999990
19      C0007       C0125         0.999837
20      C0007       C0061         0.999716
21      C0008       C0109         0.999842
22      C00