In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import csv

In [35]:
customers_data = pd.read_csv(r"C:\Users\ASUS\Documents\project_zeotap\Customers.csv")  # Customer profile data
transactions_data = pd.read_csv(r"C:\Users\ASUS\Documents\project_zeotap\Transactions.csv")  # Customer transaction data
products_df = pd.read_csv(r"C:\Users\ASUS\Documents\project_zeotap\Products.csv")  # Transaction data

In [37]:
merged_data = pd.merge(transactions_data, customers_data[['CustomerID', 'Region']], on='CustomerID', how='left')


In [39]:
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    num_purchases=pd.NamedAgg(column='TotalValue', aggfunc='count'),
    region=pd.NamedAgg(column='Region', aggfunc=lambda x: x.mode()[0])  # Most common region
).reset_index()

In [41]:
merged_data = pd.merge(merged_data, products_df[['ProductID', 'Category']], on='ProductID', how='left')


In [43]:
category_features = merged_data.groupby('CustomerID')['Category'].value_counts().unstack(fill_value=0)
category_features.reset_index(inplace=True)


In [45]:
final_features = pd.merge(customer_features, category_features, on='CustomerID', how='left')


In [47]:
scaler = StandardScaler()
numerical_cols = ['total_spent', 'num_purchases']
final_features[numerical_cols] = scaler.fit_transform(final_features[numerical_cols])

In [49]:
final_features = pd.get_dummies(final_features, columns=['region'], drop_first=True)


In [51]:

customer_category_matrix = category_features.set_index('CustomerID')



In [53]:
# Calculate cosine similarity between customers based on product categories
category_similarity = cosine_similarity(customer_category_matrix)



In [55]:
# Content-Based Filtering: Based on customer profile (spending, number of purchases, region)
profile_matrix = final_features.drop(columns=['CustomerID'])

In [57]:
# Calculate cosine similarity between customers based on profile features
profile_similarity = cosine_similarity(profile_matrix)



In [59]:
# Combine the two similarity matrices (equal weightage for both methods)
final_similarity_matrix = (category_similarity + profile_similarity) / 2

In [61]:
def get_top_3_similar_customers(customer_id, similarity_matrix, n=3):
    # Get the similarity scores for the given customer and sort by similarity (excluding the customer itself)
    similarity_scores = similarity_matrix[customer_id]
    similar_customers = np.argsort(similarity_scores)[::-1][1:n+1]  # Get top n similar customers (excluding the self)
    scores = similarity_scores[similar_customers]
    return similar_customers.tolist(), scores.tolist()

In [63]:
# Create a dictionary to store the recommendations
lookalike_dict = {}

# For customers C0001 to C0020, recommend top 3 similar customers
for customer_id_int in range(1, 21):  # C0001 to C0020
    customer_id_str = f'C{customer_id_int:04d}'
    customer_index = final_features[final_features['CustomerID'] == customer_id_str].index[0]  # Get index for similarity matrix
    similar_customers, similarity_scores = get_top_3_similar_customers(customer_index, final_similarity_matrix)
    
    # Map CustomerID to their lookalikes with similarity scores
    similar_customer_ids = final_features['CustomerID'].iloc[similar_customers].tolist()
    lookalike_dict[customer_id_str] = list(zip(similar_customer_ids, similarity_scores))

In [65]:
import csv

with open('Lookalike.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["cust_id", "similar_customers_and_scores"])
    for key, value in lookalike_dict.items():
        writer.writerow([key, value])

print("Lookalike recommendations have been successfully saved to 'Lookalike.csv'")

Lookalike recommendations have been successfully saved to 'Lookalike.csv'


In [72]:
lookalike_df = pd.read_csv('Lookalike.csv')

In [78]:
print("\nTop 3 lookalikes with similarity scores:")
lookalike_df.head().reset_index()


Top 3 lookalikes with similarity scores:


Unnamed: 0,index,cust_id,similar_customers_and_scores
0,0,C0001,"[('C0146', 0.9486577745774705), ('C0127', 0.92..."
1,1,C0002,"[('C0134', 0.9671737859185701), ('C0133', 0.96..."
2,2,C0003,"[('C0031', 0.9951514119095752), ('C0158', 0.98..."
3,3,C0004,"[('C0113', 0.9575171308333958), ('C0047', 0.93..."
4,4,C0005,"[('C0007', 0.9966792358944401), ('C0197', 0.96..."


In [83]:
lookalike_df

Unnamed: 0,cust_id,similar_customers_and_scores
0,C0001,"[('C0146', 0.9486577745774705), ('C0127', 0.92..."
1,C0002,"[('C0134', 0.9671737859185701), ('C0133', 0.96..."
2,C0003,"[('C0031', 0.9951514119095752), ('C0158', 0.98..."
3,C0004,"[('C0113', 0.9575171308333958), ('C0047', 0.93..."
4,C0005,"[('C0007', 0.9966792358944401), ('C0197', 0.96..."
5,C0006,"[('C0187', 0.9478437526382896), ('C0135', 0.91..."
6,C0007,"[('C0005', 0.9966792358944401), ('C0197', 0.96..."
7,C0008,"[('C0162', 0.9703401645516392), ('C0154', 0.96..."
8,C0009,"[('C0198', 0.9395801156335459), ('C0092', 0.90..."
9,C0010,"[('C0061', 0.9771907377004081), ('C0176', 0.92..."
