<a href="https://colab.research.google.com/github/pran2681/Zeotap-Assignment/blob/main/Pran_Saikia_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [19]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [20]:
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')

In [21]:
merged_data = pd.merge(merged_data, products[['ProductID', 'Category']], on='ProductID', how='left')

In [22]:
customer_summary = merged_data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionDate', 'count'),
    product_categories=('Category', lambda x: list(x.unique()))
).reset_index()

In [23]:
scaler = StandardScaler()
customer_summary[['total_spend', 'transaction_count']] = scaler.fit_transform(customer_summary[['total_spend', 'transaction_count']])


In [24]:
customer_summary.head()


Unnamed: 0,CustomerID,total_spend,transaction_count,product_categories
0,C0001,-0.061701,-0.011458,"[Books, Home Decor, Electronics]"
1,C0002,-0.877744,-0.467494,"[Home Decor, Clothing]"
2,C0003,-0.405857,-0.467494,"[Home Decor, Clothing, Electronics]"
3,C0004,1.032547,1.35665,"[Books, Home Decor, Electronics]"
4,C0005,-0.783929,-0.92353,"[Home Decor, Electronics]"


Similarity Calculation (Cosine Similarity)


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
categories = merged_data[['CustomerID', 'Category']].drop_duplicates()
categories_encoded = pd.get_dummies(categories['Category'], prefix='Category')

encoded_features = pd.merge(customer_summary[['CustomerID']].reset_index(drop=True), categories_encoded.reset_index(drop=True), left_index=True, right_index=True, how='left').drop(columns=['CustomerID'])

final_features = pd.concat([customer_summary[['total_spend', 'transaction_count']], encoded_features], axis=1)

similarity_matrix = cosine_similarity(final_features)

similarity_matrix[:5, :5]

array([[ 1.        ,  0.74978673,  0.87437579,  0.46492596,  0.67281893],
       [ 0.74978673,  1.        ,  0.94940858, -0.19391329,  0.95688917],
       [ 0.87437579,  0.94940858,  1.        , -0.02292501,  0.94719033],
       [ 0.46492596, -0.19391329, -0.02292501,  1.        , -0.34216919],
       [ 0.67281893,  0.95688917,  0.94719033, -0.34216919,  1.        ]])

Recommendation Generation

In [27]:
import numpy as np

def get_top_3_similar_customers(customer_id, similarity_matrix, customer_ids):
    customer_index = customer_ids.index(customer_id)
    similarity_scores = similarity_matrix[customer_index]

    top_3_indices = np.argsort(similarity_scores)[-4:-1]
    top_3_customers = [(customer_ids[i], similarity_scores[i]) for i in top_3_indices]

    return top_3_customers

customer_ids = customer_summary['CustomerID'].tolist()

recommendations = []

for customer_id in customer_ids[:20]:
    top_3 = get_top_3_similar_customers(customer_id, similarity_matrix, customer_ids)
    recommendations.append([customer_id] + [item for sublist in top_3 for item in sublist])

recommendations_df = pd.DataFrame(recommendations, columns=['CustomerID', 'Recommended_Customer1', 'Score1', 'Recommended_Customer2', 'Score2', 'Recommended_Customer3', 'Score3'])

recommendations_df.head()

recommendations_df.to_csv('Lookalike.csv', index=False)
