In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:

# Load the three raw input tables (transactions, products, customers), in
# that order, from CSV files on the local disk.
# NOTE(review): the paths are absolute and machine-specific; the notebook
# only runs unchanged on a machine with this exact folder layout.
transactions_df, products_df, customers_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Users\SIVABALAJI S\Desktop\ZEOTAP_Data\Transactions.csv',
        r'C:\Users\SIVABALAJI S\Desktop\ZEOTAP_Data\Products (2).csv',
        r'C:\Users\SIVABALAJI S\Desktop\ZEOTAP_Data\Customers.csv',
    )
)


In [19]:

# Print a heading followed by the first rows of each loaded table.
# The second and third headings carry a leading newline so the previews are
# separated by a blank line in the cell output.
for heading, frame in (
    ("Transactions Dataset:", transactions_df),
    ("\nProducts Dataset:", products_df),
    ("\nCustomers Dataset:", customers_df),
):
    print(heading)
    print(frame.head())


Transactions Dataset:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  

Products Dataset:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Customers Dataset:
  CustomerID        

In [20]:

# Attach product attributes to every transaction row. A left join keeps all
# transactions, including any whose ProductID has no match in products_df.
# Both inputs carry a `Price` column, so pandas suffixes the two copies
# (`Price_x` / `Price_y`) in the merged frame.
transactions_with_products = pd.merge(
    transactions_df,
    products_df,
    how="left",
    on="ProductID",
)



In [21]:

# Collapse the transaction rows to one row per customer:
#   Category   -> all purchased categories joined into one space-separated string
#   TotalValue -> the sum of the customer's transaction values
customer_transactions = (
    transactions_with_products
    .groupby("CustomerID", as_index=False)
    .agg(
        Category=("Category", " ".join),
        TotalValue=("TotalValue", "sum"),
    )
)

In [22]:

# Combine customer attributes with their aggregated purchase history. The
# left join keeps customers that have no transactions; their aggregated
# columns are missing (NaN) after this step.
customer_profile = pd.merge(
    customers_df,
    customer_transactions,
    how="left",
    on="CustomerID",
)



In [23]:

# Give customers without any transactions neutral defaults: an empty
# category string and a total spend of zero.
customer_profile = customer_profile.fillna(value={"Category": "", "TotalValue": 0})



In [24]:

# One text "document" per customer: the region followed by the
# space-separated purchased categories.
profile_text = customer_profile["Region"] + " " + customer_profile["Category"]

# Turn the per-customer text into a TF-IDF weighted term matrix.
tfidf = TfidfVectorizer()
customer_vectors = tfidf.fit_transform(profile_text)

In [25]:

# Min-max scale total spend, as a single column (one row per customer).
total_value = customer_profile["TotalValue"].values.reshape(-1, 1)
value_min = np.min(customer_profile["TotalValue"])
value_span = np.max(customer_profile["TotalValue"]) - value_min

if value_span == 0:
    # Every customer has the same total spend, so the feature carries no
    # information. Use zeros instead of dividing by zero, which would fill
    # the column with NaN and break the similarity computation downstream.
    total_value_normalized = np.zeros(total_value.shape, dtype=float)
else:
    total_value_normalized = (total_value - value_min) / value_span

# Final feature matrix: dense TF-IDF columns followed by the scaled spend.
feature_matrix = np.hstack((customer_vectors.toarray(), total_value_normalized))

In [26]:

# Pairwise cosine similarity between all rows of the feature matrix;
# entry [i, j] compares row i with row j.
similarities = cosine_similarity(X=feature_matrix)

In [27]:

# Select the customers whose IDs are C0001 .. C0020. Despite the variable
# name, this is a selection by ID, not a ranking.
first_20_ids = [f"C{number:04d}" for number in range(1, 21)]
top_20_customers = customer_profile[customer_profile["CustomerID"].isin(first_20_ids)]



In [28]:
# For each selected customer, collect the three most similar other customers
# as (customer_id, similarity_score) pairs, best match first.
#
# `similarities` is ordered like the rows of `customer_profile`, so each
# selected customer ID is first mapped to its row position there. Using the
# position inside `top_20_customers` would only be correct if the selected
# customers happened to be the first rows of `customer_profile`, in order.
profile_ids = customer_profile["CustomerID"].tolist()
position_of = {customer_id: position for position, customer_id in enumerate(profile_ids)}

lookalikes = {}
for cust_id in top_20_customers["CustomerID"]:
    idx = position_of[cust_id]
    ranked_indices = np.argsort(similarities[idx])[::-1]
    # Drop the customer itself explicitly rather than assuming it sorts
    # first: another customer with an identical feature vector ties on score
    # and could otherwise push the customer into its own lookalike list.
    similar_indices = [i for i in ranked_indices if i != idx][:3]
    # Store plain Python floats so the stringified output does not depend on
    # how the installed NumPy version prints its scalar types.
    similar_customers = [(profile_ids[i], float(similarities[idx][i])) for i in similar_indices]
    lookalikes[cust_id] = similar_customers



In [29]:

# Reshape the {customer_id: [(match_id, score), ...]} mapping into a list of
# records, one per source customer, each holding a list of match dicts.
lookalike_data = []
for source_id, matches in lookalikes.items():
    match_records = [
        {"cust_id": match_id, "score": match_score}
        for match_id, match_score in matches
    ]
    lookalike_data.append({"cust_id": source_id, "lookalikes": match_records})

In [30]:

# Tabulate the records: one row per source customer, with that customer's
# list of match dicts stored as-is in the `lookalike_data` column.
source_ids = [record["cust_id"] for record in lookalike_data]
match_lists = [record["lookalikes"] for record in lookalike_data]
lookalike_df = pd.DataFrame({"cust_id": source_ids, "lookalike_data": match_lists})

In [31]:


# Write the lookalike table to CSV without the row index.
# NOTE(review): absolute, machine-specific output path.
output_file = r"C:\Users\SIVABALAJI S\Desktop\ZEOTAP_Data\SIVABALAJI_S_Lookalike.csv"
lookalike_df.to_csv(path_or_buf=output_file, index=False)

# Echo the destination path as the cell's displayed result.
output_file


'C:\\Users\\SIVABALAJI S\\Desktop\\ZEOTAP_Data\\SIVABALAJI_S_Lookalike.csv'

In [32]:

# Preview the first five rows of the exported table.
lookalike_df.head(n=5)


Unnamed: 0,cust_id,lookalike_data
0,C0001,"[{'cust_id': 'C0091', 'score': 0.9463380741608..."
1,C0002,"[{'cust_id': 'C0134', 'score': 0.9796067623069..."
2,C0003,"[{'cust_id': 'C0031', 'score': 0.9979348784702..."
3,C0004,"[{'cust_id': 'C0113', 'score': 0.9761924817289..."
4,C0005,"[{'cust_id': 'C0007', 'score': 0.9988083438913..."
