In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load datasets
# Single place to change where the input CSVs live. The default is the
# original location, so the resolved file paths are unchanged.
DATA_DIR = "C:/Users/User/Downloads"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")


In [4]:
# Preview the first rows of the customers table loaded from Customers.csv.
# NOTE(review): the rendered output shows an unlabeled leading column; confirm
# whether that is just the DataFrame index or a stray column read from the CSV.
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
# Preview the first rows of the transactions table loaded from Transactions.csv.
# NOTE(review): the rendered output shows an unlabeled leading column; confirm
# whether that is just the DataFrame index or a stray column read from the CSV.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
# Sum of TotalValue over all transactions, one row per CustomerID.
total_spent = (
    transactions
    .groupby("CustomerID", as_index=False)
    .agg(Total_Spent=("TotalValue", "sum"))
)

In [7]:
# Number of (non-null) TransactionID entries, one row per CustomerID.
transaction_count = (
    transactions
    .groupby("CustomerID", as_index=False)
    .agg(Transaction_Count=("TransactionID", "count"))
)

In [8]:
# Number of distinct ProductID values, one row per CustomerID.
unique_products = (
    transactions
    .groupby("CustomerID", as_index=False)
    .agg(Unique_Products=("ProductID", "nunique"))
)

In [9]:
# Merge data
# Left-join each per-customer aggregate onto the customer table so that
# customers with no matching rows in an aggregate are still kept, then index
# the result by CustomerID.
customer_data = (
    customers
    .merge(total_spent, on="CustomerID", how="left")
    .merge(transaction_count, on="CustomerID", how="left")
    .merge(unique_products, on="CustomerID", how="left")
    .set_index("CustomerID")
)

# Rows without a match come out of the left joins as NaN; treat those as zero.
# Only numeric columns are filled: a blanket fillna(0) would also overwrite
# missing text/date values (e.g. Region, SignupDate) with the integer 0.
numeric_cols = customer_data.select_dtypes(include=[np.number]).columns
customer_data[numeric_cols] = customer_data[numeric_cols].fillna(0)

In [10]:
# Normalize numerical features
# Name the similarity features explicitly instead of taking every numeric
# column: otherwise any extra numeric column that arrives with the customer
# table (for example a saved row-number column) would silently be scaled and
# used as a feature.
feature_columns = ["Total_Spent", "Transaction_Count", "Unique_Products"]

scaler = StandardScaler()
numeric_features = customer_data[feature_columns]

# Keep the feature names on the scaled frame so its columns stay readable.
customer_data_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    index=customer_data.index,
    columns=feature_columns,
)

In [11]:
# Compute similarity matrix
# Pairwise cosine similarity between the scaled customer rows; both axes of
# the resulting frame are labelled with customer_data's index.
similarity_matrix = cosine_similarity(customer_data_scaled)
customer_labels = customer_data.index
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_labels,
    columns=customer_labels,
)

In [12]:
# Get top 3 similar customers for C0001 - C0020
# Select the targets by ID rather than by row position, so the result does not
# depend on the order of rows in the customer table. IDs that are not present
# in the similarity matrix are skipped instead of raising a KeyError.
target_ids = [f"C{number:04d}" for number in range(1, 21)]
customer_ids = [cid for cid in target_ids if cid in similarity_df.index]

lookalike_dict = {}
for cust_id in customer_ids:
    # Drop the customer's own entry before ranking so it cannot be returned
    # as its own lookalike.
    top_3 = similarity_df.loc[cust_id].drop(cust_id).nlargest(3)
    # Store (other_customer_id, similarity_score) pairs, highest score first.
    lookalike_dict[cust_id] = list(zip(top_3.index, top_3.values))

In [13]:
# Convert to CSV format
# Each output row is the customer id followed by its (lookalike id, score)
# pairs flattened in order.
output_columns = [
    "cust_id",
    "lookalike_1", "score_1",
    "lookalike_2", "score_2",
    "lookalike_3", "score_3",
]
lookalike_data = [
    [cust_id, *(value for match in lookalikes for value in match)]
    for cust_id, lookalikes in lookalike_dict.items()
]

lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv created successfully!")

Lookalike.csv created successfully!
