In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables from the working directory in one pass.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [3]:
# Attach product attributes, then customer attributes, to every transaction
# (default inner joins on the shared key columns).
transactions_with_products = pd.merge(transactions, products, on="ProductID")
df = pd.merge(transactions_with_products, customers, on="CustomerID")

In [4]:
# Preview the first five merged rows to sanity-check the join.
df.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [6]:
# Collapse the merged transaction table to one feature row per customer.
customer_features = df.groupby("CustomerID").agg({
    "TotalValue": "sum",   # total spend across all of the customer's transactions
    "Quantity": "sum",     # total units purchased
    "Price_x": "mean",     # mean of Price_x (the left/transactions side of the merge)
    # Sort the distinct categories so the same set of categories always yields
    # the same string regardless of transaction order; drop nulls so a missing
    # category cannot break the string join.
    "Category": lambda x: ','.join(sorted(x.dropna().unique())),
    "Region": "first"      # first non-null Region value in the customer's group
}).reset_index()

In [7]:
# Encode the categorical customer attributes as numeric indicator columns.
#
# Category holds a comma-joined list of every category a customer bought from.
# One-hot encoding that whole string would make each distinct *combination* its
# own unrelated class, so customers sharing some (but not all) categories would
# look completely different. Instead, build one 0/1 column per individual
# category (multi-hot), which lets overlapping category sets contribute to
# similarity.
category_df = (
    customer_features["Category"]
    .str.get_dummies(sep=",")
    .add_prefix("Category_")
    .astype(float)
)

# Region is a single label per customer, so a plain one-hot encoding is right.
encoder = OneHotEncoder()
region_data = encoder.fit_transform(customer_features[["Region"]]).toarray()
region_df = pd.DataFrame(
    region_data,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customer_features.index,  # keep rows aligned with customer_features
)

encoded_df = pd.concat([category_df, region_df], axis=1)
encoded_columns = encoded_df.columns.to_numpy()
encoded_data = encoded_df.to_numpy()

In [9]:
# Place the customer id and numeric aggregates side by side with the encoded
# categorical columns (rows are matched on the index).
base_columns = ["CustomerID", "TotalValue", "Quantity", "Price_x"]
final_features = pd.concat(
    objs=[customer_features[base_columns], encoded_df],
    axis=1,
)

In [10]:
# Standardise every feature column; the CustomerID label is excluded because it
# is an identifier, not a feature.
feature_frame = final_features.drop(columns=["CustomerID"])
scaler = StandardScaler()
scaler.fit(feature_frame)
scaled_features = scaler.transform(feature_frame)

In [11]:
# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(X=scaled_features)

In [12]:
# Label both axes of the similarity matrix with customer ids so scores can be
# looked up by id instead of by position.
customer_ids = final_features["CustomerID"].to_numpy()
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [14]:
# For each of the first 20 customer ids, keep the 3 other customers with the
# highest similarity score as (customer_id, score) pairs.
target_customers = customer_ids[:20]
lookalike_results = {}

for customer in target_customers:
    scores = similarity_df[customer]
    # Remove the customer's own entry before ranking so a customer is never
    # reported as its own lookalike.
    ranked = scores.drop(customer).sort_values(ascending=False)
    top_matches = ranked.head(3)
    lookalike_results[customer] = [
        (other_id, score)
        for other_id, score in zip(top_matches.index, top_matches.values)
    ]

In [17]:
# Write the lookalike results to CSV: one row per target customer, with the
# matches stored as a list of (customer_id, score) tuples.
# A single path constant is used for both the write and the status message so
# the message always names the file that was actually written.
OUTPUT_PATH = "Sanidhya_Yadav_Lookalike.csv"

lookalike_list = [
    {
        "cust_id": k,
        "lookalikes": [(cust, float(score)) for cust, score in v]  # Convert np.float64 to float
    }
    for k, v in lookalike_results.items()
]
lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv(OUTPUT_PATH, index=False)

print(f"Lookalike Model Completed. Output saved to {OUTPUT_PATH}")

Lookalike Model Completed. Output saved to Lookalike.csv
