In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Loading datasets
# Paths use forward slashes: they work on every OS (Windows included) and avoid
# the invalid "\C", "\P" and "\T" escape sequences that backslash-separated
# paths produce inside ordinary (non-raw) string literals.
customers_df = pd.read_csv("data/Customers.csv")
products_df = pd.read_csv("data/Products.csv")
transactions_df = pd.read_csv("data/Transactions.csv")

# Attach product details to every transaction, then the buyer's customer record
transactions_products = transactions_df.merge(products_df, on="ProductID", how="left")
merged_df = transactions_products.merge(customers_df, on="CustomerID", how="left")

# Feature engineering: collapse the transaction rows into one row per customer
customer_features = (
    merged_df.groupby("CustomerID")
    .agg(
        total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_spent=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        unique_categories=pd.NamedAgg(column="Category", aggfunc="nunique"),
    )
    .reset_index()
)

# Min-max scale the four numeric features so no single column dominates
# the similarity computation
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(
    customer_features.loc[
        :, ["total_transactions", "total_spent", "avg_spent", "unique_categories"]
    ]
)

# Pairwise cosine similarity between the customers' scaled feature rows
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

# Function to get top N similar customers for a given customer ID
def get_top_similar_customers(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up; must be a column of
            the similarity matrix.
        n: Maximum number of lookalikes to return. Values below 1 yield an
            empty list.
        similarity: DataFrame of pairwise similarity scores whose rows and
            columns are labelled by customer ID. Defaults to the
            module-level ``similarity_df``.

    Returns:
        A list of ``(other_customer_id, score)`` tuples ordered from the
        highest score to the lowest. The queried customer itself is never
        included.

    Raises:
        KeyError: If ``customer_id`` is not a column of the matrix.
    """
    if similarity is None:
        similarity = similarity_df

    scores = similarity[customer_id]
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: a tie with another customer, or a self-similarity of 0 for an
    # all-zero feature vector, can place it anywhere in the ranking.
    others = scores.drop(labels=customer_id, errors="ignore")
    # A stable sort keeps tied scores in their original matrix order.
    ranked = others.sort_values(ascending=False, kind="stable")
    top = ranked.iloc[:max(n, 0)]
    return list(zip(top.index, top.values))

# Getting top 3 similar customers for the first 20 customers
lookalike_results = {
    customer: get_top_similar_customers(customer)
    for customer in customer_features["CustomerID"].iloc[:20]
}

# Create a DataFrame to save the lookalike results.
# Scores are cast to built-in floats before the list is stringified: NumPy
# scalars render as "np.float64(...)" under NumPy >= 2, which would leak into
# the CSV and make the "Lookalikes" column harder to parse back.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_results),
    "Lookalikes": [
        str([(other_id, float(score)) for other_id, score in matches])
        for matches in lookalike_results.values()
    ],
})

# Saving the results to a CSV file
lookalike_df.to_csv("Rahul_Kumar_Lookalike.csv", index=False)

# Displaying the first few rows of the lookalike results
print(lookalike_df.head())


  CustomerID                                         Lookalikes
0      C0001  [('C0152', 0.9999879625100291), ('C0056', 0.99...
1      C0002  [('C0199', 0.9993927667959704), ('C0010', 0.99...
2      C0003  [('C0178', 0.9999947067947413), ('C0035', 0.99...
3      C0004  [('C0101', 0.9998309789789165), ('C0108', 0.99...
4      C0005  [('C0073', 0.9999758789813392), ('C0159', 0.99...
