# Importing Necessary Dependencies

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the data

In [2]:

# Read the three raw tables from ./data. The paths are listed in the same
# order as the names they are unpacked into, so the files are read in the
# order Customers -> Products -> Transactions.
customers, products, transactions = map(
    pd.read_csv,
    (
        "./data/Customers.csv",
        "./data/Products.csv",
        "./data/Transactions.csv",
    ),
)

# Merge data to create a unified dataset

In [3]:
# Enrich every transaction row with the columns of its product, then with the
# columns of the purchasing customer. Both joins are left joins, so each
# transaction row is kept even when no matching product/customer exists (the
# added columns are NaN for such rows).
# NOTE: this rebinds `transactions`; re-running this cell without re-running
# the load cell first merges the lookup tables a second time.
transactions = (
    transactions
    .merge(products, how="left", on="ProductID")
    .merge(customers, how="left", on="CustomerID")
)

# Feature Engineering

### Aggregate transaction data to customer level

In [4]:
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or None.

    ``Series.mode()`` ignores nulls and may return several values on a tie;
    the first one is taken. When every value is null (possible for
    ``Category`` after the left join with products), ``mode()`` is empty and
    indexing it with ``[0]`` would raise, so ``None`` is returned instead.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# One row per customer:
#   total_spent             - sum of TotalValue over the customer's transactions
#   total_transactions      - number of transaction rows
#   avg_transaction_value   - mean TotalValue
#   distinct_products       - number of different ProductIDs bought
#   most_frequent_category  - modal product Category (None if unknown)
customer_features = transactions.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_transaction_value=("TotalValue", "mean"),
    distinct_products=("ProductID", "nunique"),
    most_frequent_category=("Category", _most_frequent),
).reset_index()

### Encode categorical features (e.g., Region, most_frequent_category)

In [5]:
# Attach each customer's Region, then one-hot encode the two categorical
# columns. drop_first=True omits the first level of each column, so that level
# is represented by all of its dummy columns being 0.
# NOTE: this rebinds `customer_features`; re-run the aggregation cell before
# re-running this one.
customer_features = (
    customer_features
    .merge(customers.loc[:, ["CustomerID", "Region"]], how="left", on="CustomerID")
    .pipe(
        pd.get_dummies,
        columns=["Region", "most_frequent_category"],
        drop_first=True,
    )
)

# Standardize the features

In [6]:
# Every column except the identifier is used as a model input.
features = customer_features.drop("CustomerID", axis=1)

# Fit the scaler on the feature table, then apply it to that same table so all
# columns are standardized before similarities are computed.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Calculate similarity using Cosine Similarity

In [7]:
# Pairwise cosine similarity of the scaled feature rows against themselves:
# entry [i, j] compares row i with row j, in the row order of customer_features.
similarity_matrix = cosine_similarity(X=scaled_features, Y=scaled_features)

# Generate Lookalike Recommendations

In [8]:
# How many customers (taken from the top of customer_features) get
# recommendations, and how many lookalikes are kept for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

# Maps a CustomerID to a list of (lookalike CustomerID, similarity score)
# tuples, best match first.
lookalike_results = {}
customer_ids = customer_features["CustomerID"].values

# NOTE(review): the slice takes the first N rows of customer_features, i.e. the
# first N CustomerIDs that have at least one transaction -- confirm these are
# the intended target customers.
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Drop the customer's own entry by index rather than assuming it sorts to
    # position 0: with tied scores, rounding, or an all-zero feature row the
    # self-similarity is not guaranteed to rank first.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so equally similar customers keep their row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[customer_id] = [
        (customer_ids[other_idx], score) for other_idx, score in candidates[:TOP_K]
    ]

# Save results to Sujal_Maheshwari_Lookalike.csv


In [9]:
# One row per target customer. The "Lookalikes" cell is the string form of a
# list of (CustomerID, score) tuples with scores rounded to 2 decimals.
# The rounded score is converted to a builtin float: rounding a NumPy scalar
# returns a NumPy scalar, which NumPy >= 2 renders as "np.float64(0.97)"
# inside str(list) and would leak into the CSV.
lookalike_df = pd.DataFrame([
    {
        "CustomerID": customer_id,
        "Lookalikes": str([
            (other_id, float(round(score, 2))) for other_id, score in lookalikes
        ]),
    }
    for customer_id, lookalikes in lookalike_results.items()
])

lookalike_df.to_csv("Sujal_Maheshwari_Lookalike.csv", index=False)

print("Lookalike.csv generated with top 3 similar customers for C0001-C0020.")

Lookalike.csv generated with top 3 similar customers for C0001-C0020.
