In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

customers_df = pd.read_csv("Customers.csv")
transactions_df = pd.read_csv("Transactions.csv")
products_df = pd.read_csv("Products.csv")

In [23]:
merged_df = transactions_df.merge(customers_df, on="CustomerID").merge(products_df, on="ProductID")

In [24]:
customer_features = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',         # Total spending
    'TransactionID': 'count',    # Transaction count
    'Quantity': 'sum',           # Total quantity purchased
    'Price_y': 'mean'            # Average product price (using Price_y from merged_df)
}).rename(columns={
    'TotalValue': 'TotalSpending',
    'TransactionID': 'TransactionCount',
    'Quantity': 'TotalQuantity',
    'Price_y': 'AvgPrice'
}).reset_index()

In [25]:
# Merge with customer demographic data
customer_features = customer_features.merge(customers_df, on="CustomerID")

In [26]:
# Handle missing values
customer_features.fillna(0, inplace=True)

In [27]:
# Encode Categorical Variables
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)

In [28]:
# Normalize Numeric Features
scaler = StandardScaler()
numeric_cols = customer_features.select_dtypes(include=['number']).columns
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

In [29]:
# Calculate Cosine Similarity
feature_matrix = customer_features.select_dtypes(include=['number'])

In [30]:
# Compute the cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

In [31]:
# Recommendations for the first 20 customers
lookalike_results = {}
for i, cust_id in enumerate(customer_features['CustomerID'][:20]):
    # Get similarity scores for the current customer
    scores = list(enumerate(similarity_matrix[i]))
    # Sort by similarity score, exclude self (score=1.0), and take top 3
    top_similar = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]
    # Format recommendations
    lookalike_results[cust_id] = [
        {"cust_id": customer_features['CustomerID'][idx], "score": round(score, 2)}
        for idx, score in top_similar
    ]

In [32]:
lookalike_df = pd.DataFrame([
    {"cust_id": cust_id, "lookalikes": lookalikes}
    for cust_id, lookalikes in lookalike_results.items()
])


In [34]:
lookalike_df.to_csv("Lookalike.csv", index=False)

In [36]:
print("Lookalike Results for First 20 Customers:")
print("Lookalike Model Complete. Recommendations saved to Lookalike.csv.")

Lookalike Results for First 20 Customers:
Lookalike Model Complete. Recommendations saved to Lookalike.csv.
