In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(path)
    for path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Attach customer attributes, then product attributes, to every transaction.
# Left joins keep each transaction row even when a lookup finds no match.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
)

# Show the merged column names; non-key columns present in both frames come
# back with pandas' default _x / _y suffixes.
print("Columns in merged_data:", merged_data.columns)

def _most_frequent(values):
    """Return the most common non-null entry of *values*, or NaN if none exists.

    ``Series.mode()`` ignores nulls, so a group whose values are all missing
    (possible after the left merges when an ID has no match) yields an empty
    result. Indexing that with ``[0]`` would raise, aborting the whole
    aggregation; returning NaN keeps the customer row instead.
    """
    modes = values.mode()
    if modes.empty:
        # Same missing-value marker the left merge itself produces.
        return float("nan")
    # mode() returns its values sorted, so ties resolve to the first one.
    return modes.iloc[0]


# Per-customer aggregation spec. 'Price_y' is the price column that came from
# the products table in the merge (the transactions' own copy is 'Price_x').
agg_dict = {
    "TotalValue": "sum",         # total spend
    "Quantity": "sum",           # total units bought
    "Price_y": "mean",           # average product price across transactions
    "Region": _most_frequent,    # most common region value
    "Category": _most_frequent,  # most frequently purchased category
}

# Aggregate data by CustomerID: one row per customer, CustomerID as a column
customer_features = (
    merged_data.groupby("CustomerID").agg(agg_dict).reset_index()
)

# Rename 'Price_y' to 'Price' for clarity
customer_features = customer_features.rename(columns={"Price_y": "Price"})

# Expand Region and Category into indicator columns; drop_first=True omits
# the first level of each variable from the encoded output.
customer_features_encoded = pd.get_dummies(
    customer_features,
    columns=["Region", "Category"],
    drop_first=True,
)

# Standardise every remaining column; CustomerID is an identifier, not a
# feature, so it is removed before scaling.
scaler = StandardScaler()
feature_frame = customer_features_encoded.drop(columns="CustomerID")
scaled_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so a score can be looked up by ID pair.
customer_index = customer_features["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_index,
    columns=customer_index,
)

# Generate top 3 lookalike customers for each of the first 20 customers.
# lookalike_map: CustomerID -> list of (similar CustomerID, cosine score),
# ordered from most to least similar.
lookalike_map = {}
customer_ids = customers["CustomerID"].head(20)
for cust_id in customer_ids:
    if cust_id not in similarity_df.columns:
        # The similarity matrix only covers customers that appear in the
        # transactions; one without any has no feature vector, so there is
        # nothing to rank. Record an empty list instead of raising KeyError.
        lookalike_map[cust_id] = []
        continue

    # Remove the customer's own entry by label. Skipping position 0 of the
    # sorted scores is not reliable: ties or floating-point rounding can put
    # another customer first, which would leave self in the results.
    candidate_scores = similarity_df[cust_id].drop(labels=cust_id)
    similar_customers = candidate_scores.nlargest(3)
    lookalike_map[cust_id] = list(
        zip(similar_customers.index, similar_customers.values)
    )

# Flatten the per-customer lookalike lists into one record per
# (customer, similar customer) pair.
lookalike_results = [
    {"cust_id": cust_id, "similar_cust_id": similar_id, "score": score}
    for cust_id, lookalikes in lookalike_map.items()
    for similar_id, score in lookalikes
]

# Write the records to Lookalike.csv without the DataFrame row index.
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv")



Columns in merged_data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
Lookalike recommendations saved to Lookalike.csv
