In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# --- Load the three raw tables ----------------------------------------------
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# --- Join transactions with customer and product attributes ------------------
# Both joins use pandas' default merge type (inner), so only transaction rows
# whose CustomerID and ProductID both find a match survive.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(transactions_with_customers, products, on="ProductID")

# --- Build one feature row per customer --------------------------------------
#   TotalValue -> sum of TotalValue over the customer's rows (total spending)
#   ProductID  -> number of distinct ProductID values (NOT a product identifier)
#   Category   -> most frequent Category; `mode()[0]` takes the first entry of
#                 the mode result when several categories tie
per_customer = merged_data.groupby("CustomerID")
customer_features = per_customer.agg({
    "TotalValue": "sum",
    "ProductID": "nunique",
    "Category": lambda categories: categories.mode()[0],
})
# Turn the CustomerID group key back into an ordinary column (0..n-1 row index).
customer_features = customer_features.reset_index()

# Columns of `customer_features` fed to the similarity model, grouped by how
# they are encoded. Note that 'ProductID' at this point holds the per-customer
# count of distinct products (the `nunique` aggregate), not a product identifier.
numerical_cols = ['TotalValue', 'ProductID']
categorical_cols = ['Category']

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical column, concatenating the results into one feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical features
        ('cat', OneHotEncoder(), categorical_cols)  # Encode categorical features
    ],
    # OneHotEncoder produces a sparse matrix, and with the default threshold
    # (0.3) ColumnTransformer returns a SciPy sparse matrix whenever the stacked
    # result is sparse enough (i.e. once there are many categories). The feature
    # matrix is later wrapped in a pandas DataFrame, which needs a dense array,
    # so always request dense output.
    sparse_threshold=0,
)

# Fit the scaler/encoder on all customers and transform them in one step.
transformed_features = preprocessor.fit_transform(customer_features)

# Recover the one-hot column names from the fitted encoder.
# Prefer `get_feature_names_out` (the current scikit-learn API) and only fall
# back to `get_feature_names` on old releases that lack it. Checking in this
# order matters: scikit-learn 1.0/1.1 expose BOTH methods, and calling the
# legacy one there raises a deprecation warning; it was removed in 1.2.
cat_encoder = preprocessor.named_transformers_['cat']
if hasattr(cat_encoder, 'get_feature_names_out'):
    cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
else:
    cat_feature_names = cat_encoder.get_feature_names(categorical_cols)

# Column order mirrors the ColumnTransformer output: numeric block first,
# then the one-hot block.
columns = numerical_cols + list(cat_feature_names)

# Convert to DataFrame
# NOTE(review): `np` is not used in this cell; the import is kept in place in
# case later cells rely on it. Consider moving it to the top import block.
import numpy as np
normalized_features = pd.DataFrame(transformed_features, columns=columns)

# Pairwise cosine similarity between every pair of customers; row/column i
# corresponds to row i of `normalized_features` (and of `customer_features`).
similarity_matrix = cosine_similarity(normalized_features)

# Generate recommendations for the first 20 customers.
# `recommendations` maps CustomerID -> list of up to 3 (similar CustomerID, score)
# tuples, ordered from most to least similar.
recommendations = {}
customer_ids = customer_features["CustomerID"]
for idx, customer_id in enumerate(customer_ids.iloc[:20]):  # First 20 customers
    # Rank all customers from most to least similar to this one.
    ranked = similarity_matrix[idx].argsort()[::-1]
    # Drop the customer itself by index rather than assuming it sorts first:
    # when another customer has an identical feature vector the similarities
    # tie, and blindly skipping position 0 could discard a real neighbour while
    # keeping the customer as its own lookalike.
    similar_indices = ranked[ranked != idx][:3]
    # Positional lookup (`iloc`) matches the matrix row numbering regardless of
    # the DataFrame's index labels. Scores are cast to built-in floats so they
    # serialise as plain numbers when the tuples are written out.
    recommendations[customer_id] = [
        (customer_ids.iloc[i], float(similarity_matrix[idx][i])) for i in similar_indices
    ]

# Lay the recommendations out as a table: one row per customer, with each of
# the three "Similar" columns holding a (CustomerID, similarity score) tuple.
# The dict keys become the row index, which is then promoted to a regular
# column and labelled "CustomerID".
lookalike_df = (
    pd.DataFrame.from_dict(
        recommendations,
        orient="index",
        columns=["Similar1", "Similar2", "Similar3"],
    )
    .reset_index()
    .rename(columns={"index": "CustomerID"})
)

# Write the table to disk without the row-number index.
lookalike_df.to_csv("Shruti_Pande_Lookalike.csv", index=False)

print("Shruti_Pande_Lookalike.csv has been created with the top 3 recommendations for the first 20 customers.")


Shruti_Pande_Lookalike.csv has been created with the top 3 recommendations for the first 20 customers.
