# Loading Libraries and Data

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Read the three raw tables; the CSV files are expected in the working directory
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach customer attributes, then product attributes, to every transaction row.
# Both joins use pandas' default (inner) merge, so rows whose key has no match
# in the other table are dropped.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Feature Engineering

In [2]:
# Spend per customer: TotalValue summed over all of that customer's rows
total_spending = (
    data.groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpending")
)

In [3]:
# Number of distinct TransactionID values recorded for each customer
transaction_count = (
    data.groupby("CustomerID")["TransactionID"]
    .nunique()
    .rename("TransactionCount")
)

In [4]:
# Summed Quantity per customer and product category, pivoted so that each
# category becomes its own column (0 where a customer has no rows for it).
# A customer's most popular category is the column holding the largest value.
popular_category = (
    data.groupby(["CustomerID", "Category"])["Quantity"]
    .sum()
    .unstack(fill_value=0)
)

In [5]:
# One-hot encode the Region column; the resulting frame is keyed by CustomerID
region_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
region_matrix = region_encoder.fit_transform(customers[["Region"]])
region_encoded = pd.DataFrame(
    region_matrix,
    columns=region_encoder.get_feature_names_out(["Region"]),
    index=customers["CustomerID"],
)

In [6]:
# Place every per-customer feature block side by side, aligned on the
# CustomerID index; a customer missing from a block gets 0 in its columns
feature_blocks = [total_spending, transaction_count, popular_category, region_encoded]
customer_features = pd.concat(feature_blocks, axis="columns").fillna(0)

# Normalizing features

In [7]:
# Fit a StandardScaler (default settings) on the feature table, then apply it
# to that same table to obtain the scaled feature array
scaler = StandardScaler().fit(customer_features)
normalized_features = scaler.transform(customer_features)

## Testing Different Lookalike Models

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

# 1. Cosine similarity between every pair of customers
cosine_sim_matrix = cosine_similarity(normalized_features)

# 2. Euclidean distance, turned into a similarity via 1 / (1 + d);
#    the added 1 keeps the denominator non-zero when d == 0
euclidean_dist_matrix = euclidean_distances(normalized_features)
euclidean_sim_matrix = 1 / (euclidean_dist_matrix + 1)

# 3. Manhattan distance, turned into a similarity the same way
manhattan_dist_matrix = manhattan_distances(normalized_features)
manhattan_sim_matrix = 1 / (manhattan_dist_matrix + 1)

# Candidate similarity measures, keyed by the label used in the printed report
similarity_matrices = dict(zip(
    ("Cosine Similarity", "Euclidean Similarity", "Manhattan Similarity"),
    (cosine_sim_matrix, euclidean_sim_matrix, manhattan_sim_matrix),
))

# Validation settings: where the per-measure scores are collected, and the
# number of clusters requested from the clustering step
cluster_validation_scores = {}
num_clusters = 5

# Helper: blank out the main diagonal of a matrix
def zero_diagonal(matrix):
    """Overwrite the main diagonal of ``matrix`` with zeros, in place.

    The array passed in is modified directly (no copy is made) and that same
    object is returned, so the call can be used inside a larger expression.
    """
    np.fill_diagonal(matrix, val=0)
    return matrix

def similarity_to_distance(matrix):
    """Convert similarities to distances as ``1 - similarity``, floored at 0.

    Floating-point rounding can push a similarity marginally above 1.0, which
    would turn into a tiny negative "distance". Negative entries are not valid
    distances, so the result is clamped to be non-negative.

    Parameters
    ----------
    matrix : array-like supporting ``1 - matrix`` (e.g. a NumPy array)
        Similarity values; the input is not modified.

    Returns
    -------
    Array of the same shape holding ``max(1 - similarity, 0)`` element-wise.
    """
    return np.maximum(1 - matrix, 0)

# Validation loop: cluster the customers under each similarity measure and
# record the silhouette score of the resulting clustering.
for sim_name, sim_matrix in similarity_matrices.items():
    # Work on distances (1 - similarity). The conversion produces a new array,
    # so the similarity matrix stored in `similarity_matrices` is not modified.
    distance_matrix = similarity_to_distance(sim_matrix)
    # Rounding can leave entries a hair below zero, and
    # silhouette_score(metric="precomputed") rejects negative distances.
    distance_matrix = np.clip(distance_matrix, 0, None)
    # Precomputed-metric scoring also requires an exactly-zero diagonal.
    distance_matrix = zero_diagonal(distance_matrix)

    # NOTE(review): KMeans treats each row of the distance matrix as a feature
    # vector, whereas the silhouette below reads the matrix as pairwise
    # distances. Confirm this mismatch is intended.
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4, so leaving it unset makes the scores version-dependent.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(distance_matrix)

    # Silhouette score computed directly from the pairwise distances
    silhouette_avg = silhouette_score(distance_matrix, cluster_labels, metric="precomputed")
    cluster_validation_scores[sim_name] = silhouette_avg
    print(f"Silhouette Score for {sim_name}: {silhouette_avg}")




Silhouette Score for Cosine Similarity: 0.4400647765397279
Silhouette Score for Euclidean Similarity: 0.0927872489133819




Silhouette Score for Manhattan Similarity: 0.06782067957500837


In [9]:
# Pick the similarity measure whose clustering achieved the highest silhouette score
best_method = max(
    cluster_validation_scores.items(),
    key=lambda name_and_score: name_and_score[1],
)[0]
print(f"\nBest Similarity Method based on Silhouette Score: {best_method}")


Best Similarity Method based on Silhouette Score: Cosine Similarity


## Cosine similarity has the highest silhouette score, so it is used as the lookalike model

In [10]:
# Pairwise cosine similarity between customers, labelled by CustomerID on both axes
similarity_matrix = cosine_similarity(normalized_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features.index,
    columns=customer_features.index,
)

# Generate lookalike recommendations
num_target_customers = 20  # recommendations cover the first 20 rows of `customers`
num_lookalikes = 3         # number of matches kept per customer

lookalike_results = {}
for cust_id in customers["CustomerID"].iloc[:num_target_customers]:
    # Rank every other customer by similarity; the customer's own entry is dropped
    similar_customers = (
        similarity_df[cust_id]
        .drop(cust_id)
        .sort_values(ascending=False)
        .head(num_lookalikes)
    )
    # float(...) converts the NumPy scalar to a plain Python float, so the text
    # written to the CSV reads 0.98 rather than np.float64(0.98) under NumPy >= 2.
    lookalike_results[cust_id] = [
        (other_cust, float(round(score, 2)))
        for other_cust, score in similar_customers.items()
    ]

# Save the results as Lookalike.csv: one row per customer, with the list of
# (CustomerID, score) pairs stored as its string representation
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_results),
    "Lookalikes": [str(matches) for matches in lookalike_results.values()],
})
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created successfully.")

Lookalike.csv has been created successfully.
