In [89]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [91]:
customers = pd.read_csv(r"C:\Users\admin\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\admin\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\admin\Downloads\Transactions.csv")

In [93]:
# Merging transactions with customer
data = transactions.merge(customers, on="CustomerID", how="left")
data = data.merge(products, on="ProductID", how="left")

In [95]:
# Aggregate transaction data per customer
customer_features = data.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total amount spent
    "Quantity": "sum",    # Total quantity purchased
    "ProductID": "count"  # Total transactions
}).rename(columns={"ProductID": "TransactionCount"})

In [97]:
customer_features

Unnamed: 0_level_0,TotalValue,Quantity,TransactionCount
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0001,3354.52,12,5
C0002,1862.74,10,4
C0003,2725.38,14,4
C0004,5354.88,23,8
C0005,2034.24,7,3
...,...,...,...
C0196,4982.88,12,4
C0197,1928.65,9,3
C0198,931.83,3,2
C0199,1979.28,9,4


In [99]:
# Add customer profile information, including Region
customer_features = customer_features.merge(
    customers[["CustomerID", "Region"]].set_index("CustomerID"),
    left_index=True,
    right_index=True,
    how="left"
)
# Check if Region is successfully added
print(customer_features.head())

            TotalValue  Quantity  TransactionCount         Region
CustomerID                                                       
C0001          3354.52        12                 5  South America
C0002          1862.74        10                 4           Asia
C0003          2725.38        14                 4  South America
C0004          5354.88        23                 8  South America
C0005          2034.24         7                 3           Asia


In [103]:
from sklearn.preprocessing import LabelEncoder

In [105]:
label_encoder = LabelEncoder()

# label encoding to Region column
customer_features["Region"] = label_encoder.fit_transform(customer_features["Region"])

# View the encoded values
print(customer_features[["Region"]].head())

            Region
CustomerID        
C0001            3
C0002            0
C0003            3
C0004            3
C0005            0


In [109]:
columns_to_drop = ["CustomerName", "SignupDate"]
customer_features = customer_features.drop(
    columns=[col for col in columns_to_drop if col in customer_features.columns],
    errors="ignore"
)

In [111]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)

In [113]:
similarity_matrix = cosine_similarity(scaled_features)

In [115]:
# Map customer IDs to indices
customer_ids = customer_features.index.tolist()
customer_idx_map = {idx: cust_id for idx, cust_id in enumerate(customer_ids)}

In [117]:
# Create the lookalike recommendations
lookalike_results = {}
for idx, cust_id in enumerate(customer_ids[:20]):  # For first 20 customers (C0001 - C0020)
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    # Sort by similarity score in descending order, exclude the customer itself
    similar_customers = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]
    # Store the top 3 similar customers with scores
    lookalike_results[cust_id] = [(customer_idx_map[i], round(score, 4)) for i, score in similar_customers]

In [119]:
#Lookalike.csv
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes} 
    for cust_id, lookalikes in lookalike_results.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

In [121]:
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0107, 0.993), (C0137, 0.9918), (C0191, 0.98..."
1,C0002,"[(C0142, 0.9911), (C0177, 0.9764), (C0088, 0.9..."
2,C0003,"[(C0190, 0.9403), (C0133, 0.9366), (C0174, 0.9..."
3,C0004,"[(C0113, 0.9945), (C0165, 0.9822), (C0102, 0.9..."
4,C0005,"[(C0186, 0.9979), (C0159, 0.9975), (C0007, 0.9..."
