In [6]:
!pip install scikit-learn



In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Read the three input CSV files from the working directory, in the order
# customers -> products -> transactions.
customers, products, transactions = [
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
]

In [9]:
# Attach the product columns to every transaction row. The left join keeps
# all transactions, including those whose ProductID has no match in products.
transactions = pd.merge(transactions, products, how="left", on="ProductID")

In [10]:
# Summarise the transactions of each customer into one row:
#   total_spent         - sum of TotalValue
#   total_transactions  - count of TransactionID
#   avg_spent_per_trans - mean of TotalValue
transactions_by_customer = transactions.groupby("CustomerID")
customer_transactions = transactions_by_customer.agg(
    total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    avg_spent_per_trans=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
)
# Move CustomerID from the index back into a regular column.
customer_transactions = customer_transactions.reset_index()

In [11]:
# Product preference: the category each customer bought from most often.
# Step 1: number of transaction rows per (customer, category) pair.
category_counts = (
    transactions.groupby(["CustomerID", "Category"])
    .size()
    .reset_index(name="count")
)
# Step 2: for each customer, the row label holding the largest count
# (idxmax returns the first such row when several categories tie).
top_rows = category_counts.groupby("CustomerID")["count"].idxmax()
# Step 3: keep only the customer id and the winning category.
top_category = category_counts.loc[top_rows, ["CustomerID", "Category"]]

In [12]:
# Enrich the customer profiles with the spending aggregates and the preferred
# category. Left joins keep every customer, so customers without transactions
# end up with NaN in the joined columns.
customers = (
    customers
    .merge(customer_transactions, how="left", on="CustomerID")
    .merge(top_category, how="left", on="CustomerID")
)

In [13]:
# Replace the Region and Category values with integer label codes.
# One encoder object is refit for each column in turn, so once this cell has
# run it holds the classes of the last column ("Category") only.
encoder = LabelEncoder()
for categorical_column in ("Region", "Category"):
    customers[categorical_column] = encoder.fit_transform(customers[categorical_column])

In [14]:
# Standardise the three spending features with StandardScaler and write the
# scaled values back over the original columns.
scaler = StandardScaler()
numeric_columns = ["total_spent", "total_transactions", "avg_spent_per_trans"]
customers[numeric_columns] = scaler.fit_transform(customers[numeric_columns])

In [16]:
# Handle missing values: drop only the customers that lack one of the columns
# used as similarity features. A bare dropna() would also discard customers
# whose only gap is in an unrelated profile column.
feature_columns = ["Region", "total_spent", "total_transactions", "avg_spent_per_trans", "Category"]
customers = customers.dropna(subset=feature_columns)

In [17]:
# Compute similarity using cosine similarity.
# Region and Category hold label codes: the integers are arbitrary identifiers,
# not magnitudes. Feeding them to cosine similarity as-is would treat code 3 as
# "three times" code 1 and, being unscaled, they would dominate the direction
# of each feature vector. Expand them into one-hot indicator columns so that
# two customers only gain similarity from these fields when the values match.
customer_features = pd.get_dummies(
    customers[["Region", "total_spent", "total_transactions", "avg_spent_per_trans", "Category"]],
    columns=["Region", "Category"],
    dtype=float,  # keep the whole matrix numeric/float for cosine_similarity
)
similarity_matrix = cosine_similarity(customer_features)

In [18]:
# Wrap the similarity matrix in a DataFrame whose row and column labels are
# both the CustomerID values, so scores can be looked up by customer id.
customer_ids = customers["CustomerID"]
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [21]:
# Get top 3 similar customers for first 20 customers
TOP_N_LOOKALIKES = 3      # lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # number of leading customers to report on

lookalike_results = {}

for customer in customers["CustomerID"][:N_TARGET_CUSTOMERS]:
    # Drop the customer's own entry so they are never their own lookalike.
    similar_customers = similarity_df[customer].drop(customer).nlargest(TOP_N_LOOKALIKES)
    # Cast scores to built-in floats: the list is written to the CSV through its
    # string form, and NumPy scalars may render as "np.float64(...)" there.
    lookalike_results[customer] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Convert to required format and save: one row per customer, with the list of
# (lookalike id, similarity score) pairs in the "lookalikes" column.
lookalike_df = pd.DataFrame(list(lookalike_results.items()), columns=["cust_id", "lookalikes"])
lookalike_df.to_csv("Vepuri_Richitha_Lookalike.csv", index=False)

print("Vepuri_Richitha_Lookalike.csv created successfully!")

Vepuri_Richitha_Lookalike.csv created successfully!
