In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Read the three input tables (customers, products, transactions) from CSV.
# map() applies pd.read_csv to each path in order; the results are unpacked
# into the three DataFrames used by the rest of the notebook.
customers, products, transactions = map(
    pd.read_csv,
    (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    ),
)



In [7]:
# Build one row per transaction enriched with its customer and product columns.
# Both joins use pandas' default inner join, so a transaction is kept only when
# its CustomerID and ProductID are present in the respective lookup tables.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)



In [8]:
# Give the suffixed price columns produced by the merge descriptive names.
# Rebinding the result (rather than mutating in place) keeps the cell safe to re-run.
merged_data = merged_data.rename(
    columns={
        'Price_x': 'TransactionPrice',
        'Price_y': 'ProductPrice',
    }
)



In [9]:
# Feature engineering: collapse the transaction-level rows into one profile row
# per customer.
#   TotalValue   -> sum of TotalValue over the customer's transactions
#   Quantity     -> sum of Quantity over the customer's transactions
#   ProductPrice -> mean ProductPrice over the customer's transactions
#   Category     -> space-separated "document" of the categories purchased
customer_profile = (
    merged_data.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'ProductPrice': 'mean',
        # Missing categories are dropped so str.join never receives a NaN float.
        # Whitespace inside a single category name is replaced with '_' so that a
        # multi-word category stays ONE token when the document is later split
        # into words by a text vectorizer; single-word categories are unchanged.
        'Category': lambda x: ' '.join(
            '_'.join(str(category).split()) for category in x.dropna()
        ),
    })
    .reset_index()
)



In [10]:
# Vectorize each customer's category document with TF-IDF.
# The fitted vectorizer is kept in `tfidf`; `tfidf_matrix` holds one row per
# row of customer_profile.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(raw_documents=customer_profile['Category'])



In [11]:
# Assemble the final feature matrix: standardized numeric columns followed by
# the dense TF-IDF category columns, side by side (one row per customer).
scaler = StandardScaler()
numerical_features = customer_profile[['TotalValue', 'Quantity', 'ProductPrice']].to_numpy()
numerical_features_scaled = scaler.fit_transform(numerical_features)
combined_features = np.concatenate(
    [numerical_features_scaled, tfidf_matrix.toarray()],
    axis=1,
)



In [12]:
# Pairwise cosine similarity of every customer against every other customer;
# entry [i][j] compares row i and row j of combined_features.
similarity_matrix = cosine_similarity(X=combined_features)



In [13]:
# Generate lookalike recommendations for the first 20 rows of customer_profile
# (C0001 to C0020 when those IDs are the first 20 customer groups).
NUM_TARGET_CUSTOMERS = 20  # how many customers to produce recommendations for
TOP_K = 3                  # how many lookalikes to keep per customer

lookalike_results = {}
for i, customer_id in enumerate(customer_profile['CustomerID'][:NUM_TARGET_CUSTOMERS]):
    # Rank every customer by descending similarity. A stable sort on the negated
    # scores makes ties resolve deterministically (lowest row index first).
    ranked_indices = np.argsort(-similarity_matrix[i], kind='stable')
    # Exclude the customer's own row by index instead of assuming it is ranked
    # first: floating-point rounding, exact ties with another customer, or a
    # zero-norm feature row can all move the self-similarity out of the top spot,
    # which would otherwise list the customer as their own lookalike.
    similar_indices = [j for j in ranked_indices if j != i][:TOP_K]
    # Pair each lookalike's CustomerID with its similarity score.
    similar_customers = [
        (customer_profile['CustomerID'].iloc[j], similarity_matrix[i][j])
        for j in similar_indices
    ]
    lookalike_results[customer_id] = similar_customers



In [14]:
# Flatten the {customer -> [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair, then write the table to CSV without the index.
lookalike_data = [
    {'CustomerID': source_id, 'SimilarCustomerID': match_id, 'Score': match_score}
    for source_id, matches in lookalike_results.items()
    for match_id, match_score in matches
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Rohan_Duppala_Lookalike.csv', index=False)

