In [1]:
import pandas as pd

# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
transactions_with_customers = transactions.merge(customers, on="CustomerID", how="inner")
transactions_with_products = transactions_with_customers.merge(products, on="ProductID", how="inner")

# Display the merged dataset
print(transactions_with_products.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [2]:
# Total transaction value per customer
customer_features = transactions_with_products.groupby('CustomerID').agg(
    total_value=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    purchase_count=('TransactionID', 'count'),
    favorite_category=('Category', lambda x: x.mode()[0])
).reset_index()

# Encode categorical data (e.g., favorite_category)
customer_features = pd.get_dummies(customer_features, columns=['favorite_category'], drop_first=True)

print(customer_features.head())

  CustomerID  total_value  avg_transaction_value  purchase_count  \
0      C0001      3354.52                670.904               5   
1      C0002      1862.74                465.685               4   
2      C0003      2725.38                681.345               4   
3      C0004      5354.88                669.360               8   
4      C0005      2034.24                678.080               3   

   favorite_category_Clothing  favorite_category_Electronics  \
0                       False                           True   
1                        True                          False   
2                       False                          False   
3                       False                          False   
4                       False                           True   

   favorite_category_Home Decor  
0                         False  
1                         False  
2                          True  
3                         False  
4                         False  


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Normalize the feature values for better similarity calculation
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Compute similarity matrix
similarity_matrix = cosine_similarity(normalized_features)

# Create a DataFrame for easy lookup
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.237587  0.297523  0.551643  0.982007  0.543664   
C0002       0.237587  1.000000  0.216776  0.407129  0.190187  0.389977   
C0003       0.297523  0.216776  1.000000  0.487448  0.251260  0.519291   
C0004       0.551643  0.407129  0.487448  1.000000  0.409711  0.853729   
C0005       0.982007  0.190187  0.251260  0.409711  1.000000  0.480756   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.976909  0.353000  0.147746  0.226100  ...  0.568638  0.988990   
C0002       0.218185  0.268164  0.985878  0.999607  ...  0.418502  0.186913   
C0003       0.293457  0.893175  0.133818  0.205083  ...  0.517680  0.233008   
C0004       0.462965  0.683632  0.256351  0.391507  ...  0.978783  0.428656   
C0005  

In [4]:
def recommend_lookalikes(similarity_df, customer_id, top_n=3):
    # Get similarity scores for the given customer
    scores = similarity_df[customer_id].sort_values(ascending=False)
    
    # Exclude the customer itself
    scores = scores[scores.index != customer_id]
    
    # Get the top N similar customers
    return scores.head(top_n)

# Example: Get recommendations for CustomerID 'C0001'
recommended_customers = recommend_lookalikes(similarity_df, 'C0001', top_n=3)
print(recommended_customers)

CustomerID
C0072    0.998777
C0190    0.998657
C0048    0.997883
Name: C0001, dtype: float64


In [6]:
# Prepare lookalike recommendations for the first 20 customers
lookalike_results = {}
for customer_id in customer_features['CustomerID'][:20]:
    recommendations = recommend_lookalikes(similarity_df, customer_id)
    lookalike_results[customer_id] = recommendations.to_dict()

# Convert results to a DataFrame
lookalike_df = pd.DataFrame([
    {'cust_id': customer_id, 'lookalikes': str(recommendations)}
    for customer_id, recommendations in lookalike_results.items()
])

# Save to CSV
lookalike_df.to_csv("Smruthika_B_J_Lookalike.csv", index=False)
print("Smruthika_B_J_Lookalike.csv created!")


Smruthika_B_J_Lookalike.csv created!
