In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load datasets
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')
customers_df = pd.read_csv('Customers.csv')


In [4]:
#merging datasets for analysis
merged_df = transactions_df.merge(products_df, on="ProductID").merge(customers_df, on="CustomerID")

In [5]:
print(merged_df)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue  Price_x                      ProductName     Category  \
0        300.68   300.68  ComfortLiving Bluetooth

In [7]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   ProductName      1000 non-null   object 
 8   Category         1000 non-null   object 
 9   Price_y          1000 non-null   float64
 10  CustomerName     1000 non-null   object 
 11  Region           1000 non-null   object 
 12  SignupDate       1000 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [8]:
merged_df.isnull().sum()

Unnamed: 0,0
TransactionID,0
CustomerID,0
ProductID,0
TransactionDate,0
Quantity,0
TotalValue,0
Price_x,0
ProductName,0
Category,0
Price_y,0


In [9]:
merged_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,1000.0,2.537,1.117981,1.0,2.0,3.0,4.0,4.0
TotalValue,1000.0,689.99556,493.144478,16.08,295.295,588.88,1011.66,1991.04
Price_x,1000.0,272.55407,140.73639,16.08,147.95,299.93,404.4,497.76
Price_y,1000.0,272.55407,140.73639,16.08,147.95,299.93,404.4,497.76


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Prepare customer profile dataset
customer_profile = merged_df.groupby("CustomerID").agg({
    "Region": "first",
    "TotalValue": "sum",
    "Quantity": "sum",
    "Category": lambda x: x.mode()[0]
}).reset_index()

In [12]:
 #Encode categorical features
customer_profile = pd.get_dummies(customer_profile, columns=["Region", "Category"], drop_first=True)

In [13]:
# Standardize numerical columns
scaler = StandardScaler()
customer_profile[["TotalValue", "Quantity"]] = scaler.fit_transform(customer_profile[["TotalValue", "Quantity"]])


In [17]:
# Evaluation of Lookalike Model
def evaluate_lookalike_model(input_customer_id, customer_profile, num_recommendations=3):
    # Get lookalike recommendations
    recommendations = get_lookalikes(input_customer_id, num_recommendations)

    print(f"Input Customer ID: {input_customer_id}")
    print(f"Top {num_recommendations} Recommendations:")
    print(recommendations)

    # Validate recommendations
    input_customer = customer_profile[customer_profile["CustomerID"] == input_customer_id]
    print("\nInput Customer Profile:")
    print(input_customer)

    for index, row in recommendations.iterrows():
        recommended_customer = customer_profile[customer_profile["CustomerID"] == row["CustomerID"]]
        print(f"\nRecommended Customer ID: {row['CustomerID']}")
        print("Profile Comparison:")
        print(recommended_customer)
        print(f"Similarity Score: {row['Similarity']:.2f}")
        print("-" * 50)

    # Check similarity trends
    scores = recommendations["Similarity"].values
    print("\nSimilarity Scores of Recommendations:", scores)
    if all(scores[i] >= scores[i + 1] for i in range(len(scores) - 1)):
        print("✅ Similarity scores decrease logically with rank.")
    else:
        print("⚠️ Similarity scores do not decrease logically.")


In [19]:
# Evaluation metrics
evaluate_lookalike_model("C0001", customer_profile, num_recommendations=3)

Input Customer ID: C0001
Top 3 Recommendations:
    CustomerID  Similarity
182      C0184    0.995605
47       C0048    0.987797
188      C0190    0.975670

Input Customer Profile:
  CustomerID  TotalValue  Quantity  Region_Europe  Region_North America  \
0      C0001   -0.061701 -0.122033          False                 False   

   Region_South America  Category_Clothing  Category_Electronics  \
0                  True              False                  True   

   Category_Home Decor  Similarity  
0                False         1.0  

Recommended Customer ID: C0184
Profile Comparison:
    CustomerID  TotalValue  Quantity  Region_Europe  Region_North America  \
182      C0184   -0.040553 -0.285017          False                 False   

     Region_South America  Category_Clothing  Category_Electronics  \
182                  True              False                  True   

     Category_Home Decor  Similarity  
182                False    0.995605  
Similarity Score: 1.00
--------

In [22]:
customer_profile.to_csv('final_data.csv', index=False)
customer_profile.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor,Similarity
0,C0001,-0.061701,-0.122033,False,False,True,False,True,False,1.0
1,C0002,-0.877744,-0.448,False,False,False,True,False,False,0.074022
2,C0003,-0.405857,0.203934,False,False,True,False,False,True,0.566118
3,C0004,1.032547,1.670787,False,False,True,False,False,False,0.256176
4,C0005,-0.783929,-0.936951,False,False,False,False,True,False,0.600035
