In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Read the three source tables from CSV files in the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Preview the first five rows of each table; a rule of 100 asterisks is
# printed on its own line between consecutive previews.
separator = '*' * 100
print(
    customers.head(5),
    products.head(5),
    transactions.head(5),
    sep="\n" + separator + "\n",
)

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
****************************************************************************************************
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
****************************************************************************************************
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001    

In [23]:
# Summarize each table (row count, column dtypes, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() emitted a stray "None" line after
# every report.
customers.info()
products.info()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

In [24]:

# Merge datasets: attach product attributes to every transaction, then attach
# customer attributes. Both joins are left joins keyed on the transaction side.
# Transactions and products each carry a "Price" column, so the first join
# produces the suffixed pair Price_x / Price_y.
# NOTE(review): `transactions` is rebound to the merged frame, so running this
# cell twice on the same kernel merges the product columns a second time.
transactions = pd.merge(transactions, products, how="left", on="ProductID")
customer_data = pd.merge(transactions, customers, how="left", on="CustomerID")
print(customer_data.head(5))

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [25]:
# Feature engineering: collapse the transaction-level table into one row per
# CustomerID holding four numeric aggregates.
per_customer = customer_data.groupby("CustomerID")
customer_profiles = per_customer.agg(
    TotalSpending=pd.NamedAgg(column="TotalValue", aggfunc="sum"),         # sum of TotalValue
    AvgTransactionValue=pd.NamedAgg(column="TotalValue", aggfunc="mean"),  # mean of TotalValue
    TotalQuantity=pd.NamedAgg(column="Quantity", aggfunc="sum"),           # sum of Quantity
    UniqueProducts=pd.NamedAgg(column="ProductID", aggfunc="nunique"),     # distinct ProductIDs
)
# Move CustomerID out of the index so it becomes the first regular column.
customer_profiles = customer_profiles.reset_index()




In [26]:
# Normalization: rescale every engineered feature to the [0, 1] range so that
# no single feature dominates the Euclidean distance used for similarity.
#
# Features are selected by name (everything except the CustomerID key). The
# previous positional slice `iloc[:, 1:-1]` skipped the key column but also
# silently dropped the last feature column (UniqueProducts), so that feature
# never reached the similarity computation.
feature_columns = customer_profiles.columns.drop("CustomerID")

scaler = MinMaxScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(customer_profiles[feature_columns]),
    columns=feature_columns,
    # Reuse the source index so the CustomerID assignment below aligns row for
    # row even if customer_profiles does not carry a default RangeIndex.
    index=customer_profiles.index,
)
# CustomerID is appended last; the scaled features occupy all earlier columns.
normalized_features["CustomerID"] = customer_profiles["CustomerID"]



In [27]:
# Compute similarity: for every customer, find the 3 most similar other
# customers in normalized feature space.
#
# The pairwise distance matrix is computed once with numpy broadcasting
# instead of nesting two DataFrame.iterrows() loops, which performed Series
# arithmetic on object-dtype rows for every customer pair. Features are picked
# by name, so the result no longer depends on CustomerID being the last column.
feature_matrix = normalized_features.drop(columns="CustomerID").to_numpy(dtype=float)
customer_ids = normalized_features["CustomerID"].to_numpy()

# (n, 1, d) - (1, n, d) -> (n, n, d) differences; reducing the last axis gives
# the (n, n) Euclidean distance matrix. Memory grows with n^2 * d, which is
# fine for a few hundred customers but would need chunking for very large n.
pairwise_diff = feature_matrix[:, None, :] - feature_matrix[None, :, :]
distances = np.sqrt((pairwise_diff ** 2).sum(axis=-1))

# Map distance in [0, inf) to similarity in (0, 1]; identical profiles score 1.
similarities = 1 / (1 + distances)

similarity_matrix = {}
for row, customer_id in enumerate(customer_ids):
    # Exclude the customer itself by ID, keeping candidates in table order so
    # the stable sort below breaks ties the same way as a row-by-row scan.
    scores = [
        (other_id, similarities[row, col])
        for col, other_id in enumerate(customer_ids)
        if other_id != customer_id
    ]
    # Highest similarity first; keep the top 3 (fewer if fewer candidates exist).
    top_3 = sorted(scores, key=lambda pair: pair[1], reverse=True)[:3]
    similarity_matrix[customer_id] = top_3

In [28]:
# Create the lookalike CSV: one row per customer, followed by its top-3
# lookalikes as alternating (ID, score) columns.
output_path = "Savitha_Lookalike.csv"
output_columns = ["CustomerID", "Lookalike1", "Score1", "Lookalike2", "Score2", "Lookalike3", "Score3"]

lookalike_data = []
for cust_id, top_3 in similarity_matrix.items():
    row = [cust_id]
    for similar_cust, score in top_3:
        row.extend((similar_cust, score))
    # Pad rows with fewer than three matches so every row has one value per
    # column; otherwise the DataFrame constructor can reject the fixed header.
    row.extend([None] * (len(output_columns) - len(row)))
    lookalike_data.append(row)

lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)
lookalike_df.to_csv(output_path, index=False)

# Report the file that was actually written (the message previously named
# "Lookalike.csv", which is not the path passed to to_csv).
print(f"{output_path} file created successfully!")

Lookalike.csv file created successfully!
