1. Import Libraries and Load Data

In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Preview the data
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

2. Data Preparation

In [16]:
# Merge transactions with products to get product details
transactions = transactions.merge(products, on="ProductID", how="left")

# Merge transactions with customers to get customer profiles
data = transactions.merge(customers, on="CustomerID", how="left")

# Preview the merged data
print(data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

3. Feature Engineering

In [17]:
# Aggregating transaction history for each customer
customer_features = data.groupby("CustomerID").agg({
    "TotalValue": "sum",                   # Total spend
    "TransactionID": "count",              # Number of transactions
    "Category": lambda x: x.mode()[0],     # Most frequent category
    "Price_y": "mean",                       # Average product price
}).reset_index()

# Merge customer data with region and signup information
customer_features = customer_features.merge(customers, on="CustomerID", how="left")

# Convert SignupDate to number of days since signup
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
customer_features['DaysSinceSignup'] = (pd.Timestamp.now() - customer_features['SignupDate']).dt.days

# Drop unnecessary columns
customer_features = customer_features.drop(columns=["SignupDate"])

# Encode categorical features (Region, Category)
customer_features = pd.get_dummies(customer_features, columns=["Region", "Category"], drop_first=True)

# Preview the processed customer features
print(customer_features.head())

  CustomerID  TotalValue  TransactionID     Price_y        CustomerName  \
0      C0001     3354.52              5  278.334000    Lawrence Carroll   
1      C0002     1862.74              4  208.920000      Elizabeth Lutz   
2      C0003     2725.38              4  195.707500      Michael Rivera   
3      C0004     5354.88              8  240.636250  Kathleen Rodriguez   
4      C0005     2034.24              3  291.603333         Laura Weber   

   DaysSinceSignup  Region_Europe  Region_North America  Region_South America  \
0              932          False                 False                  True   
1             1079          False                 False                 False   
2              326          False                 False                  True   
3              841          False                 False                  True   
4              896          False                 False                 False   

   Category_Clothing  Category_Electronics  Category_Home Deco

4. Normalize Features

In [18]:
# Select numeric columns for normalization
numeric_cols = customer_features.select_dtypes(include=[np.number]).columns

# Normalize features
scaler = MinMaxScaler()
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

# Preview normalized features
print(customer_features.head())

  CustomerID  TotalValue  TransactionID   Price_y        CustomerName  \
0      C0001    0.308942            0.4  0.519414    Lawrence Carroll   
1      C0002    0.168095            0.3  0.367384      Elizabeth Lutz   
2      C0003    0.249541            0.3  0.338446      Michael Rivera   
3      C0004    0.497806            0.7  0.436848  Kathleen Rodriguez   
4      C0005    0.184287            0.2  0.548476         Laura Weber   

   DaysSinceSignup  Region_Europe  Region_North America  Region_South America  \
0         0.842204          False                 False                  True   
1         0.979458          False                 False                 False   
2         0.276377          False                 False                  True   
3         0.757236          False                 False                  True   
4         0.808590          False                 False                 False   

   Category_Clothing  Category_Electronics  Category_Home Decor  
0       

5. Compute Similarity and Generate Recommendations

In [19]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(customer_features[numeric_cols])
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Generate recommendations for the first 20 customers
lookalike_map = {}
for customer in customer_features['CustomerID'][:20]:
    # Sort customers by similarity score (excluding the customer themselves)
    similar_customers = similarity_df[customer].sort_values(ascending=False)[1:4]
    # Save results in the map
    lookalike_map[customer] = list(zip(similar_customers.index, similar_customers.values))

# Preview lookalike map
print(lookalike_map)

{'C0001': [('C0118', 0.9990130770771567), ('C0086', 0.9989006986961401), ('C0027', 0.9985506037811228)], 'C0002': [('C0029', 0.9997461168235259), ('C0025', 0.9991195354649179), ('C0071', 0.9978931181971934)], 'C0003': [('C0155', 0.9992208439543775), ('C0022', 0.9979234640638094), ('C0067', 0.9978503910881186)], 'C0004': [('C0175', 0.998742182825167), ('C0169', 0.9983889601509128), ('C0173', 0.9971276130655868)], 'C0005': [('C0131', 0.9996674875618617), ('C0132', 0.9984078821465742), ('C0007', 0.9972453857440968)], 'C0006': [('C0026', 0.993392906623897), ('C0129', 0.9928944983800732), ('C0016', 0.9926021207453927)], 'C0007': [('C0074', 0.9991569773188048), ('C0120', 0.9977576771953769), ('C0132', 0.9976219026354066)], 'C0008': [('C0017', 0.987542008174566), ('C0156', 0.983257513533744), ('C0194', 0.9804741295898928)], 'C0009': [('C0042', 0.990632667291045), ('C0186', 0.9905278994134156), ('C0128', 0.9885222817192333)], 'C0010': [('C0038', 0.9991272518426463), ('C0134', 0.997718084982142

6. Save Recommendations to Lookalike.csv

In [20]:
# Convert lookalike map to a DataFrame
lookalike_list = [{"CustomerID": k, "Lookalikes": v} for k, v in lookalike_map.items()]
lookalike_df = pd.DataFrame(lookalike_list)

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to Lookalike.csv!")

Lookalike recommendations saved to Lookalike.csv!
