In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Direct-download endpoints (Google Drive) for the three CSV inputs.
customers_link = "https://drive.google.com/uc?export=download&id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
products_link = "https://drive.google.com/uc?export=download&id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
transactions_link = "https://drive.google.com/uc?export=download&id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

# Fetch each CSV over the network, in the order customers -> products -> transactions,
# and bind every result to its own DataFrame.
customers, products, transactions = [
    pd.read_csv(url)
    for url in (customers_link, products_link, transactions_link)
]

# Print a heading followed by the first five rows of each table.
# Headings after the first start with "\n" so the previews are separated by a blank line.
previews = (
    ("Customers Data:", customers),
    ("\nProducts Data:", products),
    ("\nTransactions Data:", transactions),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())


Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [3]:
# Parse the date columns from text into pandas datetimes.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Enrich every transaction with its customer's and its product's attributes.
#   * how='left' keeps every transaction row, even if its key has no match in
#     the lookup table.
#   * validate='many_to_one' makes pandas raise a MergeError when CustomerID
#     (in `customers`) or ProductID (in `products`) is not unique. Without it,
#     a duplicated key would silently multiply transaction rows and inflate
#     every per-customer sum computed from `merged_data`.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left', validate='many_to_one')
    .merge(
        products,
        on='ProductID',
        how='left',
        suffixes=('_Transaction', '_Product'),
        validate='many_to_one',
    )
)

# Non-key columns that exist on both sides of the second merge get the suffixes
# above. `products` has a 'Price' column and later cells read 'Price_Product',
# which assumes `transactions` also carries a 'Price' column (it then becomes
# 'Price_Transaction') -- TODO confirm against the transactions schema.

In [5]:
# Importing necessary libraries for similarity calculation and normalization
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Tunable parameters for the lookalike export.
N_TARGET_CUSTOMERS = 20   # recommendations are produced for the first N customers
TOP_N_LOOKALIKES = 3      # number of lookalikes stored per customer

# One feature row per customer, aggregated from the merged transaction table.
# Named aggregation sets the output column names directly, so they cannot get
# out of sync with the aggregation order.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=('TotalValue', 'sum'),          # total spending
        TotalQuantity=('Quantity', 'sum'),         # total number of items bought
        AvgPrice=('Price_Product', 'mean'),        # mean catalogue price of purchased products
        UniqueProducts=('ProductID', 'nunique'),   # number of distinct products bought
    )
    .reset_index()
)

# Rescale every feature with Min-Max scaling so no single feature dominates
# the similarity purely because of its units.
feature_columns = ['TotalSpent', 'TotalQuantity', 'AvgPrice', 'UniqueProducts']
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(customer_features[feature_columns])

# Pairwise cosine similarity between customers, labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Top lookalikes for the first N_TARGET_CUSTOMERS customers.
lookalike_dict = {}
for customer in customer_features['CustomerID'][:N_TARGET_CUSTOMERS]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: on a tie (or if the self-similarity is not the maximum) a
    # positional [1:4] slice would drop a real lookalike and could list the
    # customer as its own lookalike.
    similar_customers = (
        similarity_df[customer]
        .drop(labels=customer)
        .sort_values(ascending=False)
        .iloc[:TOP_N_LOOKALIKES]
    )
    # .tolist() yields plain Python floats, so the stringified tuples written
    # to the CSV read "('C0002', 0.98)" rather than "np.float64(0.98)" on
    # NumPy versions whose scalar repr includes the type name.
    lookalike_dict[customer] = list(zip(similar_customers.index, similar_customers.tolist()))

# Persist one row per customer: its ID and the stringified list of
# (lookalike CustomerID, similarity score) tuples.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_dict),
    "Lookalikes": [str(pairs) for pairs in lookalike_dict.values()],
})
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)
