In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
customer_data = pd.read_csv('./data/Customers.csv')
product_data = pd.read_csv('./data/Products.csv')
transaction_data = pd.read_csv('./data/Transactions.csv')

In [17]:
#Mege Datas
data = transaction_data.merge(customer_data, on='CustomerID').merge(product_data, on='ProductID')

data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [9]:
#Create Customer Profiles

# Aggregate transaction data to create customer profiles
customer_profiles = data.groupby("CustomerID").agg({
    "TotalValue": "sum",#total spending
    "Quantity": "sum",#total quantity purchased
    "Category": lambda x: x.mode()[0],#most purchased Category
}).reset_index()

# customer_profiles.head()

# Add region data from customers.csv
customer_profiles = customer_profiles.merge(customer_data[["CustomerID", "Region"]], on="CustomerID")

customer_profiles.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Category,Region
0,C0001,3354.52,12,Electronics,South America
1,C0002,1862.74,10,Clothing,Asia
2,C0003,2725.38,14,Home Decor,South America
3,C0004,5354.88,23,Books,South America
4,C0005,2034.24,7,Electronics,Asia


In [13]:
#feature encoding
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the Category and Region columns
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(customer_profiles[["Category", "Region"]]).toarray()

# print(encoded_features)

# Combine encoded features with numerical data
numerical_features = customer_profiles[["TotalValue", "Quantity"]].values

# print(numerical_features)
features = np.hstack((numerical_features, encoded_features))
# print(features)
print("Feature shape:", features.shape)


Feature shape: (199, 10)


In [15]:
#Calculate Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity matrix
similarity_matrix = cosine_similarity(features)

# Convert similarity matrix into a DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles["CustomerID"], columns=customer_profiles["CustomerID"])

similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.999998,0.999999,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999994,...,1.0,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999999,0.999999,1.0
C0002,0.999998,1.0,1.0,0.999999,0.999998,0.999996,0.999997,0.999999,0.999997,0.999998,...,0.999998,0.999998,0.999998,0.999998,0.999999,0.999995,0.999999,0.999997,0.999999,0.999998
C0003,0.999999,1.0,1.0,1.0,0.999998,0.999997,0.999998,1.0,0.999997,0.999998,...,0.999999,0.999999,0.999999,0.999999,1.0,0.999996,0.999999,0.999997,0.999999,0.999998
C0004,1.0,0.999999,1.0,1.0,0.999999,0.999999,0.999999,1.0,0.999998,0.999996,...,1.0,1.0,1.0,1.0,1.0,0.999998,1.0,0.999998,1.0,0.999999
C0005,1.0,0.999998,0.999998,0.999999,1.0,1.0,1.0,0.999999,0.999999,0.999993,...,1.0,1.0,1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0


In [16]:
#generate recommendations

# Function to get top 3 similar customers for a given customer
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:top_n+1]
    return list(zip(similar_customers.index, similar_customers.values))
    

# Generate lookalike recommendations for the first 20 customers
lookalikes = {}
for customer_id in customer_profiles["CustomerID"][:20]:
    lookalikes[customer_id] = get_top_similar_customers(customer_id, similarity_df)


# Convert lookalikes into a DataFrame
lookalike_df = pd.DataFrame({
    "CustomerID": lookalikes.keys(),
    "Lookalikes": [str(v) for v in lookalikes.values()]
})


# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations saved to 'Lookalike.csv'")


Lookalike recommendations saved to 'Lookalike.csv'
