In [2]:
import os


# Project folder that holds the CSV inputs. The original location is kept as the
# default, but it can be overridden with the ECOMMERCE_PROJECT_DIR environment
# variable so the notebook is not tied to one machine's absolute path.
project_dir = os.environ.get("ECOMMERCE_PROJECT_DIR", "C:/eCommerce_Project")
os.chdir(project_dir)
print(os.getcwd())  # Confirm the working directory

C:\eCommerce_Project


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [5]:
# Read the three source tables from the current working directory
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Preview the first rows of each table under its own heading
previews = (
    ("Customers Data:", customers),
    ("\nProducts Data:", products),
    ("\nTransactions Data:", transactions),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [6]:
# Tables to audit, in the order their reports are printed
audited_frames = (customers, products, transactions)

# Per-column count of missing values for each table
for frame in audited_frames:
    print(frame.isnull().sum())

# Number of fully duplicated rows in each table
for frame in audited_frames:
    print(frame.duplicated().sum())

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64
0
0
0


In [7]:
# Clean each table by dropping exact duplicate rows and then filling any gaps.
# Reassignment is used instead of inplace=True so each statement reads as
# "input -> cleaned output" and the steps can be chained.

# Customers / products: forward fill (a missing cell takes the value from the row above)
customers = customers.drop_duplicates().ffill()
products = products.drop_duplicates().ffill()

# Transactions: missing values are imputed with 0.
# NOTE(review): fillna(0) applies to every column, so a missing ID or date would
# also become 0 -- confirm this is intended if the data ever contains gaps.
transactions = transactions.drop_duplicates().fillna(0)

In [9]:
# Attach customer attributes to every transaction (left join keeps all transaction rows)
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='left')

# Collapse the transactions into one row per customer:
# total spend, total units, average order value and number of distinct transactions
per_customer = transactions_with_customers.groupby('CustomerID')
customer_summary = per_customer.agg(
    total_spent=('TotalValue', 'sum'),
    total_quantity=('Quantity', 'sum'),
    avg_spent_per_purchase=('TotalValue', 'mean'),
    num_purchases=('TransactionID', 'nunique'),
).reset_index()

# Bring the customer's region back onto the summary table
region_lookup = customers[['CustomerID', 'Region']]
customer_summary = customer_summary.merge(region_lookup, on='CustomerID', how='left')

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Lookalike model: for each of the first N_TARGET_CUSTOMERS rows of customer_summary,
# find the TOP_N other customers whose spending profile is most similar
# (cosine similarity on standardised features), then save the result to Lookalike.csv.
N_TARGET_CUSTOMERS = 20  # how many customers (from the top of customer_summary) get lookalikes
TOP_N = 3                # lookalikes reported per customer

# Select features for similarity calculation
feature_columns = ['total_spent', 'total_quantity', 'avg_spent_per_purchase', 'num_purchases']
similarity_features = customer_summary[feature_columns]

# Normalizing the features so no single column dominates because of its scale
scaler = StandardScaler()
similarity_features_scaled = scaler.fit_transform(similarity_features)

# Pairwise cosine similarity between all customers
cosine_sim = cosine_similarity(similarity_features_scaled)

# Label rows and columns with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(cosine_sim, index=customer_summary['CustomerID'], columns=customer_summary['CustomerID'])

# For each target customer, collect (lookalike_id, score) pairs
lookalike_results = {}
for customer_id in customer_summary['CustomerID'].head(N_TARGET_CUSTOMERS):
    # Remove the customer's own entry by label before ranking. Relying on it being
    # the first result of nlargest() breaks when another customer ties with the
    # self-similarity score: the wrong row would be discarded and the customer
    # could be reported as their own lookalike.
    similar_customers = similarity_df[customer_id].drop(customer_id).nlargest(TOP_N)
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Flatten the per-customer lists into one row per (customer, lookalike) pair
flattened_results = [
    [customer_id, lookalike_id, score]
    for customer_id, lookalikes in lookalike_results.items()
    for lookalike_id, score in lookalikes
]

# Creating the DataFrame for lookalike results
lookalike_df = pd.DataFrame(flattened_results, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Preview the first few rows of lookalike results
print(lookalike_df.head())

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0164         0.996031
1      C0001               C0103         0.981548
2      C0001               C0069         0.963423
3      C0002               C0029         0.999525
4      C0002               C0031         0.997756
