In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [37]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [38]:
# aggregating transaction data
customer_transactions = transactions.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    AvgOrderValue=('TotalValue', 'mean'),
    TotalOrders=('TransactionID', 'count'),
    UniqueProducts=('ProductID', 'nunique'),
    TotalQuantity=('Quantity', 'sum')
).reset_index()

In [39]:
customer_profile = pd.merge(customers, customer_transactions, on='CustomerID', how='left')

In [40]:
customer_profile.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalSpent,AvgOrderValue,TotalOrders,UniqueProducts,TotalQuantity
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,670.904,5.0,5.0,12.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,465.685,4.0,4.0,10.0
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,681.345,4.0,4.0,14.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,669.36,8.0,8.0,23.0
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,678.08,3.0,3.0,7.0


In [41]:
# feature engineering
numeric_features = ['TotalSpent', 'AvgOrderValue', 'TotalOrders', 'UniqueProducts', 'TotalQuantity']
scaler = StandardScaler()
customer_profile[numeric_features] = scaler.fit_transform(customer_profile[numeric_features])

In [59]:
customer_profile = pd.get_dummies(customer_profile, columns=['Region'], drop_first=False)
# customer_profile.to_csv('customer_profile.csv', index=False)

In [60]:
customer_profile.head()


Unnamed: 0,CustomerID,CustomerName,SignupDate,TotalSpent,AvgOrderValue,TotalOrders,UniqueProducts,TotalQuantity,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,2022-07-10,-0.061701,-0.070263,-0.011458,0.050047,-0.122033,0,0,0,1
1,C0002,Elizabeth Lutz,2022-02-13,-0.877744,-0.934933,-0.467494,-0.424204,-0.448,1,0,0,0
2,C0003,Michael Rivera,2024-03-07,-0.405857,-0.026271,-0.467494,-0.424204,0.203934,0,0,0,1
3,C0004,Kathleen Rodriguez,2022-10-09,1.032547,-0.076769,1.35665,1.472798,1.670787,0,0,0,1
4,C0005,Laura Weber,2022-08-15,-0.783929,-0.040028,-0.92353,-0.898455,-0.936951,1,0,0,0


In [74]:
customer_features = customer_profile[features]

# Check for any NaN values
print("NaN Count per Column:")
print(customer_features.isnull().sum())

# Check for non-numerical or problematic values
print("Data Types:")
print(customer_features.dtypes)

# Check for infinite values
print("Infinite Values:")
print(np.isinf(customer_features).sum())


NaN Count per Column:
TotalSpent              1
AvgOrderValue           1
TotalOrders             1
UniqueProducts          1
TotalQuantity           1
Region_Asia             0
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64
Data Types:
TotalSpent              float64
AvgOrderValue           float64
TotalOrders             float64
UniqueProducts          float64
TotalQuantity           float64
Region_Asia               uint8
Region_Europe             uint8
Region_North America      uint8
Region_South America      uint8
dtype: object
Infinite Values:
TotalSpent              0
AvgOrderValue           0
TotalOrders             0
UniqueProducts          0
TotalQuantity           0
Region_Asia             0
Region_Europe           0
Region_North America    0
Region_South America    0
dtype: int64


In [75]:
customer_features.fillna(0, inplace=True)
customer_features.replace([np.inf, -np.inf], 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_features.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_features.replace([np.inf, -np.inf], 0, inplace=True)


In [76]:
customer_features = customer_features.astype('float64')
assert all(customer_features.dtypes == 'float64'), "Non-numeric columns found!"

In [None]:
# creating similarity matrix
similarity_matrix = cosine_similarity(customer_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile['CustomerID'], columns=customer_profile['CustomerID'])

In [78]:
# top 3 similar customers for each customer
lookalikes = {}
for customer_id in customer_profile['CustomerID']:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]
    lookalikes[customer_id] = [(similar_customer, score) for similar_customer, score in similar_customers.items()]

In [79]:
# creating lookalike csv file
lookalike_list = []

for customer_id, similar_list in lookalikes.items():
    lookalike_list.append({
        'cust_id': customer_id,
        'similar_customers': similar_list
    })

lookalike_df = pd.DataFrame(lookalike_list)

lookalike_df.head(20).to_csv('PremCharan_Pampana_Lookalike.csv', index=False) 