In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Build one flat table: every transaction row enriched with its product's
# columns (joined on ProductID) and its customer's columns (joined on
# CustomerID).  Left joins keep every transaction even when a lookup misses.
#
# The merges are chained into `df` instead of re-assigning `transactions`, so
# the raw transactions frame stays untouched and this cell can be re-run safely
# (re-merging an already-merged frame would produce suffixed duplicate columns).
df = (
    transactions
    .merge(products, on='ProductID', how='left')
    .merge(customers, on='CustomerID', how='left')
)

Feature Extraction

In [4]:
# 1. Per-customer transaction count, total spend and average transaction value
customer_features = (
    df.groupby('CustomerID')
    .agg(
        total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        total_spending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
    .reset_index()
)

In [5]:
# Plain-text dump of the per-customer aggregates built in the previous cell.
print(customer_features)

    CustomerID  total_transactions  total_spending  avg_transaction_value
0        C0001                   5         3354.52             670.904000
1        C0002                   4         1862.74             465.685000
2        C0003                   4         2725.38             681.345000
3        C0004                   8         5354.88             669.360000
4        C0005                   3         2034.24             678.080000
..         ...                 ...             ...                    ...
194      C0196                   4         4982.88            1245.720000
195      C0197                   3         1928.65             642.883333
196      C0198                   2          931.83             465.915000
197      C0199                   4         1979.28             494.820000
198      C0200                   5         4758.60             951.720000

[199 rows x 4 columns]


In [6]:
# 2. Most purchased product category
# First step: total quantity bought by each customer within each category.
top_category = (
    df.groupby(['CustomerID', 'Category'])
    .agg(total=pd.NamedAgg(column='Quantity', aggfunc='sum'))
    .reset_index()
)

In [7]:
# Inspect the per-(customer, category) quantity totals.
top_category

Unnamed: 0,CustomerID,Category,total
0,C0001,Books,2
1,C0001,Electronics,7
2,C0001,Home Decor,3
3,C0002,Clothing,4
4,C0002,Home Decor,6
...,...,...,...
562,C0199,Home Decor,6
563,C0200,Books,4
564,C0200,Clothing,7
565,C0200,Electronics,1


In [22]:
# Keep, for each customer, only the row whose summed quantity is largest.
# idxmax returns the index label of the first maximum within each group, so a
# tie is resolved in favour of whichever category row appears first.
# NOTE(review): the saved output under this cell is a stale KeyError for a
# column named 'Total'; the code here uses 'total'. Re-run to refresh it.
top_rows_idx = top_category.groupby('CustomerID')['total'].idxmax()
top_category = top_category.loc[top_rows_idx]

KeyError: 'Column not found: Total'

In [9]:
# Inspect the result: one row per customer with that customer's top category.
top_category

Unnamed: 0,CustomerID,Category,total
1,C0001,Electronics,7
4,C0002,Home Decor,6
7,C0003,Home Decor,6
10,C0004,Home Decor,9
11,C0005,Electronics,4
...,...,...,...
556,C0196,Home Decor,5
557,C0197,Electronics,6
559,C0198,Clothing,2
562,C0199,Home Decor,6


In [10]:
# Attach each customer's top category to the feature table (left join so that
# every row already in customer_features is kept).
customer_features = pd.merge(
    customer_features,
    top_category[['CustomerID', 'Category']],
    how='left',
    on='CustomerID',
)

In [11]:
# Inspect the feature table after the Category column has been added.
customer_features

Unnamed: 0,CustomerID,total_transactions,total_spending,avg_transaction_value,Category
0,C0001,5,3354.52,670.904000,Electronics
1,C0002,4,1862.74,465.685000,Home Decor
2,C0003,4,2725.38,681.345000,Home Decor
3,C0004,8,5354.88,669.360000,Home Decor
4,C0005,3,2034.24,678.080000,Electronics
...,...,...,...,...,...
194,C0196,4,4982.88,1245.720000,Home Decor
195,C0197,3,1928.65,642.883333,Electronics
196,C0198,2,931.83,465.915000,Clothing
197,C0199,4,1979.28,494.820000,Home Decor


# 3. Region and SignupDate

In [12]:
# Parse the SignupDate column into pandas datetimes (overwrites the column).
customers['SignupDate'] = customers['SignupDate'].pipe(pd.to_datetime)

In [13]:
# Spot-check the first five parsed sign-up dates and the column dtype.
customers['SignupDate'].head(5)

0   2022-07-10
1   2022-02-13
2   2024-03-07
3   2022-10-09
4   2022-08-15
Name: SignupDate, dtype: datetime64[ns]

In [14]:
# Derive the sign-up time as whole seconds since the Unix epoch in a separate
# frame instead of writing it back into `customers`.  Overwriting the column in
# place made this cell unsafe to re-run: on a second pass the integer seconds
# would be parsed as nanoseconds and every value would collapse to 1.
# Subtracting the epoch and floor-dividing by a one-second Timedelta also does
# not depend on the storage resolution of the datetime column.
signup_info = customers[['CustomerID', 'Region', 'SignupDate']].assign(
    SignupDate=lambda frame: (
        (pd.to_datetime(frame['SignupDate']) - pd.Timestamp('1970-01-01'))
        // pd.Timedelta(seconds=1)
    )
)

# Add Region and the numeric SignupDate to the per-customer feature table.
customer_features = customer_features.merge(signup_info, on='CustomerID', how='left')

# One-hot encode the two categorical columns.  dtype=float keeps the indicator
# columns numeric (0.0 / 1.0) so the feature matrix built later is a plain
# float array rather than a mix of floats and booleans.
# NOTE(review): drop_first=True removes one level per column, so customers in
# the dropped Region/Category level get all-zero indicators and agreement on
# that level adds nothing to cosine similarity. Confirm this is intended.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region', 'Category'],
    drop_first=True,
    dtype=float,
)

In [15]:
# Min-max scaler instance; it is fitted on the numeric feature columns in the next cell.
scalar = MinMaxScaler()

In [16]:
# Columns that are continuous and therefore need rescaling.
num_features = [
    'total_transactions',
    'total_spending',
    'avg_transaction_value',
    'SignupDate',
]
# Fit the scaler on those columns and replace them with the scaled values.
scaled_values = scalar.fit_transform(customer_features[num_features])
customer_features[num_features] = scaled_values

In [18]:
# Spot-check the first rows of the rescaled numeric columns.
print(customer_features[num_features].head())

   total_transactions  total_spending  avg_transaction_value  SignupDate
0                 0.4        0.308942               0.474336    0.157796
1                 0.3        0.168095               0.308940    0.020542
2                 0.3        0.249541               0.482751    0.723623
3                 0.7        0.497806               0.473092    0.242764
4                 0.2        0.184287               0.480120    0.191410


In [45]:
# Feature matrix: every column except the identifier, one row per customer in
# the same order as customer_features.
features_matrix = customer_features.drop(columns=['CustomerID']).to_numpy()

# Pairwise cosine similarity between all customer rows.
similarity_matrix = cosine_similarity(features_matrix)

In [46]:
# Maps each CustomerID to a list of (similar CustomerID, similarity score) pairs.
lookalike_results={}

In [49]:
# For every customer, record the three most similar *other* customers.
for idx, customer_id in enumerate(customer_features['CustomerID']):
    row_scores = similarity_matrix[idx]

    # Positions of all customers ordered from most to least similar.
    ranked = row_scores.argsort()[::-1]

    # Remove the customer's own position explicitly before taking the top 3.
    # Skipping "the first entry" is not reliable: when another customer ties
    # with the self-similarity (identical feature rows, or floating-point
    # rounding), self is not guaranteed to sort first, and the customer would
    # be listed as their own lookalike.
    similar_indices = ranked[ranked != idx][:3]

    similar_customers = customer_features.iloc[similar_indices]['CustomerID'].values
    similarity_scores = row_scores[similar_indices]
    lookalike_results[customer_id] = list(zip(similar_customers, similarity_scores))

In [51]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record per
# (customer, lookalike) pair, then persist it as a CSV without the index column.
lookalike_list = [
    {'cust_id': customer_id, 'similar_cust_id': similar_id, 'score': score}
    for customer_id, lookalikes in lookalike_results.items()
    for similar_id, score in lookalikes
]
lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv('Lookalike.csv', index=False)

In [55]:
# Show the lookalike rows whose customer is among the first 20 rows of `customers`.
first_twenty_ids = customers['CustomerID'].iloc[:20]
in_first_twenty = lookalike_df['cust_id'].isin(first_twenty_ids)
print("Top 3 lookalikes for first 20 customers:")
print(lookalike_df[in_first_twenty])

Top 3 lookalikes for first 20 customers:
   cust_id similar_cust_id     score
0    C0001           C0192  0.991830
1    C0001           C0184  0.987201
2    C0001           C0091  0.984834
3    C0002           C0159  0.974162
4    C0002           C0128  0.901457
5    C0002           C0090  0.882338
6    C0003           C0052  0.999563
7    C0003           C0076  0.992866
8    C0003           C0195  0.989499
9    C0004           C0108  0.998212
10   C0004           C0113  0.997400
11   C0004           C0104  0.987828
12   C0005           C0007  0.992184
13   C0005           C0140  0.962011
14   C0005           C0045  0.918859
15   C0006           C0187  0.988664
16   C0006           C0126  0.978177
17   C0006           C0137  0.973324
18   C0007           C0005  0.992184
19   C0007           C0140  0.959034
20   C0007           C0045  0.934189
21   C0008           C0098  0.982737
22   C0008           C0156  0.979190
23   C0008           C0034  0.976531
24   C0009           C0010  0.9832

In [56]:
# Look up the stored lookalikes for one specific customer.
customer_id = 'C0005'
matches_customer = lookalike_df['cust_id'].eq(customer_id)
print(lookalike_df[matches_customer])

   cust_id similar_cust_id     score
12   C0005           C0007  0.992184
13   C0005           C0140  0.962011
14   C0005           C0045  0.918859
