### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

### Loading the datasets

In [2]:
customers = pd.read_csv('Customers_Preprocessed.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions_Preprocessed.csv')

### One Hot Encoding

In [3]:
ohe = OneHotEncoder(sparse=False)
region_encoded = ohe.fit_transform(customers[['Region']])
region_encoded



array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],


In [4]:
region_columns = [f"Region_{region}" for region in ohe.categories_[0]]
region_columns

['Region_Asia',
 'Region_Europe',
 'Region_North America',
 'Region_South America']

In [5]:
customers_encoded = pd.DataFrame(region_encoded, columns=region_columns)
customers_encoded

Unnamed: 0,Region_Asia,Region_Europe,Region_North America,Region_South America
0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
195,0.0,1.0,0.0,0.0
196,0.0,1.0,0.0,0.0
197,0.0,1.0,0.0,0.0
198,0.0,1.0,0.0,0.0


In [6]:
customers = pd.concat([customers, customers_encoded], axis=1)
customers

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,SignupYear,SignupMonth,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,South America,2022-07-10,2022,2022-07,0.0,0.0,0.0,1.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,2022,2022-02,1.0,0.0,0.0,0.0
2,C0003,Michael Rivera,South America,2024-03-07,2024,2024-03,0.0,0.0,0.0,1.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,2022,2022-10,0.0,0.0,0.0,1.0
4,C0005,Laura Weber,Asia,2022-08-15,2022,2022-08,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,2022,2022-06,0.0,1.0,0.0,0.0
196,C0197,Christina Harvey,Europe,2023-03-21,2023,2023-03,0.0,1.0,0.0,0.0
197,C0198,Rebecca Ray,Europe,2022-02-27,2022,2022-02,0.0,1.0,0.0,0.0
198,C0199,Andrea Jenkins,Europe,2022-12-03,2022,2022-12,0.0,1.0,0.0,0.0


### Aggregating transaction data by CustomerID

We are going to analyse each customer's buying behavior using aggregate values like TotalSpending, TotalQuantity and UniqueProducts

In [7]:
transaction_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'nunique'
}).rename(columns={
    'TotalValue': 'TotalSpending',
    'Quantity': 'TotalQuantity',
    'ProductID': 'UniqueProducts'
}).reset_index()

transaction_features

Unnamed: 0,CustomerID,TotalSpending,TotalQuantity,UniqueProducts
0,C0001,3354.52,12,5
1,C0002,1862.74,10,4
2,C0003,2725.38,14,4
3,C0004,5354.88,23,8
4,C0005,2034.24,7,3
...,...,...,...,...
194,C0196,4982.88,12,3
195,C0197,1928.65,9,3
196,C0198,931.83,3,2
197,C0199,1979.28,9,4


In [8]:
customer_features = customers.merge(transaction_features, on='CustomerID', how='left').fillna(0)
customer_features

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,SignupYear,SignupMonth,Region_Asia,Region_Europe,Region_North America,Region_South America,TotalSpending,TotalQuantity,UniqueProducts
0,C0001,Lawrence Carroll,South America,2022-07-10,2022,2022-07,0.0,0.0,0.0,1.0,3354.52,12.0,5.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,2022,2022-02,1.0,0.0,0.0,0.0,1862.74,10.0,4.0
2,C0003,Michael Rivera,South America,2024-03-07,2024,2024-03,0.0,0.0,0.0,1.0,2725.38,14.0,4.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,2022,2022-10,0.0,0.0,0.0,1.0,5354.88,23.0,8.0
4,C0005,Laura Weber,Asia,2022-08-15,2022,2022-08,1.0,0.0,0.0,0.0,2034.24,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,2022,2022-06,0.0,1.0,0.0,0.0,4982.88,12.0,3.0
196,C0197,Christina Harvey,Europe,2023-03-21,2023,2023-03,0.0,1.0,0.0,0.0,1928.65,9.0,3.0
197,C0198,Rebecca Ray,Europe,2022-02-27,2022,2022-02,0.0,1.0,0.0,0.0,931.83,3.0,2.0
198,C0199,Andrea Jenkins,Europe,2022-12-03,2022,2022-12,0.0,1.0,0.0,0.0,1979.28,9.0,4.0


### Data Normalization

In [9]:
scaler = MinMaxScaler()
num_features = ['SignupYear', 'TotalSpending', 'TotalQuantity', 'UniqueProducts']
customer_features[num_features] = scaler.fit_transform(customer_features[num_features])
customer_features

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,SignupYear,SignupMonth,Region_Asia,Region_Europe,Region_North America,Region_South America,TotalSpending,TotalQuantity,UniqueProducts
0,C0001,Lawrence Carroll,South America,2022-07-10,0.0,2022-07,0.0,0.0,0.0,1.0,0.314274,0.37500,0.5
1,C0002,Elizabeth Lutz,Asia,2022-02-13,0.0,2022-02,1.0,0.0,0.0,0.0,0.174514,0.31250,0.4
2,C0003,Michael Rivera,South America,2024-03-07,1.0,2024-03,0.0,0.0,0.0,1.0,0.255332,0.43750,0.4
3,C0004,Kathleen Rodriguez,South America,2022-10-09,0.0,2022-10,0.0,0.0,0.0,1.0,0.501681,0.71875,0.8
4,C0005,Laura Weber,Asia,2022-08-15,0.0,2022-08,1.0,0.0,0.0,0.0,0.190581,0.21875,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,0.0,2022-06,0.0,1.0,0.0,0.0,0.466830,0.37500,0.3
196,C0197,Christina Harvey,Europe,2023-03-21,0.5,2023-03,0.0,1.0,0.0,0.0,0.180689,0.28125,0.3
197,C0198,Rebecca Ray,Europe,2022-02-27,0.0,2022-02,0.0,1.0,0.0,0.0,0.087300,0.09375,0.2
198,C0199,Andrea Jenkins,Europe,2022-12-03,0.0,2022-12,0.0,1.0,0.0,0.0,0.185432,0.28125,0.4


In [10]:
all_features = customer_features[region_columns + num_features]
all_features

Unnamed: 0,Region_Asia,Region_Europe,Region_North America,Region_South America,SignupYear,TotalSpending,TotalQuantity,UniqueProducts
0,0.0,0.0,0.0,1.0,0.0,0.314274,0.37500,0.5
1,1.0,0.0,0.0,0.0,0.0,0.174514,0.31250,0.4
2,0.0,0.0,0.0,1.0,1.0,0.255332,0.43750,0.4
3,0.0,0.0,0.0,1.0,0.0,0.501681,0.71875,0.8
4,1.0,0.0,0.0,0.0,0.0,0.190581,0.21875,0.3
...,...,...,...,...,...,...,...,...
195,0.0,1.0,0.0,0.0,0.0,0.466830,0.37500,0.3
196,0.0,1.0,0.0,0.0,0.5,0.180689,0.28125,0.3
197,0.0,1.0,0.0,0.0,0.0,0.087300,0.09375,0.2
198,0.0,1.0,0.0,0.0,0.0,0.185432,0.28125,0.4


### Cosine Similarity

Cosine Similarity measures the similarity between two vectors.

In [11]:
similarity_matrix = cosine_similarity(all_features)
similarity_matrix

array([[1.        , 0.26859613, 0.76129382, ..., 0.12962216, 0.26411648,
        0.33903986],
       [0.26859613, 1.        , 0.19343222, ..., 0.1067549 , 0.21881319,
        0.90512711],
       [0.76129382, 0.19343222, 1.        , ..., 0.08969041, 0.18833544,
        0.47582083],
       ...,
       [0.12962216, 0.1067549 , 0.08969041, ..., 1.        , 0.96781928,
        0.12949085],
       [0.26411648, 0.21881319, 0.18833544, ..., 0.96781928, 1.        ,
        0.26869925],
       [0.33903986, 0.90512711, 0.47582083, ..., 0.12949085, 0.26869925,
        1.        ]])

In [12]:
lookalike_results = defaultdict(list)

for i, customer_id in enumerate(customer_features['CustomerID'][:]):
    similarity_scores = similarity_matrix[i]
    top_indices = np.argsort(-similarity_scores)[1:4]
    top_customers = customer_features.iloc[top_indices][['CustomerID']].values.flatten()
    top_scores = similarity_scores[top_indices]
    
    lookalike_results[customer_id] = list(zip(top_customers, top_scores))

lookalike_results

defaultdict(list,
            {'C0001': [('C0174', 0.9994009524324653),
              ('C0011', 0.9993929212174698),
              ('C0152', 0.9987651788245898)],
             'C0002': [('C0027', 0.9972484435911428),
              ('C0159', 0.9950099901581653),
              ('C0005', 0.9933284578274642)],
             'C0003': [('C0190', 0.9980730091178139),
              ('C0031', 0.9960594812464495),
              ('C0191', 0.9960222683025667)],
             'C0004': [('C0113', 0.998437721375097),
              ('C0104', 0.9965264907801978),
              ('C0102', 0.9964462978534467)],
             'C0005': [('C0159', 0.9996045162448027),
              ('C0007', 0.9986136741514055),
              ('C0002', 0.9933284578274642)],
             'C0006': [('C0048', 0.9978218308503074),
              ('C0187', 0.9977391266933187),
              ('C0137', 0.9963436081993936)],
             'C0007': [('C0159', 0.9990960415792354),
              ('C0005', 0.9986136741514055),
              

In [13]:
lookalike_df = pd.DataFrame({
    'CustomerID': lookalike_results.keys(),
    'Lookalikes': [str(v) for v in lookalike_results.values()]
})
lookalike_df.to_csv("Lookalike.csv", index=False)
lookalike_df.info

<bound method DataFrame.info of     CustomerID                                         Lookalikes
0        C0001  [('C0174', 0.9994009524324653), ('C0011', 0.99...
1        C0002  [('C0027', 0.9972484435911428), ('C0159', 0.99...
2        C0003  [('C0190', 0.9980730091178139), ('C0031', 0.99...
3        C0004  [('C0113', 0.998437721375097), ('C0104', 0.996...
4        C0005  [('C0159', 0.9996045162448027), ('C0007', 0.99...
..         ...                                                ...
195      C0196  [('C0070', 0.9860286066501067), ('C0074', 0.98...
196      C0197  [('C0061', 0.9968805614990002), ('C0132', 0.99...
197      C0198  [('C0060', 0.9918475626915497), ('C0062', 0.98...
198      C0199  [('C0166', 0.998113407756776), ('C0010', 0.996...
199      C0200  [('C0022', 0.9973708928483915), ('C0101', 0.99...

[200 rows x 2 columns]>

In [14]:
lookalike_df.head(20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0174', 0.9994009524324653), ('C0011', 0.99..."
1,C0002,"[('C0027', 0.9972484435911428), ('C0159', 0.99..."
2,C0003,"[('C0190', 0.9980730091178139), ('C0031', 0.99..."
3,C0004,"[('C0113', 0.998437721375097), ('C0104', 0.996..."
4,C0005,"[('C0159', 0.9996045162448027), ('C0007', 0.99..."
5,C0006,"[('C0048', 0.9978218308503074), ('C0187', 0.99..."
6,C0007,"[('C0159', 0.9990960415792354), ('C0005', 0.99..."
7,C0008,"[('C0098', 0.9914403319247044), ('C0049', 0.99..."
8,C0009,"[('C0061', 0.9921726585988702), ('C0121', 0.98..."
9,C0010,"[('C0166', 0.9980690095811071), ('C0199', 0.99..."
