In [1]:
import pandas as pd


customers_file = 'Customers.csv'
products_file = 'Products.csv'
transactions_file = 'Transactions.csv'

customers_df = pd.read_csv(customers_file)
products_df = pd.read_csv(products_file)
transactions_df = pd.read_csv(transactions_file)

customers_df_head = customers_df.head()
products_df_head = products_df.head()
transactions_df_head = transactions_df.head()

customers_df_head, products_df_head, transactions_df_head


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [2]:

transactions_products = transactions_df.merge(products_df, on="ProductID", how="left")
customer_transactions = transactions_products.merge(customers_df, on="CustomerID", how="left")


customer_transactions_head = customer_transactions.head()


customer_features = customer_transactions.groupby("CustomerID").agg({
    "Category": lambda x: list(x),  
    "TotalValue": "sum",            
    "Region": "first"               
}).reset_index()


customer_features_head = customer_features.head()

customer_transactions_head, customer_features_head


(  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55         1   
 3        T00272      C0087      P067  2024-03-26 22:55:37         2   
 4        T00363      C0070      P067  2024-03-21 15:10:10         3   
 
    TotalValue  Price_x                      ProductName     Category  Price_y  \
 0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
 
       CustomerName         Region  SignupDate  
 0   Andrea Jenkins    

In [3]:

from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

mlb = MultiLabelBinarizer()
category_encoded = mlb.fit_transform(customer_features['Category'])
category_df = pd.DataFrame(category_encoded, columns=mlb.classes_)

le = LabelEncoder()
customer_features['RegionEncoded'] = le.fit_transform(customer_features['Region'])


customer_matrix = pd.concat(
    [category_df, customer_features[['TotalValue', 'RegionEncoded']].reset_index(drop=True)],
    axis=1
)


similarity_scores = cosine_similarity(customer_matrix)


def get_top_lookalikes(similarity_matrix, customer_index, top_n=3):
    scores = list(enumerate(similarity_matrix[customer_index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_lookalikes = [(idx, score) for idx, score in scores if idx != customer_index][:top_n]
    return top_lookalikes


lookalike_map = {}
for i in range(20):
    customer_id = customer_features.iloc[i]['CustomerID']
    lookalikes = get_top_lookalikes(similarity_scores, i)
    lookalike_map[customer_id] = [
        (customer_features.iloc[idx]['CustomerID'], score) for idx, score in lookalikes
    ]


lookalike_df = pd.DataFrame.from_dict(
    {key: str(value) for key, value in lookalike_map.items()},
    orient='index',
    columns=['Lookalikes']
)


lookalike_csv_path = 'Samvid_Verma_Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index_label='CustomerID')

lookalike_map, lookalike_csv_path


({'C0001': [('C0152', np.float64(0.9999999999543174)),
   ('C0174', np.float64(0.99999998635848)),
   ('C0035', np.float64(0.9999999809217809))],
  'C0002': [('C0159', np.float64(0.9999999970500388)),
   ('C0134', np.float64(0.999999973232126)),
   ('C0178', np.float64(0.9999999037426132))],
  'C0003': [('C0091', np.float64(0.9999999860533897)),
   ('C0129', np.float64(0.9999999780383263)),
   ('C0158', np.float64(0.9999999425495927))],
  'C0004': [('C0148', np.float64(0.9999999999256067)),
   ('C0012', np.float64(0.9999999998831532)),
   ('C0018', np.float64(0.9999999891684433))],
  'C0005': [('C0140', np.float64(0.9999999985531377)),
   ('C0007', np.float64(0.9999999891922983)),
   ('C0106', np.float64(0.999999919214237))],
  'C0006': [('C0187', np.float64(0.9999999976625852)),
   ('C0108', np.float64(0.9999999944933381)),
   ('C0185', np.float64(0.9999999874980117))],
  'C0007': [('C0005', np.float64(0.9999999891922983)),
   ('C0140', np.float64(0.9999999798366417)),
   ('C0106', np