In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables from the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Convert the date columns to datetimes so they support date arithmetic.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Tenure in whole days, measured against the most recent transaction date
# found in the data (not against today's date).
max_date = transactions['TransactionDate'].max()
customers['DaysSinceSignup'] = (max_date - customers['SignupDate']).dt.days

# One summary row per customer: total spend, mean quantity per transaction,
# and the number of transactions.
per_customer = transactions.groupby('CustomerID')
agg_transactions = per_customer.agg(
    TotalSpent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    AvgQuantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
    TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='count'),
).reset_index()

# Attach the summaries to the customer profile. The merge uses pandas' default
# inner join, so customers with no transactions are not present in `merged`.
merged = customers.merge(agg_transactions, on='CustomerID')

# Label every transaction with its product category, then count transactions
# per (customer, category) pair.
merged_cat = pd.merge(transactions, products[['ProductID', 'Category']], on='ProductID')
category_counts = merged_cat.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')

# Wide per-customer table: one `Category_<name>` column per category holding the
# number of transactions in that category (0 where the customer bought none).
#
# The previous get_dummies(...).groupby('CustomerID').sum() summed 0/1 indicator
# columns over rows that are already unique per (customer, category). That
# reduced every category to a bought / not-bought flag and left a `Count` column
# equal to the customer's total transaction count, duplicating
# `TransactionCount`. Pivoting `Count` keeps the real per-category counts and
# drops the redundant total.
#
# The result stays indexed by CustomerID, as before, so it can still be merged
# with on='CustomerID'.
category_dummies = (
    category_counts
    .pivot_table(index='CustomerID', columns='Category', values='Count',
                 aggfunc='sum', fill_value=0)
    .add_prefix('Category_')
    .rename_axis(columns=None)
)

# Profile and spending columns carried into the feature matrix.
base_columns = ['CustomerID', 'Region', 'DaysSinceSignup', 'TotalSpent', 'TransactionCount', 'AvgQuantity']

# Left-join the per-customer category features; anything missing after the
# join is treated as zero.
features = merged[base_columns].merge(category_dummies, on='CustomerID', how='left')
features = features.fillna(0)

# Force plain string column labels, then one-hot encode the region.
features.columns = features.columns.astype(str)
features = pd.get_dummies(features, columns=['Region'])

# Standardise every feature column (CustomerID is an identifier, not a feature)
# and compute the pairwise cosine similarity between customers. Row/column i of
# `similarity` corresponds to row i of `features`.
scaler = StandardScaler()
feature_matrix = features.drop(columns='CustomerID')
scaled = scaler.fit_transform(feature_matrix)

similarity = cosine_similarity(scaled)

# Number of lookalikes reported per target, and how many leading customers
# (in `features` row order) get a recommendation list.
TOP_N = 3
NUM_TARGETS = 20

# Row position of every customer in `features`. `similarity` is computed from
# those same rows in the same order, so it must be indexed by position rather
# than by DataFrame index label.
customer_ids = features['CustomerID'].tolist()
position_of = {}
for pos, cid in enumerate(customer_ids):
    position_of.setdefault(cid, pos)  # keep the first row if an ID repeats

lookalike_data = []
target_ids = customer_ids[:NUM_TARGETS]

for target_id in target_ids:
    target_idx = position_of[target_id]

    # Exclude the target by position. Slicing off the first sorted entry is not
    # safe: if another customer ties with the target's self-similarity (an
    # identical feature vector, or float round-off), the target may not sort
    # first and would then appear in its own lookalike list.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity[target_idx])
        if idx != target_idx
    ]
    sorted_scores = sorted(candidates, key=lambda x: x[1], reverse=True)[:TOP_N]

    lookalike_ids = []
    for idx, score in sorted_scores:
        lookalike_ids.extend([customer_ids[idx], f"{score:.4f}"])
    # Pad short rows (fewer than TOP_N other customers) so every row matches `cols`.
    lookalike_ids.extend([None] * (2 * TOP_N - len(lookalike_ids)))

    lookalike_data.append([target_id] + lookalike_ids)

# CustomerID, Lookalike1, Score1, ..., Lookalike<TOP_N>, Score<TOP_N>
cols = ['CustomerID']
for rank in range(1, TOP_N + 1):
    cols.extend([f'Lookalike{rank}', f'Score{rank}'])

lookalike_df = pd.DataFrame(lookalike_data, columns=cols)
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

In [4]:
%pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Downloading sklearn-0.0.post11.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post10.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post9.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post7.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post5.tar.gz (3.7 kB)
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
Collecting scipy>=1.6.0
  Downloading scipy-1.15.1-cp310-cp310-win_amd64.whl (43.9 MB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn, sklearn
    Running setup.py install for sk

    ERROR: Command errored out with exit status 1:
     command: 'c:\Users\RAUNAQUE\AppData\Local\Programs\Python\Python310\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\RAUNAQUE\\AppData\\Local\\Temp\\pip-install-av_de5w6\\sklearn_5c931e2ba5d24b8e93c5e7ab88ab2fd5\\setup.py'"'"'; __file__='"'"'C:\\Users\\RAUNAQUE\\AppData\\Local\\Temp\\pip-install-av_de5w6\\sklearn_5c931e2ba5d24b8e93c5e7ab88ab2fd5\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\RAUNAQUE\AppData\Local\Temp\pip-pip-egg-info-fvvmqb2k'
         cwd: C:\Users\RAUNAQUE\AppData\Local\Temp\pip-install-av_de5w6\sklearn_5c931e2ba5d24b8e93c5e7ab88ab2fd5\
    Complete output (15 lines):
    The 'sklearn' PyPI package is deprecated, use 's