In [14]:
pip install -U scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime

In [18]:
# --- Load the three raw tables from the working directory ---
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# --- Customer tenure ---
# Parse the signup timestamps, then express tenure as whole days since signup
# divided by 30 (an approximate month count, not calendar months).
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
current_date = pd.to_datetime('today')
days_since_signup = (current_date - customers_df['SignupDate']).dt.days
customers_df['CustomerAgeInMonths'] = days_since_signup / 30

# --- One-hot encode Region ---
# The encoder returns a sparse matrix; densify it and label the columns with
# the encoder's generated feature names.
region_encoder = OneHotEncoder()
region_encoded = region_encoder.fit_transform(customers_df[['Region']])
region_encoded_df = pd.DataFrame(
    region_encoded.toarray(),
    columns=region_encoder.get_feature_names_out(),
)

# Column-wise concat aligns on the index: both frames carry the default
# 0..n-1 index here (fresh read_csv / freshly built DataFrame).
customers_df = pd.concat([customers_df, region_encoded_df], axis=1)

In [20]:
# Per-customer spend summary: total transaction value and number of transactions.
customer_transactions = transactions_df.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count')
).reset_index()

# Default (inner) merge: only customers that appear in the transaction summary
# are kept in customer_data.
# NOTE(review): customers without any transaction are dropped here — confirm
# that this is intended for the lookalike model.
customer_data = pd.merge(customers_df, customer_transactions, on='CustomerID')

# Share of each customer's purchases per product category (each row sums to 1).
# Transactions whose ProductID is absent from products_df are dropped by this
# inner merge.
category_purchases = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
category_distribution = category_purchases.groupby('CustomerID')['Category'].value_counts(normalize=True).unstack(fill_value=0)
customer_data = pd.merge(customer_data, category_distribution, left_on='CustomerID', right_index=True, how='left')

# The left merge leaves NaN category shares for any customer that has no row in
# category_distribution (e.g. all of their products were missing from
# products_df). Treat that as "no purchases in any known category" so the
# downstream scaling / cosine-similarity step does not receive NaN input.
category_columns = list(category_distribution.columns)
customer_data[category_columns] = customer_data[category_columns].fillna(0)

In [22]:
# Model inputs: spend summary, tenure, one-hot region flags, and the
# per-category purchase shares.
numerical_features = [
    'total_spent',
    'transaction_count',
    'CustomerAgeInMonths',
    *region_encoded_df.columns,
    *category_distribution.columns,
]

# Standardise every feature column, then compare customers pairwise.
# similarity_matrix[a][b] is the cosine similarity between rows a and b of
# customer_data (same row order as the frame).
feature_frame = customer_data[numerical_features]
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(feature_frame)
similarity_matrix = cosine_similarity(customer_data_scaled)

In [24]:
# Build {CustomerID: [(lookalike CustomerID, similarity score), ...]} for the
# first N_TARGET_CUSTOMERS rows of customer_data, keeping the TOP_N_LOOKALIKES
# most similar other customers for each.
# NOTE(review): "first rows" follows customer_data's row order after the merges;
# if specific customer IDs are required, select them by ID instead — confirm.
N_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

# Pull the ID column once instead of materialising a whole row per lookup.
customer_ids = customer_data['CustomerID'].tolist()

lookalike_dict = {}
# Bound the loop by the number of available rows so a small dataset does not
# raise IndexError.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    # Drop the customer itself by index rather than assuming it sorts first:
    # another row can tie with (or, through floating-point rounding, edge out)
    # the self-similarity, in which case slicing off position 0 would discard a
    # real neighbour and report the customer as its own lookalike.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # Stable sort, highest similarity first; ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_dict[customer_ids[i]] = [
        (customer_ids[j], score) for j, score in candidates[:TOP_N_LOOKALIKES]
    ]

In [30]:
# Single source of truth for the output location, used for both the write and
# the confirmation message so the two cannot disagree.
# NOTE(review): the file has always been written to 'Lookalike.csv'; rename here
# if a different deliverable name is required.
OUTPUT_PATH = 'Lookalike.csv'

# Flatten {customer: [(lookalike, score), ...]} into one row per
# (customer, lookalike) pair.
lookalike_list = [
    [cust_id, lookalike_id, score]
    for cust_id, lookalikes in lookalike_dict.items()
    for lookalike_id, score in lookalikes
]

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

lookalike_df.to_csv(OUTPUT_PATH, index=False)

# Report the path that was actually written.
print(f"Lookalike recommendations saved to '{OUTPUT_PATH}'.")

Lookalike recommendations saved to 'Sahil_Shukla_Lookalike.csv'.
