In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load datasets
customers = pd.read_csv("/content/drive/My Drive/SuperTeam_Write/customers.csv")
purchases = pd.read_csv("/content/drive/My Drive/SuperTeam_Write/purchases.csv")
sessions = pd.read_csv("/content/drive/My Drive/SuperTeam_Write/sessions.csv")
reviews = pd.read_csv("/content/drive/My Drive/SuperTeam_Write/reviews.csv")

In [4]:
customer_org = customers.copy()
sessions_org = sessions.copy()
purchases_org = purchases.copy()
reviews_org = reviews.copy()

In [5]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Initialize encoders and scalers
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

# Encode categorical features
customers['gender'] = label_encoder.fit_transform(customers['gender'])
sessions['page_viewed'] = label_encoder.fit_transform(sessions['page_viewed'])

# Normalize numeric features
customers[['age']] = scaler.fit_transform(customers[['age']])
sessions[['view_duration']] = scaler.fit_transform(sessions[['view_duration']])
purchases[['purchase_amount']] = scaler.fit_transform(purchases[['purchase_amount']])

In [6]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Load the reviews dataset
reviews_df = pd.read_csv('/content/drive/My Drive/SuperTeam_Write/reviews.csv')

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to convert text to BERT embeddings and return as a formatted string
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the embedding and convert it to a formatted string
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()
    return str(embedding)

# Apply the embedding function to the review_text column
reviews_df['review_embedding'] = reviews_df['review_text'].apply(get_bert_embedding)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
from sklearn.decomposition import PCA
import numpy as np


# Convert the review embeddings from string to numpy array
reviews_df['review_embedding'] = reviews_df['review_embedding'].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=',')
)

# Aggregate the review embeddings for each customer by taking the mean of their embeddings
customer_embeddings = reviews_df.groupby('customer_id')['review_embedding'].apply(np.mean).reset_index()

# Grouping by customer_id and aggregating numeric columns only
# Sessions
numeric_sessions_df = sessions.select_dtypes(include=[np.number])
sessions_agg_df = sessions[['customer_id']].join(numeric_sessions_df).groupby('customer_id').mean().reset_index()

# Purchases
numeric_purchases_df = purchases.select_dtypes(include=[np.number])
purchases_agg_df = purchases[['customer_id']].join(numeric_purchases_df).groupby('customer_id').mean().reset_index()

# Merge customer embeddings with other customer features
merged_df = customers.merge(customer_embeddings, on='customer_id', how='left')
merged_df = merged_df.merge(sessions_agg_df, on='customer_id', how='left')
merged_df = merged_df.merge(purchases_agg_df, on='customer_id', how='left')

# Fill any missing values that resulted from the merge
merged_df = merged_df.fillna(0)

# Combine all features into a single vector for each customer
feature_columns = ['age', 'gender', 'view_duration', 'purchase_amount', 'review_embedding']
merged_df['combined_vector'] = merged_df.apply(
    lambda row: np.concatenate([row['review_embedding'], [row['age'], row['gender'], row['view_duration'], row['purchase_amount']]]),
    axis=1
)

# Convert the combined_vector and review_embedding back to string format
merged_df['combined_vector'] = merged_df['combined_vector'].apply(lambda x: str(list(x)))
merged_df['review_embedding'] = merged_df['review_embedding'].apply(lambda x: str(list(x)))

# Display the first few rows after combining vectors
print(merged_df.head())


  customer_id       age  gender            location signup_date  \
0     CUST001  0.723404       0         Robertsbury  2023-11-21   
1     CUST002  0.468085       1     Lake Heidimouth  2023-08-10   
2     CUST003  0.212766       1  South Stephenville  2024-05-25   
3     CUST004  0.957447       1         Brandyshire  2023-11-05   
4     CUST005  0.106383       0          Wrightberg  2023-03-10   

                                    review_embedding  page_viewed  \
0  [0.3026423727472623, -0.20583192942043146, 0.3...     5.800000   
1  [-0.027535068492094677, -0.2214363068342209, 0...     7.454545   
2  [0.09626923501491547, -0.21890243291854858, 0....    17.500000   
3  [0.011832010932266712, -0.29787568747997284, 0...     2.125000   
4  [0.4246213883161545, -0.2865287885069847, 0.23...    14.428571   

   view_duration  purchase_amount  \
0       0.443051         0.356144   
1       0.536518         0.438716   
2       0.611582         0.663063   
3       0.520551         0.425212 

In [8]:
# Apply PCA to reduce the dimensionality of the combined vectors
pca = PCA(n_components=5)  # Set n_components to a feasible value, in this case 5
reduced_vectors = pca.fit_transform(np.stack(merged_df['combined_vector'].apply(eval)))

# Assign the reduced vectors to a new column and convert to JSON-like string format
merged_df['reduced_vector'] = reduced_vectors.tolist()
merged_df['reduced_vector'] = merged_df['reduced_vector'].apply(lambda x: str(x))

# Display the first few rows after adding the reduced vectors
print(merged_df.head())

  customer_id       age  gender            location signup_date  \
0     CUST001  0.723404       0         Robertsbury  2023-11-21   
1     CUST002  0.468085       1     Lake Heidimouth  2023-08-10   
2     CUST003  0.212766       1  South Stephenville  2024-05-25   
3     CUST004  0.957447       1         Brandyshire  2023-11-05   
4     CUST005  0.106383       0          Wrightberg  2023-03-10   

                                    review_embedding  page_viewed  \
0  [0.3026423727472623, -0.20583192942043146, 0.3...     5.800000   
1  [-0.027535068492094677, -0.2214363068342209, 0...     7.454545   
2  [0.09626923501491547, -0.21890243291854858, 0....    17.500000   
3  [0.011832010932266712, -0.29787568747997284, 0...     2.125000   
4  [0.4246213883161545, -0.2865287885069847, 0.23...    14.428571   

   view_duration  purchase_amount  \
0       0.443051         0.356144   
1       0.536518         0.438716   
2       0.611582         0.663063   
3       0.520551         0.425212 

In [9]:
# saving the dataframe to a csv file as the normalized data for clustering
merged_df.to_csv("/content/drive/My Drive/SuperTeam_Write/customers_vect.csv", index=False)

In [10]:
# merging the original customer information with the new features for the customer similarity search
df = customer_org.merge(merged_df[['customer_id', 'page_viewed', 'view_duration', 'purchase_amount', 'review_embedding', 'combined_vector', 'reduced_vector', 'review_embedding']], on='customer_id', how='left')

In [11]:
# save to csv file
df.to_csv("/content/drive/My Drive/SuperTeam_Write/customers.csv", index=False)