In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
import random
import torch.nn.functional as F
import os
import numpy as np
import psycopg2
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from psycopg2.extras import DictCursor
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# save to user_data
connection = psycopg2.connect(host="localhost", user="root", port=5432, database="W9sV6cL2dX", password="E5rG7tY3fH")
cursor = connection.cursor(cursor_factory=DictCursor)

In [3]:

select_query = "SELECT ts.user_id, ts.item_id, ts.time_spent, u.*, ie.* \
FROM time_spent AS ts \
JOIN users AS u ON ts.user_id = u.id \
JOIN item_embeddings AS ie ON ts.item_id = ie.item_id;"
cursor.execute(select_query)
data = cursor.fetchall()


In [4]:
data_col = ['user_id','item_id', 'time_spent','id' ,'gender', 'country', 'age', 'item_id2','embedding','time_stamp']
data_df = pd.DataFrame(data, columns=data_col)
data_df = data_df.drop(columns=['id', 'time_stamp', 'item_id2'])
data_df[:3]

Unnamed: 0,user_id,item_id,time_spent,gender,country,age,embedding
0,6611,3b2c0422-4e17-4fd1-9835-93492092b6e6,0,male,BayesianBourg,32.0,"[-1.2004772424697876, 0.26573532819747925, 0.2..."
1,22857,af427d2e-d34c-40a7-aabf-ceab900e3389,0,female,FibonacciFlats,50.0,"[-0.20604084432125092, -0.8583592772483826, -1..."
2,43885,cb91fe20-d5df-4a80-b173-bb07bf5c5134,0,,,,"[-0.7141339778900146, -1.4657678604125977, -1...."


In [5]:
countries = list(set(data_df.country))
genders = list(set(data_df.gender))
gender_encoder = LabelEncoder()
gender_encoder.fit(genders)
country_encoder = LabelEncoder()
country_encoder.fit(countries)
age_scaler = MinMaxScaler()
time_scaler = MinMaxScaler()

data_df['gender_encode'] = gender_encoder.transform(data_df['gender'])
data_df['country_encode'] = country_encoder.transform(data_df['country'])
data_df['age_normalized'] = age_scaler.fit_transform(data_df[['age']])
data_df['time_normalized'] = age_scaler.fit_transform(data_df[['time_spent']])
data_df = data_df.dropna()
print(data_df.shape)
data_df[:3]

(8017, 11)


Unnamed: 0,user_id,item_id,time_spent,gender,country,age,embedding,gender_encode,country_encode,age_normalized,time_normalized
0,6611,3b2c0422-4e17-4fd1-9835-93492092b6e6,0,male,BayesianBourg,32.0,"[-1.2004772424697876, 0.26573532819747925, 0.2...",1,2,0.336957,0.0
1,22857,af427d2e-d34c-40a7-aabf-ceab900e3389,0,female,FibonacciFlats,50.0,"[-0.20604084432125092, -0.8583592772483826, -1...",0,3,0.532609,0.0
3,43273,361ebe6a-294b-4c15-8935-21c6c2709a45,0,male,FibonacciFlats,40.0,"[-0.8985607028007507, 0.19602474570274353, 0.4...",1,3,0.423913,0.0


In [6]:
data_df[:3]

Unnamed: 0,user_id,item_id,time_spent,gender,country,age,embedding,gender_encode,country_encode,age_normalized,time_normalized
0,6611,3b2c0422-4e17-4fd1-9835-93492092b6e6,0,male,BayesianBourg,32.0,"[-1.2004772424697876, 0.26573532819747925, 0.2...",1,2,0.336957,0.0
1,22857,af427d2e-d34c-40a7-aabf-ceab900e3389,0,female,FibonacciFlats,50.0,"[-0.20604084432125092, -0.8583592772483826, -1...",0,3,0.532609,0.0
3,43273,361ebe6a-294b-4c15-8935-21c6c2709a45,0,male,FibonacciFlats,40.0,"[-0.8985607028007507, 0.19602474570274353, 0.4...",1,3,0.423913,0.0


In [7]:
user_features = data_df[['gender_encode', 'country_encode', 'age_normalized']]
item_features = np.array(data_df['embedding'].tolist())  #np.array(data_df.apply(lambda row: row['embedding'], axis=1)) 
item_features = data_df.apply(lambda row: row['embedding'], axis=1)
targets = data_df['time_normalized']

In [8]:
user_features = torch.tensor(user_features.values, dtype=torch.float32)
item_features = [torch.tensor(item, dtype=torch.float32) for item in item_features]
targets = torch.tensor(targets.values, dtype=torch.float32)

In [9]:
type(user_features), type(item_features[1]), type(targets)

(torch.Tensor, torch.Tensor, torch.Tensor)

In [10]:
def seed_everything(seed=42):
    random.seed(seed)
    torch.manual_seed(seed) #for CPU
    torch.cuda.manual_seed(seed) #for GPU
    torch.backends.cudnn.deterministic = True


# Two-tower model architecture
class TwoTowerNetwork(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, output_dim):
        super(TwoTowerNetwork, self).__init__()

        hidden_dim = 128  # example value, adjust based on your requirement

        # User tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

        # Item tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )


    def forward(self, user_input, item_input):
        # User tower
        user_output = self.user_tower(user_input)

        # Item tower
        item_output = self.item_tower(item_input)

        # Normalize the embeddings (this is necessary for cosine similarity)
        user_output = F.normalize(user_output, dim=1)
        item_output = F.normalize(item_output, dim=1)
        
        return user_output, item_output
    

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, user_features, item_features, targets):
        self.user_features = user_features
        self.item_features = item_features
        self.targets = targets
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.user_features[idx], self.item_features[idx], self.targets[idx]

In [11]:
# Hyperparameters
user_feature_dim = 3 # age, F/M, country
item_feature_dim = 1000 # item_id, item_type, item_features
embedding_dim = 100
batch_size = 8
learning_rate = 0.001
num_epochs = 10
seed_everything(seed=666)

In [12]:
# Create the two-tower model
model = TwoTowerNetwork(user_feature_dim, item_feature_dim, embedding_dim)

# Create the custom dataset and data loader
dataset = CustomDataset(user_features, item_features, targets)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [13]:
# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0
    for user_feats, item_feats, targets in data_loader:
        model.train()
        optimizer.zero_grad()
        
        user_embeds, item_embeds = model(user_feats, item_feats)
    
        cos_sim = nn.functional.cosine_similarity(user_embeds, item_embeds)
        loss = criterion(cos_sim.unsqueeze(1), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/10, Loss: 0.038167137037407296
Epoch 2/10, Loss: 0.03687771655783263
Epoch 3/10, Loss: 0.03659563809467279
Epoch 4/10, Loss: 0.03668415546719011
Epoch 5/10, Loss: 0.03657426186145496
Epoch 6/10, Loss: 0.03656280594508805
Epoch 7/10, Loss: 0.03657397906310347
Epoch 8/10, Loss: 0.03654496290464259
Epoch 9/10, Loss: 0.036554384811577995
Epoch 10/10, Loss: 0.03651056538212019
