In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
import random
import torch.nn.functional as F
import os
import numpy as np
import psycopg2
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from psycopg2.extras import DictCursor
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# save to user_data
connection = psycopg2.connect(host="localhost", user="root", port=5432, database="W9sV6cL2dX", password="E5rG7tY3fH")
cursor = connection.cursor(cursor_factory=DictCursor)

In [3]:

select_query = "SELECT ts.user_id, ts.item_id, ts.time_spent, u.*, ie.* \
FROM time_spent AS ts \
JOIN users AS u ON ts.user_id = u.id \
JOIN item_embeddings AS ie ON ts.item_id = ie.item_id;"
cursor.execute(select_query)
data = cursor.fetchall()


In [4]:
data_col = ['user_id','item_id', 'time_spent','id' ,'gender', 'country', 'age', 'item_id2','embedding','time_stamp']
data_df = pd.DataFrame(data, columns=data_col)
data_df = data_df.drop(columns=['id', 'time_stamp', 'item_id2'])
data_df[:3]

Unnamed: 0,user_id,item_id,time_spent,gender,country,age,embedding
0,22857,af427d2e-d34c-40a7-aabf-ceab900e3389,0,female,FibonacciFlats,50.0,"[-0.20604084432125092, -0.8583592772483826, -1..."
1,43885,cb91fe20-d5df-4a80-b173-bb07bf5c5134,0,,,,"[-0.7141339778900146, -1.4657678604125977, -1...."
2,43273,361ebe6a-294b-4c15-8935-21c6c2709a45,0,male,FibonacciFlats,40.0,"[-0.8985607028007507, 0.19602474570274353, 0.4..."


In [5]:
countries = list(set(data_df.country))
genders = list(set(data_df.gender))
gender_encoder = LabelEncoder()
gender_encoder.fit(genders)
country_encoder = LabelEncoder()
country_encoder.fit(countries)
age_scaler = MinMaxScaler()
time_scaler = MinMaxScaler()

data_df['gender_encode'] = gender_encoder.transform(data_df['gender'])
data_df['country_encode'] = country_encoder.transform(data_df['country'])
data_df['age_normalized'] = age_scaler.fit_transform(data_df[['age']])
data_df['time_normalized'] = age_scaler.fit_transform(data_df[['time_spent']])
data_df[:5]

Unnamed: 0,user_id,item_id,time_spent,gender,country,age,embedding,gender_encode,country_encode,age_normalized,time_normalized
0,22857,af427d2e-d34c-40a7-aabf-ceab900e3389,0,female,FibonacciFlats,50.0,"[-0.20604084432125092, -0.8583592772483826, -1...",0,3,0.544444,0.0
1,43885,cb91fe20-d5df-4a80-b173-bb07bf5c5134,0,,,,"[-0.7141339778900146, -1.4657678604125977, -1....",3,8,,0.0
2,43273,361ebe6a-294b-4c15-8935-21c6c2709a45,0,male,FibonacciFlats,40.0,"[-0.8985607028007507, 0.19602474570274353, 0.4...",1,3,0.433333,0.0
3,14230,31bebb98-9202-4e78-b193-c1d7e650360d,0,,,,"[-1.7180352210998535, 0.4146166145801544, -0.5...",3,8,,0.0
4,19641,3a72d1cf-02b9-4b6b-bf0d-858961c107e9,0,male,GraphTown,47.0,"[-1.584043264389038, -1.3862628936767578, -2.0...",1,4,0.511111,0.0


In [7]:
item_df = data_df.apply(lambda row: np.append(row['item_id'], row['embedding']), axis=1)


In [13]:
len(df_test[0])

1001

In [9]:
user_features = data_df[['gender_encode', 'country_encode', 'age_normalized']]
item_features = data_df.apply(lambda row: np.append(row['item_id'], row['embedding']), axis=1) #data_df[['item_id', 'embedding']]
targets = data_df['time_normalized']

In [12]:
# user_features[:3]
item_features[:3]
# targets[:3]

0    [af427d2e-d34c-40a7-aabf-ceab900e3389, -0.2060...
1    [cb91fe20-d5df-4a80-b173-bb07bf5c5134, -0.7141...
2    [361ebe6a-294b-4c15-8935-21c6c2709a45, -0.8985...
dtype: object

In [17]:
def seed_everything(seed=42):
    random.seed(seed)
    torch.manual_seed(seed) #for CPU
    torch.cuda.manual_seed(seed) #for GPU
    torch.backends.cudnn.deterministic = True

# Two-tower model architecture
class TwoTowerNetwork(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, output_dim):
        super(TwoTowerNetwork, self).__init__()

        hidden_dim = 128  # example value, adjust based on your requirement

        # User tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

        # Item tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

        self.relu = nn.ReLU()

    def forward(self, user_input, item_input):
        # User tower
        user_output = self.user_tower(user_input)

        # Item tower
        item_output = self.item_tower(item_input)

        # Normalize the embeddings (this is necessary for cosine similarity)
        user_output = F.normalize(user_output, dim=1)
        item_output = F.normalize(item_output, dim=1)

        return user_output, item_output

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, user_features, item_features, targets):
        # df_test = data_df.apply(lambda row: np.append(row['item_id'], row['embedding']), axis=1)

        self.user_features = user_features
        self.item_features = item_features
        self.targets = targets
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.user_features[idx], self.item_features[idx], self.targets[idx]

In [18]:
# Hyperparameters
user_feature_dim = 3 # age, F/M, country
item_feature_dim = 1001 # item_id, item_type, item_features
embedding_dim = 100
batch_size = 32
learning_rate = 0.001
num_epochs = 50
seed_everything(seed=666)

In [19]:
# Create the two-tower model
model = TwoTowerNetwork(user_feature_dim, item_feature_dim, embedding_dim)

# Create the custom dataset and data loader
dataset = CustomDataset(item_features, item_features,targets)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

KeyError: 'new_feature'

In [6]:
# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0
    for user_feats, item_feats, targets in data_loader:
        model.train()
        optimizer.zero_grad()
        
        user_embeds, item_embeds = model(user_feats, item_feats)
        cos_sim = nn.functional.cosine_similarity(user_embeds, item_embeds)
        loss = criterion(cos_sim.unsqueeze(1), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader)}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/50, Loss: 0.34596234234049916
Epoch 2/50, Loss: 0.2722171812783927
Epoch 3/50, Loss: 0.24700147286057472
Epoch 4/50, Loss: 0.22438882989808917
Epoch 5/50, Loss: 0.21077052131295204
Epoch 6/50, Loss: 0.19836670230142772
Epoch 7/50, Loss: 0.18632018263451755
Epoch 8/50, Loss: 0.17391128418967128
Epoch 9/50, Loss: 0.17174197966232896
Epoch 10/50, Loss: 0.15881346142850816
Epoch 11/50, Loss: 0.1563244671560824
Epoch 12/50, Loss: 0.15171588864177465
Epoch 13/50, Loss: 0.1391624074894935
Epoch 14/50, Loss: 0.137031233869493
Epoch 15/50, Loss: 0.1351512351538986
Epoch 16/50, Loss: 0.12719742092303932
Epoch 17/50, Loss: 0.12180925044231117
Epoch 18/50, Loss: 0.12113561015576124
Epoch 19/50, Loss: 0.1166975584346801
Epoch 20/50, Loss: 0.11368030915036798
Epoch 21/50, Loss: 0.11073828092776239
Epoch 22/50, Loss: 0.10603262367658317
Epoch 23/50, Loss: 0.1043703742325306
Epoch 24/50, Loss: 0.10124758118763566
Epoch 25/50, Loss: 0.09947195276618004
Epoch 26/50, Loss: 0.1015332406386733
Epoc