In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
import random
import torch.nn.functional as F
import os
import numpy as np
import psycopg2
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from psycopg2.extras import DictCursor
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [3]:
# save to user_data
connection = psycopg2.connect(host="localhost", user="root", port=5432, database="W9sV6cL2dX", password="E5rG7tY3fH")
cursor = connection.cursor(cursor_factory=DictCursor)

In [4]:
user_features_df = pd.read_csv('user_features_df.csv', usecols=['user_id', 'gender_encode', 'country_encode', 'age_normalized'])
user_features_df[:3]

Unnamed: 0,user_id,gender_encode,country_encode,age_normalized
0,46763,0,2,0.178947
1,38208,0,2,0.347368
2,42734,0,5,0.273684


In [5]:
user_in = user_features_df[user_features_df['user_id']==1]
user_in = user_in[0:1]
user_in

Unnamed: 0,user_id,gender_encode,country_encode,age_normalized
17791,1,1,1,0.410526


In [6]:
item_query = """
SELECT *
FROM item_embeddings
WHERE item_id NOT IN (
    SELECT item_id
    FROM time_spent
)
LIMIT 10
"""
cursor.execute(item_query)
item = cursor.fetchall()

In [7]:
unseem_item_df = pd.DataFrame(item, columns=['item_id', 'embedding', 'time_stamp'])
unseem_item_df[:3]

Unnamed: 0,item_id,embedding,time_stamp
0,5739db20-e871-423b-8f4d-2cc7442928b0,"[-1.1496120691299438, -0.09155219793319702, -1...",2023-07-29 13:02:27
1,2c95a22d-1b2c-4baa-a540-8b6c61ea7e34,"[-0.7228033542633057, -0.8741176724433899, -2....",2023-07-29 13:02:27
2,9b4030eb-4194-4ce7-95aa-198674ae52bc,"[-0.46015146374702454, 0.3535095453262329, -0....",2023-07-29 13:02:27


In [8]:
merged_df = pd.concat([user_in] * len(unseem_item_df), ignore_index=True)
merged_df[['item_id', 'embedding', 'time_stamp']] = unseem_item_df
merged_df = merged_df.drop(columns=['time_stamp'])

In [9]:
merged_df

Unnamed: 0,user_id,gender_encode,country_encode,age_normalized,item_id,embedding
0,1,1,1,0.410526,5739db20-e871-423b-8f4d-2cc7442928b0,"[-1.1496120691299438, -0.09155219793319702, -1..."
1,1,1,1,0.410526,2c95a22d-1b2c-4baa-a540-8b6c61ea7e34,"[-0.7228033542633057, -0.8741176724433899, -2...."
2,1,1,1,0.410526,9b4030eb-4194-4ce7-95aa-198674ae52bc,"[-0.46015146374702454, 0.3535095453262329, -0...."
3,1,1,1,0.410526,da380296-8ab7-46c6-9fbc-38ef9180563f,"[-1.0095241069793701, -1.1376439332962036, -1...."
4,1,1,1,0.410526,4e2f0be4-744a-4d46-af49-3fe6813744ed,"[-0.9864667654037476, -0.5571873784065247, -0...."
5,1,1,1,0.410526,d0cd7478-13a7-4cb1-bc43-ce043061ea5b,"[0.3429713845252991, 0.41703441739082336, -0.4..."
6,1,1,1,0.410526,e592cfff-b447-4304-b4a7-3a3df3ed66f5,"[-0.2781822085380554, -0.6354814171791077, 0.3..."
7,1,1,1,0.410526,895e5e92-ba69-4a21-b5b1-2520f33a52fc,"[0.04357937350869179, 0.25889378786087036, -0...."
8,1,1,1,0.410526,0a56d914-4899-4bbf-9cd3-f0e34e3be6dc,"[-2.3657567501068115, 0.006132760085165501, -1..."
9,1,1,1,0.410526,eced7433-d92b-4bcf-94b6-366a790b0079,"[-1.3200820684432983, -0.64692622423172, -0.65..."


In [10]:
user_features = merged_df[['gender_encode', 'country_encode', 'age_normalized']]
item_features = merged_df.apply(lambda row: row['embedding'], axis=1)

In [11]:
user_features = torch.tensor(user_features.values, dtype=torch.float32)
item_features = [torch.tensor(item, dtype=torch.float32) for item in item_features]

In [41]:
# Two-tower model architecture
class TwoTowerNetwork(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, output_dim):
        super(TwoTowerNetwork, self).__init__()

        hidden_dim = 128  # example value, adjust based on your requirement

        # User tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

        # Item tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )


    def forward(self, user_input, item_input):
        # User tower
        user_output = self.user_tower(user_input)

        # Item tower
        item_output = self.item_tower(item_input)

        # Normalize the embeddings (this is necessary for cosine similarity)
        user_output = F.normalize(user_output, dim=1)
        item_output = F.normalize(item_output, dim=1)
        
        return user_output, item_output
    

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, merged_df):
        user_features = merged_df[['gender_encode', 'country_encode', 'age_normalized']]
        item_features = merged_df.apply(lambda row: row['embedding'], axis=1)
        self.user_features = torch.tensor(user_features.values, dtype=torch.float32)
        self.item_features = [torch.tensor(item, dtype=torch.float32) for item in item_features]
        self.merged_df = merged_df
        print(self.merged_df)
        
    def __len__(self):
        return len(self.user_features)
    
    def __getitem__(self, idx):
        item_id = merged_df['item_id'][idx]
        return self.user_features[idx], self.item_features[idx], item_id

In [42]:
save_path = '/root/tinier-world/notebooks/testbooks/models/'
# Hyperparameters
user_feature_dim = 3 # age, F/M, country
item_feature_dim = 1000 # item_id, item_type, item_features
embedding_dim = 100

loaded_model = TwoTowerNetwork(user_feature_dim, item_feature_dim, embedding_dim)
loaded_model.load_state_dict(torch.load(save_path + 'two_tower_model_ver1.pth'))
loaded_model.eval()

TwoTowerNetwork(
  (user_tower): Sequential(
    (0): Linear(in_features=3, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=100, bias=True)
  )
  (item_tower): Sequential(
    (0): Linear(in_features=1000, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=100, bias=True)
  )
)

In [43]:
# Create the two-tower model
model = TwoTowerNetwork(user_feature_dim, item_feature_dim, embedding_dim)

# Create the custom dataset and data loader
dataset = CustomDataset(merged_df)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

   user_id  gender_encode  country_encode  age_normalized  \
0        1              1               1        0.410526   
1        1              1               1        0.410526   
2        1              1               1        0.410526   
3        1              1               1        0.410526   
4        1              1               1        0.410526   
5        1              1               1        0.410526   
6        1              1               1        0.410526   
7        1              1               1        0.410526   
8        1              1               1        0.410526   
9        1              1               1        0.410526   

                                item_id  \
0  5739db20-e871-423b-8f4d-2cc7442928b0   
1  2c95a22d-1b2c-4baa-a540-8b6c61ea7e34   
2  9b4030eb-4194-4ce7-95aa-198674ae52bc   
3  da380296-8ab7-46c6-9fbc-38ef9180563f   
4  4e2f0be4-744a-4d46-af49-3fe6813744ed   
5  d0cd7478-13a7-4cb1-bc43-ce043061ea5b   
6  e592cfff-b447-4304-b4a7-

In [46]:

for user_feats, item_feats, item_id in data_loader:
    model.eval()
    
    user_embeds, item_embeds = model(user_feats, item_feats)
    cos_sim = nn.functional.cosine_similarity(user_embeds, item_embeds)
    prob = torch.sigmoid(cos_sim.unsqueeze(1))
    print(prob)

('895e5e92-ba69-4a21-b5b1-2520f33a52fc',)
tensor([[0.5004]], grad_fn=<SigmoidBackward0>)
('5739db20-e871-423b-8f4d-2cc7442928b0',)
tensor([[0.5227]], grad_fn=<SigmoidBackward0>)
('e592cfff-b447-4304-b4a7-3a3df3ed66f5',)
tensor([[0.4974]], grad_fn=<SigmoidBackward0>)
('4e2f0be4-744a-4d46-af49-3fe6813744ed',)
tensor([[0.4990]], grad_fn=<SigmoidBackward0>)
('d0cd7478-13a7-4cb1-bc43-ce043061ea5b',)
tensor([[0.5025]], grad_fn=<SigmoidBackward0>)
('da380296-8ab7-46c6-9fbc-38ef9180563f',)
tensor([[0.5133]], grad_fn=<SigmoidBackward0>)
('0a56d914-4899-4bbf-9cd3-f0e34e3be6dc',)
tensor([[0.5295]], grad_fn=<SigmoidBackward0>)
('2c95a22d-1b2c-4baa-a540-8b6c61ea7e34',)
tensor([[0.4952]], grad_fn=<SigmoidBackward0>)
('9b4030eb-4194-4ce7-95aa-198674ae52bc',)
tensor([[0.5173]], grad_fn=<SigmoidBackward0>)
('eced7433-d92b-4bcf-94b6-366a790b0079',)
tensor([[0.5051]], grad_fn=<SigmoidBackward0>)


Unnamed: 0,item_id,embedding
0,5739db20-e871-423b-8f4d-2cc7442928b0,"[-1.1496120691299438, -0.09155219793319702, -1..."
1,46547842-79f7-4f96-957a-3a29ad9a2f3a,"[-1.7329914569854736, 0.5699731111526489, -1.9..."
2,2c95a22d-1b2c-4baa-a540-8b6c61ea7e34,"[-0.7228033542633057, -0.8741176724433899, -2...."


In [None]:
unseen_item_embeddings = np.array(data_df['embedding'].tolist())  #np.array(data_df.apply(lambda row: row['embedding'], axis=1)) 
unseen_item_embeddings = data_df.apply(lambda row: row['embedding'], axis=1)
unseen_item_embeddings[0]