Load target clients

In [None]:
import numpy as np

# Load the .npy file that holds the ids of the clients to embed
array = np.load('ubc/relevant_clients.npy')

# Convert to a Python list so the ids can be iterated as plain Python values
# (they are later used as dictionary keys when looking up CSV rows)
client_ids_list = array.tolist()
# Notebook-style trailing expression: displays how many client ids were loaded
len(client_ids_list)

Create input and target vocabularies

In [None]:
# @title create input and target vocabs
import pandas as pd

# Read the raw table, narrow it to the two sequence columns, and discard any
# row in which either sequence is missing.
sequence_columns = ['input_seq', 'target_seq']
df = (
    pd.read_csv("recsys_data.csv")[sequence_columns]
    .dropna(subset=sequence_columns)
)


# NOTE(review): min_freq is defined but never applied anywhere in this cell;
# every token that occurs at least once ends up in the vocabulary.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"
special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]


def _collect_tokens(sequences, seed_tokens):
    """Return the sorted list of unique tokens found in *sequences*.

    Args:
        sequences: iterable of whitespace-separated token strings.
        seed_tokens: tokens that must be present even if they never occur
            in *sequences* (the special tokens).

    Returns:
        A lexicographically sorted list with one entry per distinct token.
    """
    # A set gives O(1) membership tests; the previous list-based
    # "if tok not in toks" check was linear per token.
    seen = set(seed_tokens)
    for sequence in sequences:
        seen.update(sequence.split())
    return sorted(seen)


input_toks = _collect_tokens(df['input_seq'], special_tokens)
target_toks = _collect_tokens(df['target_seq'], special_tokens)

# Token -> integer id, assigned in sorted-token order.
input_vocab = {tok: idx for idx, tok in enumerate(input_toks)}
target_vocab = {tok: idx for idx, tok in enumerate(target_toks)}

Initialize models

In [None]:
# @title init models
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

class Encoder(nn.Module):
    """Embed a token-id sequence and run it through a multi-layer LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """Build the embedding table, the LSTM and the embedding dropout.

        Args:
            input_dim: number of rows in the embedding table.
            embedding_dim: size of each embedding vector.
            hidden_dim: LSTM hidden-state size.
            n_layers: number of LSTM layers.
            dropout: probability used both on the embeddings and between
                LSTM layers.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return ``(hidden, cell)``, the LSTM's final states for ``src``.

        ``src`` holds token ids; the LSTM is built without ``batch_first``,
        so a 2-D ``src`` is read as (seq_len, batch).
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """One-step LSTM decoder that maps the previous token to vocabulary scores."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """Build the embedding table, the LSTM, the output projection and dropout.

        Args:
            output_dim: number of embedding rows and size of the score vector.
            embedding_dim: size of each embedding vector.
            hidden_dim: LSTM hidden-state size.
            n_layers: number of LSTM layers.
            dropout: probability used both on the embeddings and between
                LSTM layers.
        """
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(
            num_embeddings=output_dim,
            embedding_dim=embedding_dim,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by a single time step.

        Args:
            input: token ids for the current step, one per batch element.
            hidden: LSTM hidden state carried in from the previous step.
            cell: LSTM cell state carried in from the previous step.

        Returns:
            ``(prediction, hidden, cell)``: the scores produced by ``fc_out``
            and the updated LSTM states.
        """
        # The LSTM expects a leading sequence axis, so add one of length 1.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # Drop the length-1 sequence axis again before projecting.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """Store the two sub-modules and check that their LSTM shapes agree.

        Raises:
            AssertionError: if the encoder and decoder differ in hidden size
                or in number of layers.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's final states are fed straight into the decoder's LSTM,
        # so both must use the same state layout.
        same_hidden = encoder.hidden_dim == decoder.hidden_dim
        assert same_hidden, "Hidden dimensions of encoder and decoder must be equal!"
        same_depth = encoder.n_layers == decoder.n_layers
        assert same_depth, "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode ``trg`` conditioned on ``src``.

        Args:
            src: source token ids passed to the encoder.
            trg: target token ids indexed as (step, batch); row 0 is used as
                the first decoder input.
            teacher_forcing_ratio: per-step probability of feeding the
                ground-truth token instead of the decoder's own argmax.

        Returns:
            Tensor of shape (trg_length, batch, decoder.output_dim); slot 0 is
            left as zeros because decoding starts at step 1.
        """
        n_steps, n_batch = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(n_steps, n_batch, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        step_input = trg[0, :]
        for step in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[step] = scores
            # One random draw per step decides the next input.
            use_ground_truth = random.random() < teacher_forcing_ratio
            greedy_choice = scores.argmax(1)
            if use_ground_truth:
                step_input = trg[step]
            else:
                step_input = greedy_choice
        return outputs


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_size = 768

# Forward model: encoder over the input vocabulary, decoder over the target
# vocabulary.
# NOTE(review): 129 and 189 are presumably len(input_vocab) and
# len(target_vocab); they must match the shapes stored in the checkpoints —
# TODO confirm.
encoder = Encoder(129, 256, hidden_size, 2, 0.5,)
decoder = Decoder(189, 256, hidden_size, 2, 0.5,)
model = Seq2Seq(encoder, decoder, device).to(device)

# Reversed model: same architecture with the two vocabulary sizes swapped.
encoder = Encoder(189, 256, hidden_size, 2, 0.5,)
decoder = Decoder(129, 256, hidden_size, 2, 0.5,)
model_revesed = Seq2Seq(encoder, decoder, device).to(device)

# Checkpoints are deserialized on CPU; load_state_dict then copies the values
# into the parameters that already live on `device`.
checkpoint = torch.load("recsys_model_last.pt", map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
print(f"model recsys_model_last epoch: {checkpoint['epoch']}")
checkpoint = torch.load("recsys_model_last_reversed.pt", map_location=torch.device('cpu'))
model_revesed.load_state_dict(checkpoint['model_state_dict'])
print(f"model recsys_model_last_reversed epoch: {checkpoint['epoch']}")
model.to(device)
model_revesed.to(device)

# These models are only used for inference below. Without eval() the 0.5
# dropout on the embeddings and between LSTM layers stays active
# (torch.no_grad() does not turn it off), which makes every extracted
# embedding random and non-reproducible.
model.eval()
model_revesed.eval()

print("all models loaded!")


In [None]:
from collections import defaultdict

def _encode_sequence(sequence, vocab, seq_encoder):
    """Return one embedding vector for a whitespace-separated token string.

    The sequence is wrapped in ``<sos>`` / ``<eos>``, mapped to ids through
    *vocab* (tokens missing from the vocab fall back to ``<unk>`` instead of
    raising ``KeyError``), run through *seq_encoder* as a batch of one, and the
    final hidden states of all LSTM layers are averaged.

    Args:
        sequence: whitespace-separated tokens.
        vocab: token -> id mapping containing the ``<unk>``, ``<sos>`` and
            ``<eos>`` entries.
        seq_encoder: module returning ``(hidden, cell)`` with ``hidden``
            laid out as (n_layers, batch, hidden_dim).

    Returns:
        1-D NumPy array of length hidden_dim.
    """
    tokens = ["<sos>"] + sequence.split() + ["<eos>"]
    unk_id = vocab["<unk>"]
    ids = [vocab.get(tok, unk_id) for tok in tokens]
    # Shape (seq_len, 1): a single-sequence batch.
    tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
    hidden, _ = seq_encoder(tensor)
    # Drop the batch axis, then average over layers (for the 2-layer models
    # used here this equals (hidden[0] + hidden[1]) / 2).
    return hidden.squeeze(1).mean(dim=0).detach().cpu().numpy()


# Index the raw CSV lines by client id. The first column is read as the
# integer client id; the header row is skipped.
dict_rdata = {}
with open("recsys_data.csv", encoding="utf-8") as f:
    rdata = f.readlines()[1:]
for line in rdata:
    c_id = int(line.split(",", 1)[0].strip())
    dict_rdata[c_id] = line

# One row per requested client, so the arrays always line up with
# client_ids_list (previously hard-coded to 1_000_000 x 768).
all_embeddings1 = np.zeros(
    (len(client_ids_list), model.encoder.hidden_dim), dtype=np.float32
)  # model A
all_embeddings2 = np.zeros(
    (len(client_ids_list), model_revesed.encoder.hidden_dim), dtype=np.float32
)  # model B

with torch.no_grad():
    for idx, client_id in enumerate(client_ids_list):
        if (idx + 1) % 50_000 == 0:
            print(idx + 1)

        line = dict_rdata.get(client_id)
        if line is None:
            # No CSV row for this client: its embedding rows stay all-zero.
            continue

        # Columns 1 and 2 are taken as the input and target sequences.
        fields = line.split(",")
        input_seq = fields[1].strip()
        target_seq = fields[2].strip()

        all_embeddings1[idx] = _encode_sequence(
            input_seq, input_vocab, model.encoder
        )
        all_embeddings2[idx] = _encode_sequence(
            target_seq, target_vocab, model_revesed.encoder
        )



Average the two models' embeddings

In [None]:
# Element-wise mean of the two per-model embedding arrays.
all_embeddings = 0.5 * (all_embeddings1 + all_embeddings2)

Create final submission

In [None]:
import numpy as np

import os
import zipfile

# Store the averaged embeddings in half precision.
all_embeddings_fp16 = all_embeddings.astype(np.float16)
np.save('embeddings.npy', all_embeddings_fp16)

# Python equivalent of the shell line `mv relevant_clients.npy client_ids.npy`
# (a bare shell command is not valid Python).
# NOTE(review): the ids were loaded from 'ubc/relevant_clients.npy' but this
# renames 'relevant_clients.npy' in the working directory — confirm the path.
os.replace('relevant_clients.npy', 'client_ids.npy')

# Python equivalent of `zip -r embeddings.zip embeddings.npy client_ids.npy`.
# Mode 'w' always writes a fresh archive rather than updating an existing one.
with zipfile.ZipFile(
    'embeddings.zip', 'w', compression=zipfile.ZIP_DEFLATED
) as archive:
    for member in ('embeddings.npy', 'client_ids.npy'):
        archive.write(member)