In [None]:
import torch
import torch.nn as nn

# Define a simple neural network for dimensionality reduction
class DimensionReducer(nn.Module):
    """Single linear projection followed by a tanh non-linearity.

    Maps feature vectors of size ``input_dim`` to vectors of size
    ``output_dim``; tanh bounds every output component to (-1, 1).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(linear(x))`` for the input tensor ``x``."""
        return self.tanh(self.linear(x))

# Specify input and output dimensions for reduction
input_dim = 768  # BERT embedding size
output_dim = 8  # Reduced dimensionality

# The reducer is used below with its randomly initialised weights, so the
# projection (and every file derived from it) depends on the RNG state at
# construction time. Seed first so a kernel restart reproduces the same
# reduced embeddings.
reducer_seed = 0
torch.manual_seed(reducer_seed)

# Create the dimension reduction model
reducer_model = DimensionReducer(input_dim, output_dim)

In [None]:
# Preloaded from disk
import pandas as pd
import numpy as np
import pickle

# Read the ratings/tags table and drop the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV -- confirm).
movies_ratings_and_tags = pd.read_csv(
    "../data/movies_ratings_and_tags_mlens_small.csv"
).drop(columns='Unnamed: 0')

In [None]:
# Load the current train and test trajectories with action space's shape 768
import pickle

# Use the same notebook-relative data directory as the other cells instead of
# an absolute path that only exists on one machine.
trajectories_dir = "../data/dt-datasets/movielens/train-test-sets"

# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or otherwise trust.
with open(f"{trajectories_dir}/mlens-train-trajectories-movies-as-actions.pkl", 'rb') as f:
    train_trajectories = pickle.load(f)

with open(f"{trajectories_dir}/mlens-test-trajectories-movies-as-actions.pkl", 'rb') as f:
    test_trajectories = pickle.load(f)

##### Create a vocabulary

In [None]:
from tqdm import tqdm


def _reduce_actions_and_index(trajectories, embed_to_id):
    """Reduce each trajectory's action embeddings in place and index them by movie id.

    For every trajectory, ``traj['actions']`` is reshaped to rows of
    ``input_dim`` values, passed through ``reducer_model`` and overwritten with
    the resulting NumPy array. Each reduced row is then paired, in order, with
    the ``movieId`` values of the rows in ``movies_ratings_and_tags`` whose
    ``userId`` equals ``traj['user_id']`` and stored in ``embed_to_id`` as
    ``tuple(row) -> movieId``.

    Reads the notebook globals ``input_dim``, ``output_dim``, ``reducer_model``
    and ``movies_ratings_and_tags``.

    Args:
        trajectories: iterable of dicts with ``'user_id'`` and ``'actions'``
            (NumPy array) entries; each dict is mutated in place.
        embed_to_id: dict updated in place with embedding -> movie id pairs.

    Raises:
        RuntimeError: if a trajectory's actions already have ``output_dim``
            columns, i.e. the cell is being re-run on already-reduced data.
    """
    for traj in tqdm(trajectories):
        actions = traj['actions']

        # This cell overwrites traj['actions']; running it twice would push the
        # reduced rows back through .view(-1, input_dim), which either crashes
        # or silently regroups the values. Fail loudly instead.
        if input_dim != output_dim and actions.ndim == 2 and actions.shape[1] == output_dim:
            raise RuntimeError(
                "traj['actions'] already has output_dim columns; reload the "
                "trajectories before re-running this cell"
            )

        user_rows = movies_ratings_and_tags['userId'] == traj['user_id']
        movie_ids = movies_ratings_and_tags.loc[user_rows, 'movieId'].tolist()

        embeddings_flattened = torch.from_numpy(actions).view(-1, input_dim)
        # Inference only: no autograd graph is needed for the projection.
        with torch.no_grad():
            reduced_embeddings = reducer_model(embeddings_flattened).numpy()
        traj['actions'] = reduced_embeddings

        # NOTE(review): zip stops at the shorter sequence, so this assumes the
        # trajectory's actions line up one-to-one, in order, with the user's
        # rows in movies_ratings_and_tags -- confirm for both train and test.
        for mid, red_embed in zip(movie_ids, reduced_embeddings):
            embed_to_id[tuple(red_embed)] = mid


# Train trajectories are processed first, then test, into one shared mapping.
movie_embed_to_id = {}
_reduce_actions_and_index(train_trajectories, movie_embed_to_id)
_reduce_actions_and_index(test_trajectories, movie_embed_to_id)

with open(f"../data/dt-datasets/movielens/processed-data/movie_embed_with_shape_{output_dim}_to_id_mapping_with_tanh.pkl", 'wb') as f:
    pickle.dump(movie_embed_to_id, f)

In [None]:
# Optional reload of a previously saved mapping (note: this path has no
# "_with_tanh" suffix, unlike the file written by the cell above).
# with open(f"../data/dt-datasets/movielens/processed-data/movie_embed_with_shape_{output_dim}_to_id_mapping.pkl", 'rb') as f:
#     movie_embed_to_id = pickle.load(f)

# The mapping's keys are the reduced embeddings; gather them into one NumPy
# array and persist it as the action vocabulary.
all_actions = [*movie_embed_to_id]
action_vocab = np.array(all_actions)

with open(f"../data/dt-datasets/movielens/processed-data/action_vocab_of_shape_{output_dim}_with_tanh.pkl", 'wb') as f:
    pickle.dump(action_vocab, f)

In [None]:
# Sanity check: size of the second axis of the first train trajectory's actions.
np.shape(train_trajectories[0]['actions'])[1]

In [None]:
# Persist the reduced 'movies as actions' trajectories (train first, then test)
# as pickle files so later notebooks can load them directly.
import pickle

reduced_outputs = [
    (f'../data/dt-datasets/movielens/train-test-sets/mlens-train-trajectories-movies-as-actions-reduced-from-{input_dim}-to-{output_dim}_with_tanh.pkl', train_trajectories),
    (f'../data/dt-datasets/movielens/train-test-sets/mlens-test-trajectories-movies-as-actions-reduced-from-{input_dim}-to-{output_dim}_with_tanh.pkl', test_trajectories),
]
for out_path, trajectories in reduced_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(trajectories, f)

In [None]:
import numpy as np

# Example group of vectors (one candidate per row)
vectors = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Example single vector (the query)
single_vector = np.array([2, 3, 4])

# Cosine similarity = dot(u, v) / (|u| * |v|), evaluated for every row at once:
# numerator is one dot product per row, denominator the product of the norms.
dot_products = vectors @ single_vector
row_norms = np.linalg.norm(vectors, axis=1)
query_norm = np.linalg.norm(single_vector)
similarities = dot_products / (row_norms * query_norm)

# One similarity per row of `vectors`
print("Cosine similarities:", similarities)


In [None]:
import numpy as np

# Example group of vectors (one candidate per row)
vectors = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Example single vector (the query)
single_vector = np.array([2, 3, 4])

# Cosine similarity of the query against every row
similarities = (vectors @ single_vector) / (
    np.linalg.norm(vectors, axis=1) * np.linalg.norm(single_vector)
)

# Row with the highest similarity, and the vector stored there
most_similar_index = similarities.argmax()
most_similar_vector = vectors[most_similar_index]

print("Most similar vector:", most_similar_vector)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Example group of vectors (one candidate per row)
vectors = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Example single vector, written directly as a 1 x 3 matrix because
# sklearn's pairwise functions expect 2-D inputs
single_vector = np.array([[2, 3, 4]])

# Column of similarities: one row per entry of `vectors`
similarities = cosine_similarity(vectors, single_vector)

# argmax over the flattened column gives the best-matching row index
most_similar_index = similarities.argmax()
most_similar_vector = vectors[most_similar_index]

print("Most similar vector:", most_similar_vector)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Example groups of vectors
group1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12]
])

group2 = np.array([
    [2, 3, 4],
    [5, 6, 7],
    [8, 9, 10],
    [11, 12, 13]
])

# One batched call instead of a Python loop of 1 x N calls:
# similarities[i, j] is the cosine similarity between group2[i] and group1[j].
similarities = cosine_similarity(group2, group1)

# For each vector in group2, the index of its most similar vector in group1
most_similar_indices = np.argmax(similarities, axis=1)

# Gather the matching group1 rows (one per group2 vector) in a single indexing step
most_similar_vectors_group1 = group1[most_similar_indices]

# Display the most similar vectors from group1 for each vector in group2
print("Most similar vectors from group1 for each vector in group2:")
print(most_similar_vectors_group1)


In [None]:
# Toy vocabulary: 20 random 3-dimensional vectors
vocab = torch.randn((20, 3))
vocab.size()

In [None]:
# Random (2, 2, 3) batch, collapsed so every 3-dimensional vector is its own row
a = torch.randn((2, 2, 3))
a = a.reshape(-1, a.size(2))
a.size()

In [None]:
# Pairwise cosine similarity with `vocab` as the first argument: one row per
# vocab entry, one column per row of `a` (the transposed orientation of the
# `cosine_similarity(a, vocab)` call in the next cell).
cosine_similarity(vocab, a)

In [None]:
# Pairwise cosine similarity with `a` as the first argument: one row per
# vector in `a`, one column per vocab entry.
similarities = cosine_similarity(a, vocab)
similarities

In [None]:
# Column index of the largest similarity in each row
indices = similarities.argmax(axis=1)

In [None]:
# Display the vocabulary tensor for inspection.
vocab

In [None]:
# Select the vocab rows at `indices`, i.e. one vocabulary vector per entry of `indices`.
vocab[indices]

In [None]:
# Position of the largest value in the first row of `similarities`
similarities[0].argmax()

In [None]:
# Sort each row's column indices by descending similarity (negation flips the
# ascending argsort) and keep only the first column: the top-1 index per row,
# returned as an (n_rows, 1) array. Widen the slice (e.g. `:k`) for top-k.
np.argsort(-similarities, axis=1)[:, :1]