In [None]:
#User Representation

import numpy as np
import torch
import matplotlib.pyplot as plt
from cuml.manifold import TSNE
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Test-split CSV. The code in this cell reads its 'category', 'parent_asin'
# and 'history_text' columns.
df = pd.read_csv('/kaggle/input/diss-train/data.test.csv')

# Display name -> checkpoint handed to SentenceTransformer(...).
# Insertion order matters: the first entry is drawn in the left panel.
MODELS = {
    'ST5-Only': 'sentence-transformers/sentence-t5-base',
    'ST5-Final': '/kaggle/input/models/models/final_model',
}

# Values of the 'category' column to sample; the list position of a domain
# is used as its integer label.
domains = [
    'All_Beauty',
    'Baby_Products',
    'Video_Games',
    'Beauty_and_Personal_Care',
    'Cell_Phones_and_Accessories',
    'Electronics',
    'Health_and_Household',
    'Movies_and_TV',
    'Toys_and_Games',
]

# One matplotlib colour per entry of `domains`, in the same order.
colors = [
    'red',
    'blue',
    'green',
    'orange',
    'purple',
    'brown',
    'pink',
    'gray',
    'olive',
]

def get_top_users(df, domain, n=1000):
    """Return up to ``n`` distinct ``history_text`` values for one domain.

    Rows are first restricted to ``category == domain``. The ``n`` most
    frequent ``parent_asin`` values in that subset are then selected, and
    only rows carrying one of those values are kept.

    NOTE(review): the ranking key is ``parent_asin`` even though the function
    is named after users -- confirm this is the intended grouping column.

    Args:
        df: DataFrame with 'category', 'parent_asin' and 'history_text'
            columns.
        domain: Value of the 'category' column to filter on.
        n: Maximum number of ``parent_asin`` values to rank and maximum
            number of texts to return.

    Returns:
        Array of unique ``history_text`` values, at most ``n`` long, in
        order of first appearance.
    """
    in_domain = df.loc[df['category'] == domain]
    asin_counts = in_domain['parent_asin'].value_counts()
    most_frequent = asin_counts.nlargest(n).index
    keep = in_domain['parent_asin'].isin(most_frequent)
    histories = in_domain.loc[keep, 'history_text'].unique()
    return histories[:n]

def create_embeddings(model, users):
    """Encode ``users`` with ``model.encode`` and return the tensor result.

    Args:
        model: Object exposing a SentenceTransformer-style ``encode`` method.
        users: Collection of texts to embed.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=True`` (a progress bar is shown while encoding).
    """
    encoded = model.encode(
        users,
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    return encoded

def visualize_embeddings(embeddings1, embeddings2, labels, domains, title1, title2, seed,
                         save_path='/kaggle/working/user-rep.png'):
    """Plot two embedding sets side by side after projecting each to 2-D with t-SNE.

    Each embedding set is reduced independently with the same ``seed``; points
    are coloured by domain using the module-level ``colors`` list, and a single
    shared legend is placed below both panels.

    Args:
        embeddings1: Embedding matrix shown in the left panel.
        embeddings2: Embedding matrix shown in the right panel.
        labels: One integer per embedding row; value ``i`` marks a row that
            belongs to ``domains[i]``. Lists are accepted and converted to an
            array so the per-domain boolean masks are element-wise.
        domains: Domain names, used for the legend entries.
        title1: Title of the left panel.
        title2: Title of the right panel.
        seed: ``random_state`` passed to TSNE.
        save_path: Where to write the figure. Defaults to the location the
            notebook has always used; pass ``None`` to skip saving.
    """
    # A plain list compared with an int yields a single bool rather than a
    # mask, so normalise to an array up front.
    labels = np.asarray(labels)

    tsne = TSNE(n_components=2, random_state=seed)
    panels = [
        (tsne.fit_transform(embeddings1), title1),
        (tsne.fit_transform(embeddings2), title2),
    ]

    fig, axs = plt.subplots(1, 2, figsize=(16, 8), sharey=True)

    for ax, (reduced, panel_title) in zip(axs, panels):
        for i, domain in enumerate(domains):
            mask = labels == i
            ax.scatter(reduced[mask, 0], reduced[mask, 1],
                       c=[colors[i]], label=domain, alpha=0.6)
        ax.set_title(panel_title)

    # Common legend at the bottom. Both panels carry identical entries, so the
    # first axis is enough. Distinct names keep the `labels` argument intact.
    legend_handles, legend_names = axs[0].get_legend_handles_labels()
    fig.legend(legend_handles, legend_names, loc='lower center',
               bbox_to_anchor=(0.5, -0.05), ncol=len(domains), title="Categories")
    plt.tight_layout(rect=[0, 0.03, 1, 1])
    if save_path is not None:
        plt.savefig(save_path, bbox_inches='tight')
    plt.show()

# Collect the texts of every domain once, so that each model embeds exactly
# the same inputs and `labels` lines up with the rows of every matrix.
labels = []
users_by_domain = []
for i, domain in enumerate(domains):
    users = get_top_users(df, domain)
    if len(users) == 0:
        # Nothing to embed for this domain; skipping keeps the later
        # concatenation free of empty chunks.
        continue
    users_by_domain.append(users)
    labels.extend([i] * len(users))

# Load each checkpoint a single time and run all domains through it.
# Re-creating the SentenceTransformer for every (domain, model) pair repeats
# the expensive model load without changing the result.
all_embeddings = {}
for model_name, model_path in MODELS.items():
    model = SentenceTransformer(model_path)
    per_domain = []
    for users in tqdm(users_by_domain, desc=f"Processing domains ({model_name})"):
        embeddings = create_embeddings(model, users)
        per_domain.append(embeddings.cpu().detach().numpy())
    all_embeddings[model_name] = np.concatenate(per_domain, axis=0)
    # Drop the reference before the next checkpoint is loaded.
    del model


seeds = [123]
model_names = list(all_embeddings.keys())
label_array = np.array(labels)
for seed in seeds:
    visualize_embeddings(all_embeddings[model_names[0]],
                         all_embeddings[model_names[1]],
                         label_array, domains,
                         f"User Representations - {model_names[0]} Model",
                         f"User Representations - {model_names[1]} Model",
                         seed)


In [None]:
# Item Representation

import numpy as np
import torch
import matplotlib.pyplot as plt
from cuml.manifold import TSNE
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import os

# Test-split CSV. The code in this cell reads its 'category' and
# 'target_item_text' columns.
df = pd.read_csv('/kaggle/input/diss-train/data.test.csv')

# Display name -> checkpoint identifier (hub id or local directory).
MODELS = {
    'ST5-Only': 'sentence-transformers/sentence-t5-base',
    'ST5-Final': '/kaggle/input/models/models/final_model',
    'UniSRec(BLAIR)': 'hyp1231/blair-roberta-base',
}

# Values of the 'category' column to sample; the list position of a domain
# is used as its integer label.
domains = [
    'All_Beauty',
    'Baby_Products',
    'Video_Games',
    'Beauty_and_Personal_Care',
    'Cell_Phones_and_Accessories',
    'Electronics',
    'Health_and_Household',
    'Movies_and_TV',
    'Toys_and_Games',
]

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Directory that receives the exported feature files.
output_dir = '/kaggle/working/'

# One matplotlib colour per entry of `domains`, in the same order.
colors = [
    'red',
    'blue',
    'green',
    'orange',
    'purple',
    'brown',
    'pink',
    'gray',
    'olive',
]


def get_items(df, domain, n=1000):
    """Return up to ``n`` distinct ``target_item_text`` values for one domain.

    Duplicates are removed *before* the result is truncated. Truncating first
    would return fewer than ``n`` texts whenever the leading rows repeat an
    item, even if the domain holds enough distinct items.

    Args:
        df: DataFrame with 'category' and 'target_item_text' columns.
        domain: Value of the 'category' column to filter on.
        n: Maximum number of texts to return (default 1000).

    Returns:
        List of unique texts in order of first appearance; shorter than ``n``
        only when the domain has fewer distinct texts.
    """
    domain_df = df[df['category'] == domain]
    unique_texts = domain_df['target_item_text'].drop_duplicates()
    return unique_texts.head(n).tolist()

def create_embeddings_st(model, texts):
    """Encode ``texts`` with ``model.encode`` and return the tensor result.

    Args:
        model: Object exposing a SentenceTransformer-style ``encode`` method.
        texts: Collection of texts to embed.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=True`` (a progress bar is shown while encoding).
    """
    encoded = model.encode(
        texts,
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    return encoded

def create_embeddings_blair(model, tokenizer, texts, batch_size=64):
    """Embed ``texts`` with a Hugging Face encoder, one mini-batch at a time.

    The literal markers ``<extra_id_1>`` and ``<extra_id_2>`` are stripped from
    every text before tokenisation (presumably field separators meant for the
    T5-based models -- confirm against the data preparation code). Each text
    is represented by the last-layer hidden state at sequence position 0.

    Texts are processed in chunks of ``batch_size`` so that a long list is not
    tokenised and pushed through the model in one forward pass, which can
    exhaust accelerator memory.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        Tensor with one row per input text, located on the module-level
        ``device``. An empty ``texts`` yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    cleaned_texts = [text.replace('<extra_id_1>', '').replace('<extra_id_2>', '') for text in texts]

    first_token_states = []
    with torch.no_grad():
        for start in range(0, len(cleaned_texts), batch_size):
            batch = cleaned_texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True,
                               max_length=512, return_tensors='pt').to(device)
            outputs = model(**inputs)
            first_token_states.append(outputs.last_hidden_state[:, 0, :])

    if not first_token_states:
        # torch.cat rejects an empty list; return an explicit zero-row tensor.
        return torch.empty((0, model.config.hidden_size), device=device)
    return torch.cat(first_token_states, dim=0)

def visualize_embeddings(embeddings, domain_labels, title, seed):
    """Project ``embeddings`` to 2-D with t-SNE and scatter-plot them by domain.

    Args:
        embeddings: Embedding matrix, one row per item.
        domain_labels: Array with one integer per row; value ``i`` marks a row
            that belongs to ``domains[i]`` (module-level list).
        title: Figure title.
        seed: ``random_state`` passed to TSNE; also printed before the figure
            is shown.
    """
    reducer = TSNE(n_components=2, random_state=seed)
    points_2d = reducer.fit_transform(embeddings)

    plt.figure(figsize=(12, 8))
    for idx, name in enumerate(domains):
        in_domain = domain_labels == idx
        xs = points_2d[in_domain, 0]
        ys = points_2d[in_domain, 1]
        plt.scatter(xs, ys, c=colors[idx], label=name, alpha=0.4)

    print('SEED:',seed)
    plt.title(f"{title}")
    # Legend sits outside the axes, anchored to the upper-right corner.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Key of the BLAIR checkpoint in MODELS. Kept in one place because the
# comparison below must match the dictionary key exactly (case included).
BLAIR_NAME = 'UniSRec(BLAIR)'

# Initialize BLAIR model and tokenizer
blair_tokenizer = AutoTokenizer.from_pretrained(MODELS[BLAIR_NAME])
blair_model = AutoModel.from_pretrained(MODELS[BLAIR_NAME]).to(device)

# Collect the texts of every domain once, so that each model embeds exactly
# the same inputs and `domain_labels` lines up with the rows of every matrix.
domain_labels = []
texts_by_domain = []
for i, domain in enumerate(domains):
    texts = get_items(df, domain)
    print(f"{domain}: {len(texts)}")
    if not texts:
        # Nothing to embed; skipping keeps the later concatenation free of
        # empty chunks.
        continue
    texts_by_domain.append(texts)
    domain_labels.extend([i] * len(texts))

# Calculate embeddings, loading each SentenceTransformer checkpoint only once.
all_embeddings = {}
for model_name, model_path in MODELS.items():
    is_blair = model_name == BLAIR_NAME
    st_model = None if is_blair else SentenceTransformer(model_path)
    per_domain = []
    for texts in tqdm(texts_by_domain, desc=f"Processing domains ({model_name})"):
        if is_blair:
            embeddings = create_embeddings_blair(blair_model, blair_tokenizer, texts)
        else:
            embeddings = create_embeddings_st(st_model, texts)
        # Move results off the accelerator right away so they do not pile up
        # in device memory across domains.
        per_domain.append(embeddings.detach().cpu())
    all_embeddings[model_name] = torch.cat(per_domain, dim=0).numpy()
    # Drop the reference before the next checkpoint is loaded.
    del st_model

# Save BLAIR embeddings. Rows are selected through the label array rather
# than fixed 1000-row slices, because a domain can contribute fewer texts
# and fixed slices would then straddle domain boundaries.
label_array = np.array(domain_labels)
for i, domain in enumerate(domains):
    domain_embeddings = all_embeddings[BLAIR_NAME][label_array == i]
    file_path = os.path.join(output_dir, f'{domain}.blair.feature')
    domain_embeddings.tofile(file_path)
    print(f"Saved BLAIR embeddings for {domain} to {file_path}")

# Visualize with different seeds
seeds = [42, 123, 456]
for seed in seeds:
    for model_name, embeddings in all_embeddings.items():
        visualize_embeddings(embeddings, label_array, f"Item Representations - {model_name}", seed)