In [None]:
import torch
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint identifier passed to both from_pretrained calls below
# (presumably a local directory or hub id -- confirm it resolves here).
model_checkpoint = "esm2_t6_8M"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model = model.to(device)  # move the weights onto the selected device

# Parse the CSV once and slice both columns from the same frame, instead of
# reading the file twice: first column -> names, last column -> sequences.
_clu90_table = pd.read_csv("clu90_test.csv")
sequence_name = _clu90_table.iloc[:, 0]
sequence = _clu90_table.iloc[:, -1]

def generate_embeddings(sequence, max_length=170):
    """Return the model's output logits for ``sequence`` as a flat vector.

    The input is tokenized with padding and truncation to ``max_length``
    tokens, run through the module-level ``model`` without gradient
    tracking, and the resulting ``logits`` tensor is flattened to 1-D.

    Note: despite the name, the returned values are the masked-LM output
    logits (all positions, padding included), not hidden-state embeddings.

    Args:
        sequence: Text handed to the module-level ``tokenizer``.
        max_length: Token length to pad/truncate to.

    Returns:
        A 1-D NumPy array containing the flattened logits.
    """
    encoded = tokenizer(
        sequence,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    flat = logits.view(-1)
    return flat.cpu().numpy()

# Compute one flat vector per sequence (tqdm shows progress), then stack the
# vectors row-wise into a single 2-D array.
embeddings = [generate_embeddings(seq) for seq in tqdm(sequence)]
embeddings = np.vstack(embeddings)

# Project the stacked vectors down to two dimensions with t-SNE.
# random_state is fixed so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Tabulate the 2-D coordinates and attach each sequence's name
# (the name column is aligned on the row index).
tsne_df = pd.DataFrame(
    embeddings_2d,
    columns=['Component 1', 'Component 2'],
).assign(name=sequence_name)
# Optional export:
# tsne_df.to_csv('tsne.csv', index=False)

# Scatter plot of the two t-SNE components, one point per sequence.
fig, ax = plt.subplots(figsize=(10, 8))
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
ax.scatter(x_coords, y_coords, alpha=0.5)  # semi-transparent so overlaps stay visible
ax.set_title('T-SNE of Sequence Embeddings')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
# Optional export:
# plt.savefig('tsne.png', dpi=300)
plt.show()