# Test
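This is the annotation for the 6cwmA chain (PDB entry 6cwm, chain A). Each binding residue is written as its one-letter amino-acid code followed by its 0-based position in the sequence (e.g. `F7` is `sequence[7]`):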
```
6cwm;A;UNKNOWN;F7 F16 S17 G18 Y19 P20 E35 K38 V39 W65 R149;AKTTQEKFDALKEAGVFSGYPGTTDAKLGQDMTRAEFAKVLVKLFGLKEIHGQYSYKDKNYDAKNWAAPFIEAVTAEGLMQAKDLTKKIFDFNGKITVEEASKTLVTALKLEPVKDAQNKATDWAKGYFEAAVNAGLFSKDANPKANATRAQLVEAAFAADEMSKGSGSHH
```

In [1]:
import csv
import os
import pandas as pd
import numpy as np
from emmaemb.core import Emma
from emmaemb.vizualisation import (
    plot_emb_space,
    plot_knn_alignment_scores_across_k_and_distance_metrics,
    plot_pairwise_distance_comparison
)

# Root directories. The defaults are the original machine-specific locations;
# set EMMAEMB_DATA_PATH / EMMAEMB_EMBEDDINGS_PATH to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get('EMMAEMB_DATA_PATH', '/home/unix/vkrhk/EmmaEmb/data')
EMBEDDINGS_PATH = os.environ.get('EMMAEMB_EMBEDDINGS_PATH', '/media/drive2/vkrhk/embeddings')

# Directory of per-protein embedding files for each available model.
# Keeping every option in one mapping replaces the commented-out alternatives:
# to compare a different pair, change only the two names selected below.
EMBEDDING_SOURCES = {
    "ESM2": f"{EMBEDDINGS_PATH}/esm2_t33_650M_UR50D/layer_33/chopped_1022_overlap_300",
    "ANKH": f"{EMBEDDINGS_PATH}/ankh_base/layer_None/chopped_1022_overlap_300",
    "ProstT5": f"{EMBEDDINGS_PATH}/Rostlab/ProstT5/layer_None/chopped_1022_overlap_300/",
    "ProtT5": f"{EMBEDDINGS_PATH}/Rostlab/prot_t5_xl_uniref50/layer_None/chopped_1022_overlap_300",
    "ESM1": f"{EMBEDDINGS_PATH}/esm1_t34_670M_UR100/layer_34/chopped_1022_overlap_300/",
    "ESMC": f"{EMBEDDINGS_PATH}/esmc-300m-2024-12/layer_None/chopped_1022_overlap_300/",
}

# The two embedding spaces compared in this notebook.
embeddings1_name, embeddings2_name = "ESM1", "ESMC"
embeddings1_path = EMBEDDING_SOURCES[embeddings1_name]
embeddings2_path = EMBEDDING_SOURCES[embeddings2_name]

# Collect data: one feature row per residue plus the matching per-residue
# embeddings from both embedding spaces.

# The loop stops after the first successfully loaded row whose index exceeds
# this value. The check only runs after a row was loaded, so rows skipped for
# missing embedding files never trigger it.
MAX_ROW_INDEX = 100

feature_data = []
embeddings1 = []
embeddings2 = []
concatenated_embeddings1_path = f"{DATA_PATH}/concatenated-embeddings/{embeddings1_name}_binding_site_embeddings.npy"
concatenated_embeddings2_path = f"{DATA_PATH}/concatenated-embeddings/{embeddings2_name}_binding_site_embeddings.npy"


# newline='' is the mode the csv module documents for files passed to csv.reader.
with open(f'{DATA_PATH}/train.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=';')
    for ii, row in enumerate(reader):
        if not row:
            # Blank line in the annotation file.
            continue
        protein_id = row[0] + row[1]
        # Each token is "<one-letter residue><position>", e.g. "F7". split()
        # without an argument tolerates an empty field and repeated spaces;
        # a set makes the per-residue membership test below O(1).
        binding_positions = {int(token[1:]) for token in row[3].split()}
        sequence = row[4]
        path1 = f"{embeddings1_path}/{protein_id}.npy"
        path2 = f"{embeddings2_path}/{protein_id}.npy"

        # Only keep proteins that have an embedding file in both spaces.
        if not (os.path.exists(path1) and os.path.exists(path2)):
            continue

        embedding1 = np.load(path1)
        embedding2 = np.load(path2)

        # Feature rows and embedding rows are matched purely by position after
        # concatenation, so a length mismatch would silently mislabel residues.
        for space_name, embedding in ((embeddings1_name, embedding1), (embeddings2_name, embedding2)):
            if embedding.shape[0] != len(sequence):
                raise ValueError(
                    f"{space_name} embedding for {protein_id} has {embedding.shape[0]} rows "
                    f"but the sequence has {len(sequence)} residues"
                )

        feature_data.extend(
            [residue, 'BINDING' if position in binding_positions else 'NON-BINDING']
            for position, residue in enumerate(sequence)
        )
        embeddings1.append(embedding1)
        embeddings2.append(embedding2)
        if ii > MAX_ROW_INDEX:
            break

if not embeddings1:
    raise ValueError(
        f"No protein in {DATA_PATH}/train.txt has embeddings in both "
        f"{embeddings1_path} and {embeddings2_path}"
    )

# Save each embedding space to a single .npy file.
embeddings1 = np.concatenate(embeddings1, axis=0)
embeddings2 = np.concatenate(embeddings2, axis=0)
# np.save does not create missing parent directories.
os.makedirs(os.path.dirname(concatenated_embeddings1_path), exist_ok=True)
os.makedirs(os.path.dirname(concatenated_embeddings2_path), exist_ok=True)
np.save(concatenated_embeddings1_path, embeddings1)
np.save(concatenated_embeddings2_path, embeddings2)

feature_data = pd.DataFrame.from_records(feature_data, columns=["amino acid", "binding_site"])

# Initiate the Emma object and load both embedding spaces from the saved files.
emma = Emma(feature_data=feature_data)
emma.add_emb_space(
    embeddings_source=concatenated_embeddings1_path,
    emb_space_name=embeddings1_name)
emma.add_emb_space(
    embeddings_source=concatenated_embeddings2_path,
    emb_space_name=embeddings2_name)

30537 samples loaded.
Categories in meta data: ['binding_site']
Numerical columns in meta data: []
Embedding space 'ESM1' added successfully.
Embeddings have 1280 features each.
Embedding space 'ESMC' added successfully.
Embeddings have 960 features each.


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

# Pairwise scatter matrix of the leading principal components of each
# embedding space, coloured by the binding-site label and saved as a PNG.
method = "PCA"
n_components = 15
output_dir = 'img'
# savefig does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

# The colour vector is identical for every panel and every embedding space,
# so compute it once instead of once per subplot.
binding_site_colors = (
    emma.metadata["binding_site"].astype('category').cat.codes
    if emma._check_column_is_categorical("binding_site")
    else emma.metadata["binding_site"]
)

for embeddings_name in [embeddings1_name, embeddings2_name]:
    emb_space = embeddings_name
    embeddings = emma.emb[emb_space]["emb"]

    # Fixed random_state keeps the projection reproducible when scikit-learn
    # selects its randomized SVD solver.
    pca = PCA(n_components=n_components, random_state=0)
    # Despite the name, this holds n_components columns (one per component).
    embeddings_2d = pca.fit_transform(embeddings)
    variance_explained = pca.explained_variance_ratio_

    fig, axes = plt.subplots(n_components, n_components, figsize=(50, 50))
    for i in range(n_components):
        for ii in range(n_components):
            ax = axes[i, ii]
            # Column index -> x axis, row index -> y axis, so the plotted data
            # agrees with the column titles / x-labels and row y-labels below.
            ax.scatter(
                embeddings_2d[:, ii],
                embeddings_2d[:, i],
                c=binding_site_colors,
                cmap='viridis',
                alpha=0.5,
                s=3
            )
            if i == 0:
                ax.set_title(f"PC{ii + 1}")
            if ii == 0:
                ax.set_ylabel(f"PC{i + 1}")
            if i == n_components - 1:
                ax.set_xlabel(f"PC{ii + 1}")
            ax.set_xticks([])
            ax.set_yticks([])
    fig.tight_layout()
    fig.savefig(f'{output_dir}/{emb_space}_{method}_{n_components}.png')
    # Close this specific figure so the large canvas is released before the
    # next embedding space is plotted.
    plt.close(fig)