In [None]:
!pip install h5py pandas matplotlib matplotlib-venn scikit-learn

In [1]:
import h5py
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from matplotlib_venn import venn3

ModuleNotFoundError: No module named 'h5py'

In [None]:
def read_json_dataset(path):
    """Load a JSON-lines file into a dict keyed by each record's "ID" field.

    Every non-blank line of the file is parsed as one JSON object. If the same
    ID occurs on several lines, the record from the last such line wins.

    Args:
        path: Location of the JSON-lines file.

    Returns:
        dict mapping ``record["ID"]`` to the full parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "ID" field.
    """
    dataset = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at the end of the file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            record = json.loads(line)
            dataset[record["ID"]] = record
    return dataset

def print_prot(json_dataset, csv_dataset, prot_id):
    """Print what the JSON and/or CSV dataset holds for one protein.

    Args:
        json_dataset: dict of records keyed by protein ID, or None to skip
            the JSON part.
        csv_dataset: table with an 'ID' column that supports boolean-mask
            indexing and ``.iloc`` (used as a pandas DataFrame), or None to
            skip the CSV part.
        prot_id: ID of the protein to print.

    Raises:
        KeyError: If ``prot_id`` or one of the printed fields is missing from
            ``json_dataset``.
    """
    # Fields of the JSON record, in the order they are printed.
    json_fields = (
        'entryID',
        'stID',
        'entity_assemID',
        'entityID',
        'exp_method',
        'exp_method_subtype',
        'citation_DOI',
        'citation_title',
        'ionic_strength',
        'pH',
        'temperature',
        'off_C',
        'off_CA',
        'off_CB',
        'off_H',
        'off_HA',
        'off_HB',
        'off_N',
        'bbshift_positions_post',
        'bbshift_types_post',
        'total_bbshifts',
    )

    print("ID: ", prot_id)

    if json_dataset is not None:
        record = json_dataset[prot_id]
        for field in json_fields:
            print(field + ": ", record[field])

    # The remaining data is shown from the CSV copy: pandas has already parsed
    # it, so the matching row(s) are printed from column position 6 onward.
    if csv_dataset is not None:
        matching_rows = csv_dataset[csv_dataset['ID'] == prot_id]
        print(matching_rows.iloc[:, 6:])

In [None]:
# Open unfiltered_all_prott5.h5 read-only ("r"). The handle is left open
# because the following cells index into it.
prott5_embs = h5py.File("data/disorder/embeddings/unfiltered_all_prott5.h5", "r")

In [None]:
# Peek at index 1 of the entry stored under ID 10006_1_1_1.
# NOTE(review): if this entry is a single 1-D vector (as the ESM-2 entry shown
# near the end of the notebook is), this returns one scalar - confirm intent.
prott5_embs['10006_1_1_1'][1]

In [None]:
# Open the companion "_res" ProtT5 file read-only.
# NOTE(review): "_res" presumably means per-residue embeddings - confirm.
prott5_embs_res = h5py.File("data/disorder/embeddings/unfiltered_all_prott5_res.h5", "r")

In [None]:
# Display the object stored under ID 10001_1_1_1 in the "_res" file (the value
# is not read; only its summary repr is shown).
prott5_embs_res['10001_1_1_1']

In [None]:
# Number of entries in the ProtT5 embedding file (one key per ID in the dataset)
len(prott5_embs)

In [None]:
# Open prostt5.h5 read-only ("r"); the handle stays open for later cells.
prostt5 = h5py.File("data/disorder/embeddings/prostt5.h5", "r")

In [None]:
# Display the repr of the open file handle (sanity check that it opened).
prostt5

In [None]:
# Preview the first few keys only. Displaying the full key view would print
# every ID in the file (one per entry of the unfiltered dataset) into the output.
list(prott5_embs)[:10]

In [None]:
# Display the object stored under ID 10005_1_1_1 (summary repr only).
prott5_embs['10005_1_1_1']

In [None]:
# The JSON files and the CSV file carry the same data in slightly different
# layouts. Each JSON split is loaded into a dict keyed by record ID.
strict = read_json_dataset("data/disorder/strict.json")
moderate = read_json_dataset("data/disorder/moderate.json")
tolerant = read_json_dataset("data/disorder/tolerant.json")
unfiltered = read_json_dataset("data/disorder/unfiltered.json")

# Tabular copy of the unfiltered records, parsed by pandas.
unfiltered_csv = pd.read_csv("data/disorder/unfiltered.csv")

In [None]:
# Show the first record of the strict split. Iterating a dict yields its keys in
# insertion order, so this needs neither `strict_ids` (which is only defined in
# a later cell) nor indexing into a key view (key views do not support [0]).
strict[next(iter(strict))]

In [None]:
# Build the set of IDs of each split. Real `set` objects (rather than live
# dict key views) are independent snapshots and are the input type that
# matplotlib-venn documents for venn3; `in` and len() keep working unchanged.
unfiltered_ids = set(unfiltered)
moderate_ids = set(moderate)
tolerant_ids = set(tolerant)
strict_ids = set(strict)

In [None]:
# Entry counts per split, in the order strict, moderate, tolerant, unfiltered
tuple(len(ids) for ids in (strict_ids, moderate_ids, tolerant_ids, unfiltered_ids))

In [None]:
# Preview a few strict IDs instead of dumping the whole collection into the
# output; sorting makes the preview the same on every run.
sorted(strict_ids)[:10]

### The datasets are nested subsets of one another

strict ⊂ moderate ⊂ tolerant ⊂ unfiltered

Number of entries per dataset:
1. Strict: 1910
2. Moderate: 9807
3. Tolerant: 13943
4. Unfiltered: 15320

In [None]:
# Venn diagram of the moderate, tolerant and strict ID collections
venn3(
    subsets=[moderate_ids, tolerant_ids, strict_ids],
    set_labels=('moderate', 'tolerant', 'strict'),
);

In [None]:
# Venn diagram of the tolerant, moderate and unfiltered ID collections
venn3(
    subsets=[tolerant_ids, moderate_ids, unfiltered_ids],
    set_labels=('tolerant', 'moderate', 'unfiltered'),
);

### All data is contained in unfiltered.json; entries shared between subdatasets are identical
The subdatasets only define which proteins belong to each split.

In [None]:
# Consistency check between the splits.
# 1) Nesting: the IDs of each split must be contained in the next larger one.
#    Dict key views support subset comparison with <=.
splits_are_nested = strict.keys() <= moderate.keys() <= tolerant.keys() <= unfiltered.keys()
print("splits nested (strict <= moderate <= tolerant <= unfiltered):", splits_are_nested)

# 2) Content: every record of every split must equal the record stored under
#    the same ID in unfiltered. Comparing each split against unfiltered also
#    implies the splits agree with one another. Mismatching entries are printed
#    together with the split they were found in.
for split_name, split in (("strict", strict), ("moderate", moderate), ("tolerant", tolerant)):
    for entry, record in split.items():
        # .get() so that an ID missing from unfiltered is reported as a
        # mismatch instead of aborting the check with a KeyError.
        if unfiltered.get(entry) != record:
            print(split_name, entry)

In [None]:
# ID of the example protein that is printed in detail in the next cell.
prot_id = '36025_1_1_1'


In [None]:
# Print the JSON fields and the CSV columns of the example protein.
print_prot(json_dataset=unfiltered, csv_dataset=unfiltered_csv, prot_id=prot_id)

In [None]:
# NOTE(review): prostt5.h5 was already opened in an earlier cell; this opens a
# second handle to the same file and rebinds `prostt5` to it.
prostt5 = h5py.File("data/disorder/embeddings/prostt5.h5", "r")
# NOTE(review): np.array() is applied to the file handle itself rather than to
# a dataset inside it, so this presumably does not yield the embedding values -
# confirm what was intended. `prostt5_np` is not read anywhere else in the
# visible notebook.
prostt5_np = np.array(prostt5)


In [None]:
def mean_pool_embeddings(h5_path, keep_ids):
    """Collect one fixed-size vector per kept key from an HDF5 embedding file.

    Entries whose key is not in ``keep_ids`` are skipped. An entry with more
    than one dimension is averaged over axis 0; a 1-D entry is kept as is.

    Args:
        h5_path: Path of the HDF5 file to read.
        keep_ids: Container of keys to keep (anything supporting ``in``).

    Returns:
        Tuple ``(kept_keys, vectors)``: the kept keys in file order, and a
        numpy array with one row per kept key.

    Raises:
        ValueError: If no key of the file is contained in ``keep_ids``.
    """
    kept_keys = []  # keeps track of which key each row belongs to
    vectors = []
    # The context manager closes the file once the data has been copied out,
    # so re-running the cell does not accumulate open handles.
    with h5py.File(h5_path, "r") as store:
        for key in store.keys():
            if key not in keep_ids:
                continue
            embedding = np.array(store[key])
            # Average along axis 0 (the amino-acid axis of a per-residue
            # matrix). A 1-D entry is already a single vector for the whole
            # protein - the ESM-2 entry inspected at the end of this notebook
            # has shape (2560,) - and averaging it would collapse it to one
            # scalar, which t-SNE cannot use.
            if embedding.ndim > 1:
                embedding = embedding.mean(axis=0)
            vectors.append(embedding)
            kept_keys.append(key)
    if not kept_keys:
        raise ValueError(f"no entry of {h5_path!r} matches the requested IDs")
    return kept_keys, np.array(vectors)


def plot_tsne(vectors, title, random_state=42):
    """Project ``vectors`` to 2-D with t-SNE and draw them as a scatter plot.

    Args:
        vectors: 2-D array with one row per sample.
        title: Title of the figure.
        random_state: Seed passed to TSNE so the layout is reproducible.

    Returns:
        Array of shape (n_samples, 2) with the t-SNE coordinates.
    """
    coords = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(coords[:, 0], coords[:, 1])
    ax.set_title(title)
    ax.set_xlabel("TSNE-1")
    ax.set_ylabel("TSNE-2")
    plt.show()
    return coords


# ProstT5 embeddings of the strict split
prostt5_ids, prostt5_mean = mean_pool_embeddings(
    "data/disorder/embeddings/prostt5.h5", strict_ids
)
prostt5_tsne = plot_tsne(prostt5_mean, "t-SNE of mean-pooled ProstT5 embeddings (strict split)")

# ProtT5 embeddings of the strict split.
# NOTE(review): unfiltered_all_prott5.h5 looks like the per-protein counterpart
# of the "_res" file; if so its entries are already 1-D and are used as is.
prott5_ids, prott5_mean = mean_pool_embeddings(
    "data/disorder/embeddings/unfiltered_all_prott5.h5", strict_ids
)
prott5_tsne = plot_tsne(prott5_mean, "t-SNE of ProtT5 embeddings (strict split)")

In [None]:
# Display the full JSON record stored under ID 10119_1_1_1 in the strict split.
strict['10119_1_1_1']

In [None]:
# print_prot requires the JSON dataset, the CSV dataset and a protein ID;
# calling it without arguments raises TypeError. Print the protein that was
# looked up in the strict split in the previous cell.
print_prot(strict, unfiltered_csv, '10119_1_1_1')

In [4]:
# Open unfiltered_all_esm2_3b.h5 read-only ("r").
esm2 = h5py.File("data/disorder/embeddings/unfiltered_all_esm2_3b.h5", "r")

In [6]:
# Inspect one entry. The recorded output reports shape (2560,) and type "<f4",
# i.e. a single 1-D vector stored under this ID.
esm2['10006_1_1_1']

<HDF5 dataset "10006_1_1_1": shape (2560,), type "<f4">

In [None]:
# Open the companion "_res" ESM-2 file read-only.
# NOTE(review): "_res" presumably means per-residue embeddings - confirm.
esm2_res = h5py.File("data/disorder/embeddings/unfiltered_all_esm2_3b_res.h5", "r")

In [None]:
# Display the object stored under ID 10006_1_1_1 in the "_res" file, for
# comparison with the entry of the same ID in the non-"_res" file.
esm2_res['10006_1_1_1']