In [None]:
# Attach Google Drive to the Colab filesystem so the HDF5 file can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U kaleido hdbscan umap-learn plotly



In [None]:
# Imports
import h5py
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from umap import UMAP
import hdbscan
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Read every top-level entry of the HDF5 file into memory, keeping the entry
# names and their arrays in two parallel lists.
h5_file_path = '/content/drive/MyDrive/gobi/deneme/umap.h5'

with h5py.File(h5_file_path, 'r') as h5_file:
    names_list = list(h5_file.keys())
    # `[:]` copies the data out of the file before it is closed
    embeddings_list = [h5_file[key][:] for key in names_list]


In [None]:
# Map each species to the taxonomic group used to colour the plot.
# NOTE: this cell reads `umap_df_2d` and its 'label' column, which are built
# in a cell further down the notebook; that cell must be run first.
group_map = {
    'Apis_mellifera': 'Hymenoptera (bee)',
    'Bombyx_mandarina': 'Lepidoptera',
    'Plutella_xylostella': 'Lepidoptera',
    'Galleria_mellonella': 'Lepidoptera',
    'Achroia_grisella': 'Lepidoptera',
    'Manduca_sexta': 'Lepidoptera',
    'Cydia_amplana': 'Lepidoptera',
    'Vanessa_cardui': 'Lepidoptera (butterfly)',
    'Maniola_jurtina': 'Lepidoptera (butterfly)',
    # Bombus impatiens is a bumblebee; it was previously grouped with the ants
    'Bombus_impatiens': 'Hymenoptera (bee)',
    'Monomorium_pharaonis': 'Hymenoptera (ant)',
    'Leptopilina_boulardi': 'Hymenoptera (wasp)',
    'Vespula_vulgaris': 'Hymenoptera (wasp)',
    'Cotesia_glomerata': 'Hymenoptera (wasp)',
    'Nilaparvata_lugens': 'Hemiptera',
    'Homalodisca_vitripennis': 'Hemiptera',
    'Cimex_lectularius': 'Hemiptera (bug)',
    'Halyomorpha_halys': 'Hemiptera (bug)',
    'Ischnura_elegans': 'Outgroup',
    'Planococcus_citri': 'Outgroup',
    # Every other key uses underscores, so accept both spellings of this one
    'Bombyx mori': 'Lepidoptera',
    'Bombyx_mori': 'Lepidoptera',
}

# The 'label' column already holds the species names. Species missing from
# the map get an explicit 'Unassigned' group instead of NaN, so they remain a
# visible category when the frame is coloured by 'Group'.
umap_df_2d['Group'] = umap_df_2d['label'].map(group_map).fillna('Unassigned')
import plotly.express as px

# 2D UMAP scatter: one colour per taxonomic group, with the species name
# drawn above each point. The update_* calls return the figure, so the
# styling is chained directly onto the scatter.
fig = (
    px.scatter(
        umap_df_2d,
        x='UMAP_1',
        y='UMAP_2',
        color='Group',
        text='label',
        title='🧬 UMAP of FibH Embeddings Colored by Taxonomic Group',
        width=1000,
        height=700,
    )
    .update_traces(textposition='top center')
    .update_layout(
        font={'size': 13},
        title_x=0.5,
        plot_bgcolor='rgba(240,240,240,0.95)',
    )
)

fig.show()



In [None]:
# Put the loaded embeddings in a DataFrame and attach the entry names
embeddings_df = pd.DataFrame(embeddings_list).assign(name=names_list)

# Strip the name column again and L2-normalise each row, ready for the
# cosine-metric UMAP below
embeddings = embeddings_df.drop(columns='name').to_numpy()
normalized_embeddings = normalize(embeddings, norm='l2')

In [None]:
# UMAP (2D + 3D) with cosine metric.
# The requested neighbourhood size (40) is capped at n_samples - 1, which is
# the value UMAP falls back to anyway when it warns "n_neighbors is larger
# than the dataset size"; capping it here keeps the effective value explicit.
capped_neighbors = max(2, min(40, normalized_embeddings.shape[0] - 1))
shared_umap_params = dict(
    n_neighbors=capped_neighbors,
    min_dist=0.1,
    metric='cosine',
    random_state=42,  # fixed seed for reproducible layouts
)

umap_2d = UMAP(n_components=2, **shared_umap_params)
umap_coords_2d = umap_2d.fit_transform(normalized_embeddings)

umap_3d = UMAP(n_components=3, **shared_umap_params)
umap_coords_3d = umap_3d.fit_transform(normalized_embeddings)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [None]:
# Wrap the 2D and 3D UMAP coordinates in DataFrames and carry the original
# entry names along as a 'name' column.
umap_df_2d = pd.DataFrame(
    umap_coords_2d, columns=['UMAP_1', 'UMAP_2']
).assign(name=embeddings_df['name'])

umap_df_3d = pd.DataFrame(
    umap_coords_3d, columns=['UMAP_1', 'UMAP_2', 'UMAP_3']
).assign(name=embeddings_df['name'])

# Safe splitting functions
def extract_label(name):
    """Return the label part of `name`: the text before the first '|'.

    Args:
        name: entry name as a string, e.g. 'label|gene'.

    Returns:
        The leading field, or 'Unknown' when that field is empty (an empty
        name, or a name that starts with '|').
    """
    # str.split never returns an empty list, so a length check can never
    # trigger the fallback; test the extracted field itself instead.
    label = name.partition('|')[0]
    return label if label else 'Unknown'

def extract_gene(name):
    """Return the second '|'-separated field of `name`.

    Returns 'Unknown' when `name` contains no '|' at all.
    """
    _, separator, remainder = name.partition('|')
    if not separator:
        return 'Unknown'
    # Only the field directly after the first '|' is wanted
    return remainder.partition('|')[0]

# Derive the label and gene columns from each entry name (3D frame, then 2D)
umap_df_3d['label'] = umap_df_3d['name'].map(extract_label)
umap_df_3d['Gene'] = umap_df_3d['name'].map(extract_gene)
umap_df_2d['label'] = umap_df_2d['name'].map(extract_label)
umap_df_2d['Gene'] = umap_df_2d['name'].map(extract_gene)

# Cluster the 2D UMAP coordinates with HDBSCAN and store the cluster ids as
# strings on the 2D frame
clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=3)
clusters = clusterer.fit_predict(umap_coords_2d)
umap_df_2d['cluster'] = clusters.astype(str)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [None]:
from umap import UMAP
from sklearn.preprocessing import normalize
import pandas as pd
import plotly.express as px

# L2-normalise the embedding rows (for cosine similarity)
embeddings = embeddings_df.drop(columns='name').to_numpy()
normalized_embeddings = normalize(embeddings, norm='l2')

# 2D UMAP with a small min_dist to pull clusters tighter
umap_2d = UMAP(
    random_state=42,
    init='spectral',
    metric='cosine',
    min_dist=0.01,
    n_neighbors=20,
    n_components=2,
)
umap_coords_2d = umap_2d.fit_transform(normalized_embeddings)

# Coordinates plus the original entry names in one frame
umap_df_2d = pd.DataFrame(
    umap_coords_2d, columns=['UMAP1', 'UMAP2']
).assign(name=embeddings_df['name'])

# Safe extract functions
def safe_extract_organism(x):
    """Return the organism part of `x`: the text before the first '|'.

    Args:
        x: entry name as a string, e.g. 'organism|gene'.

    Returns:
        The leading field, or 'Unknown' when that field is empty (an empty
        name, or a name that starts with '|').
    """
    # str.split never returns an empty list, so a length check can never
    # trigger the fallback; test the extracted field itself instead.
    organism = x.partition('|')[0]
    return organism if organism else 'Unknown'

def safe_extract_gene(x):
    """Return the second '|'-separated field of `x`.

    Returns 'Unknown' when `x` contains no '|' at all.
    """
    _, separator, remainder = x.partition('|')
    if not separator:
        return 'Unknown'
    # Only the field directly after the first '|' is wanted
    return remainder.partition('|')[0]

# Split each entry name into organism and gene, and build the combined
# legend label from the two
umap_df_2d['Organism'] = umap_df_2d['name'].map(safe_extract_organism)
umap_df_2d['Gene'] = umap_df_2d['name'].map(safe_extract_gene)
umap_df_2d['Label'] = umap_df_2d['Organism'] + ' | ' + umap_df_2d['Gene']

# Interactive 2D scatter, one colour per organism/gene label; the update_*
# calls return the figure, so the styling is chained
fig = (
    px.scatter(
        umap_df_2d,
        x='UMAP1',
        y='UMAP2',
        color='Label',
        hover_data=['Organism', 'Gene', 'name'],
        title='2D UMAP of Protein Embeddings (Interactive)',
        width=1000,
        height=600,
    )
    .update_traces(marker={'size': 8, 'line': {'width': 1, 'color': 'DarkSlateGrey'}})
    .update_layout(legend_title='Organism | Gene')
)
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
from umap import UMAP
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

# Normalize embeddings (for cosine similarity)
embeddings = embeddings_df.drop('name', axis=1).values
normalized_embeddings = normalize(embeddings, norm='l2')

# UMAP (2D)
umap_2d = UMAP(
    n_components=2,
    n_neighbors=20,
    min_dist=0.01,
    metric='cosine',
    init='spectral',
    random_state=42
)
umap_coords_2d = umap_2d.fit_transform(normalized_embeddings)

# Build DataFrame
umap_df_2d = pd.DataFrame(umap_coords_2d, columns=['UMAP1', 'UMAP2'])
umap_df_2d['name'] = embeddings_df['name']

# Split 'organism|gene' names. Indexing the split result with `.str[1]`
# yields NaN (instead of raising IndexError) for names without a '|'; those
# get the same 'Unknown' gene that the safe extract helpers return.
name_parts = umap_df_2d['name'].str.split('|')
umap_df_2d['Organism'] = name_parts.str[0]
umap_df_2d['Gene'] = name_parts.str[1].fillna('Unknown')
umap_df_2d['Label'] = umap_df_2d['Organism'] + ' | ' + umap_df_2d['Gene']

# --- KMeans on the 2D UMAP coordinates ---
n_clusters = 3  # You can change this
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
umap_df_2d['Cluster'] = kmeans.fit_predict(umap_df_2d.loc[:, ['UMAP1', 'UMAP2']])

# --- Scatter coloured by cluster id ---
# The ids are cast to strings so plotly treats them as discrete categories.
fig = (
    px.scatter(
        umap_df_2d,
        x='UMAP1',
        y='UMAP2',
        color=umap_df_2d['Cluster'].astype(str),
        hover_data=['Organism', 'Gene', 'name', 'Cluster'],
        title=f'2D UMAP + KMeans Clustering (k={n_clusters})',
        width=1200,
        height=800,
    )
    .update_traces(marker={'size': 8, 'line': {'width': 1, 'color': 'DarkSlateGrey'}})
    .update_layout(legend_title='Cluster ID')
)
fig.show()



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
from umap import UMAP
from sklearn.preprocessing import normalize
import pandas as pd
import plotly.express as px

# Normalize embeddings (for cosine similarity)
embeddings = embeddings_df.drop('name', axis=1).values
normalized_embeddings = normalize(embeddings, norm='l2')

# 3D UMAP
umap_3d = UMAP(
    n_components=3,
    n_neighbors=20,
    min_dist=0.01,
    metric='cosine',
    init='spectral',
    random_state=42
)
umap_coords_3d = umap_3d.fit_transform(normalized_embeddings)

# Build DataFrame
umap_df_3d = pd.DataFrame(umap_coords_3d, columns=['UMAP1', 'UMAP2', 'UMAP3'])
umap_df_3d['name'] = embeddings_df['name']

# Split 'organism|gene' names. Indexing the split result with `.str[1]`
# yields NaN (instead of raising IndexError) for names without a '|'; those
# get the same 'Unknown' gene that the safe extract helpers return.
name_parts_3d = umap_df_3d['name'].str.split('|')
umap_df_3d['Organism'] = name_parts_3d.str[0]
umap_df_3d['Gene'] = name_parts_3d.str[1].fillna('Unknown')
umap_df_3d['Label'] = umap_df_3d['Organism'] + ' | ' + umap_df_3d['Gene']

# Interactive 3D scatter of the UMAP coordinates, one colour per
# organism/gene label; the update_* calls return the figure, so the styling
# is chained
fig = (
    px.scatter_3d(
        umap_df_3d,
        x='UMAP1',
        y='UMAP2',
        z='UMAP3',
        color='Label',
        hover_data=['Organism', 'Gene', 'name'],
        title='3D UMAP of Protein Embeddings (Interactive)',
        width=1000,
        height=700,
    )
    .update_traces(marker={'size': 4, 'line': {'width': 0.5, 'color': 'DarkSlateGrey'}})
    .update_layout(legend_title='Organism | Gene')
)
fig.show()



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
# `umap_df_3d` exists in two schemas in this notebook: the first 3D cell uses
# 'UMAP_1'/'UMAP_2'/'UMAP_3' + 'label', while the later 3D cell rebuilds it
# with 'UMAP1'/'UMAP2'/'UMAP3' + 'Organism'. Run top to bottom, the later
# schema is the live one, so the old column names raised an error here.
# Normalise to the later schema; rename() ignores keys that are absent, so
# the plot works with either version of the frame.
wide_view_df = umap_df_3d.rename(columns={
    'UMAP_1': 'UMAP1',
    'UMAP_2': 'UMAP2',
    'UMAP_3': 'UMAP3',
    'label': 'Organism',
})

fig_3d = px.scatter_3d(
    wide_view_df,
    x='UMAP1',
    y='UMAP2',
    z='UMAP3',
    color='Organism',      # Organism = color
    symbol='Gene',         # Gene = shape
    hover_data=['name', 'Organism', 'Gene'],
    title='3D UMAP of Protein Embeddings (WIDE View)'
)

# Make markers larger and add borders
fig_3d.update_traces(marker=dict(size=5, line=dict(width=1, color='black')))

# Wider, larger 3D plot
fig_3d.update_layout(
    width=1200,
    height=800,
    legend_title_text='Organism',
    margin=dict(l=0, r=0, b=0, t=40),
    title_x=0.5,
    scene=dict(
        # Hide tick labels and axis titles on all three axes
        xaxis=dict(showticklabels=False, title=''),
        yaxis=dict(showticklabels=False, title=''),
        zaxis=dict(showticklabels=False, title=''),
        camera=dict(
            eye=dict(x=1.2, y=2.2, z=0.8)  # Adjust camera for a nice angled view
        )
    )
)

fig_3d.show()