In [9]:
# Step 1: Import the required libraries
import hashlib
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Step 2: Mount Google Drive so the dataset files under /content/drive can be read.
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Step 3: Read the four input tables from the mounted Drive.
# The two benchmark files are tab-separated; the two helper embedding files
# are read with pandas' default (comma) separator.
ground_truth = pd.read_csv(
    '/content/drive/MyDrive/zenodo_directory/zenodo_directory/data/benchmark_datasets/opentargets_step2.labels',
    sep='\t',
)
gene_embedding = pd.read_csv(
    '/content/drive/MyDrive/zenodo_directory/zenodo_directory/data/helper_datasets/gene_embeddings.csv'
)
phenotype = pd.read_csv(
    '/content/drive/MyDrive/zenodo_directory/zenodo_directory/data/benchmark_datasets/opentargets_step2.for_llm.tsv',
    sep='\t',
)
phenotype_embedding = pd.read_csv(
    '/content/drive/MyDrive/zenodo_directory/zenodo_directory/data/helper_datasets/phenotype_embeddings.csv'
)

# Step 4: Store each table's index labels in a 'row_number' column for the
# three tables other than `phenotype`, so they can later be joined on it.
for table in (ground_truth, gene_embedding, phenotype_embedding):
    table['row_number'] = table.index

# Print the column names of each DataFrame (same order and headings as before).
for heading, table in (
    ("Ground Truth Columns:\n", ground_truth),
    ("Gene Embeddings Columns:\n", gene_embedding),
    ("Phenotype Embeddings Columns:\n", phenotype_embedding),
    ("Phenotype Gene Columns:\n", phenotype),
):
    print(heading, table.columns)

Ground Truth Columns:
 Index(['symbol', 'gene', 'row_number'], dtype='object')
Gene Embeddings Columns:
 Index(['Unnamed: 0', '0', 'gpt_description', 'embedding', 'row_number'], dtype='object')
Phenotype Embeddings Columns:
 Index(['Unnamed: 0', '0', 'gpt_description', 'embedding', 'row_number'], dtype='object')
Phenotype Gene Columns:
 Index(['row_number', 'description', 'symbol_gene_string',
       'ensembl_gene_string'],
      dtype='object')


In [18]:
# Steps 5-6: Build one wide table by left-joining everything onto the
# phenotype rows, always keyed on 'row_number':
#   1. ground-truth labels;
#   2. gene embeddings      -> the column arrives as 'embedding' (per the
#      printed column lists there is no existing 'embedding' column at this
#      point, so the suffix pair is not applied);
#   3. phenotype embeddings -> clashes with the existing 'embedding', so the
#      incoming column is renamed 'embedding_phenotype'.
# NOTE(review): joining on 'row_number' presumes the label and embedding
# tables are row-aligned with the phenotype table -- confirm.
merged_df = (
    phenotype
    .merge(ground_truth, how='left', on='row_number')
    .merge(
        gene_embedding[['row_number', 'embedding']],
        how='left',
        on='row_number',
        suffixes=('', '_gene'),
    )
    .merge(
        phenotype_embedding[['row_number', 'embedding']],
        how='left',
        on='row_number',
        suffixes=('', '_phenotype'),
    )
)

# Step 7: Clean and prepare the embeddings columns
def process_embedding(embedding_str):
    """Parse a serialized embedding such as ``"[0.1, -0.2, 0.3]"`` into a float array.

    Parameters
    ----------
    embedding_str : object
        Expected to be a string of comma-separated numbers, optionally wrapped
        in square brackets and surrounded by whitespace. Any non-string value
        (for example the NaN pandas uses for a missing cell) is treated as
        "no embedding".

    Returns
    -------
    numpy.ndarray or float
        A 1-D float array with one entry per component, or ``np.nan`` when the
        input is not a string or contains no components (``""``, ``"[]"``).
        Returning NaN lets a later ``dropna`` discard those rows.

    Raises
    ------
    ValueError
        If a component cannot be converted to ``float`` (e.g. ``"[1, x]"`` or
        a stray trailing comma); malformed data is not silently skipped.
    """
    if not isinstance(embedding_str, str):
        return np.nan

    # Remove outer whitespace first so the bracket strip actually reaches the
    # brackets, then check whether anything is left to parse.
    body = embedding_str.strip().strip('[]').strip()
    if not body:
        return np.nan

    # float() tolerates whitespace around each component.
    return np.array([float(component) for component in body.split(',')])

# Apply processing to the correct columns.
# With the suffixes used in the merges, the serialized gene vectors are in
# 'embedding' and the serialized phenotype vectors are in
# 'embedding_phenotype'. Each side must be parsed from its own column:
# parsing 'embedding' for both compares every vector with itself, which makes
# every cosine similarity trivially ~1.0.
merged_df['embedding_gene'] = merged_df['embedding'].apply(process_embedding)
merged_df['embedding_phenotype'] = merged_df['embedding_phenotype'].apply(process_embedding)

# Step 8: Cosine similarity of Phenotype and Gene embeddings
# Rows where either side has no parsed embedding are dropped first.
merged_df = merged_df.dropna(subset=['embedding_gene', 'embedding_phenotype'])
# cosine_similarity works on 2-D inputs, so each vector is reshaped to a
# single-row matrix and the lone entry of the 1x1 result is taken.
# The list is built in row order, so positional assignment keeps alignment.
merged_df['cosine_similarity'] = [
    cosine_similarity(gene_vec.reshape(1, -1), phenotype_vec.reshape(1, -1))[0][0]
    for gene_vec, phenotype_vec in zip(
        merged_df['embedding_gene'], merged_df['embedding_phenotype']
    )
]

# Step 9: Dataframe to include necessary columns
result_df = merged_df[['row_number', 'description', 'symbol', 'cosine_similarity']]

In [19]:
# Step 10: Turn the author's name into a deterministic seed.
# The name is lower-cased with spaces removed, hashed with SHA-256, and the
# hex digest (read as a base-16 integer) is reduced modulo 2**32.
name = "Saurya Kumar Gupta"
hashed_name = hashlib.sha256(
    name.lower().replace(" ", "").encode()
).hexdigest()
seed = int(hashed_name, base=16) % 2**32

# Step 11: Draw 500 rows of result_df reproducibly with that seed.
# The global NumPy generator is seeded as well; the draw itself receives the
# seed explicitly through random_state.
np.random.seed(seed)
sample = result_df.sample(random_state=seed, n=500)

In [20]:
# Step 12: Save the sampled dataset next to the benchmark files.
# DataFrame.to_csv writes comma-separated text, so the file gets a .csv
# extension (it was previously written under the name 'sample.xlsx', which
# spreadsheet tools would not open as a workbook).
# NOTE(review): anything that reads the old 'sample.xlsx' path must be
# pointed at 'sample.csv'.
# The filename is defined once so the confirmation message cannot drift
# from the path actually written.
sample_filename = 'sample.csv'
sample_path = f'/content/drive/MyDrive/zenodo_directory/zenodo_directory/data/benchmark_datasets/{sample_filename}'
sample.to_csv(sample_path, index=False)  # index=False: do not write the index as a column
print(f"Unique dataset created and saved as '{sample_filename}'.")

# Printing the hash value
print(f"Hash value of dataset seed is: {hashed_name}")

Unique dataset created and saved as 'unique_phenotype_sample.csv'.
Hash value of dataset seed is: 41021185ee9be2c95fe2de45f0614dd161a3c4d7eb10f7a1892678f3d6fdc30f
