In [1]:
%load_ext autoreload
%autoreload 2

import csv
import json
import openai
import os
import pandas as pd
import uuid

import torch

from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm_notebook
from transformers import AutoModel, AutoTokenizer

from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas()

pd.set_option('display.max_colwidth', None)

0it [00:00, ?it/s]

In [2]:
project_folder = "diygenomics-projects"
sub_category = "DATA"
work_bucket = "RSIDs"

input_file = 'Alzheimers RSIDs - consolidated 061823.csv'
output_file = 'truth_alzheimers_RSIDs_consolidated_061823.csv'

index_col = 'uuid'

openai_model = 'text-embedding-ada-002'

In [3]:
data_path = os.getenv('DATA_PATH')
file_path = lambda *args: os.path.join(data_path, project_folder, sub_category, work_bucket, *args)

In [4]:
base_df = pd.read_csv(file_path(input_file))

In [5]:
if os.path.exists(file_path(output_file)):
    df = pd.read_csv(file_path(output_file), index_col=index_col)
else:
    df = base_df
    df['uuid'] = [uuid.uuid4() for _ in range(len(df))]
    df.set_index('uuid', inplace=True)

In [6]:
len(df)

314

In [7]:
df = df[(~df['chr'].isna()) & (~df['gene'].isna()) & (~df['position'].isna()) & (~df['rsid'].isna())]

In [8]:
len(df)

276

In [9]:
def get_embedding(text):
    return model.encode(text).tolist()

In [10]:
def check_nan(value):
    if pd.isna(value):
        value = ''
    return value

def create_combined_datapoint_060323(row):
    value =  f"{row['chr']}|{row['position']}|{row['rsid']}|{check_nan(row['A1'])}|{check_nan(row['A2'])}"
    value += f"|{check_nan(row['AF-MAF'])}|{check_nan(row['OR-Z'])}|{check_nan(row['95% CI'])}"
    value += f"|{check_nan(row['stage'])}|{check_nan(row['protein'])}|{check_nan(row['probe'])}"
					
    return value

def create_combined_datapoint_061823(row):
    value =  f"{row['chr']}|{row['gene']}|{row['position']}|{row['rsid']}|{check_nan(row['AF-MAF'])}"
    value += f"|{check_nan(row['OR-Z'])}|{check_nan(row['95% CI'])}"
    value += f"|{check_nan(row['protein'])}|{check_nan(row['probe'])}"
					
    return value

In [11]:
df['combined_data'] = df.progress_apply(lambda row: create_combined_datapoint_061823(row), axis=1)

  0%|          | 0/276 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_data'] = df.progress_apply(lambda row: create_combined_datapoint_061823(row), axis=1)


In [12]:
df['openai_combined_data_embeddings'] = df['combined_data'].progress_apply(lambda x: openai.Embedding.create(input=x, engine=openai_model, temperature=0,)['data'][0]['embedding'])

  0%|          | 0/276 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['openai_combined_data_embeddings'] = df['combined_data'].progress_apply(lambda x: openai.Embedding.create(input=x, engine=openai_model, temperature=0,)['data'][0]['embedding'])


In [13]:
model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
df['codesearch_combined_data_embeddings'] = df['combined_data'].apply(get_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['codesearch_combined_data_embeddings'] = df['combined_data'].apply(get_embedding)


In [14]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
df['multi_qa_combined_data_embeddings'] = df['combined_data'].apply(get_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['multi_qa_combined_data_embeddings'] = df['combined_data'].apply(get_embedding)


In [15]:
model_name = 'witiko/mathberta'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at witiko/mathberta were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at witiko/mathberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def get_model_embeddings(text):
    if len(text) > 514:
        text = text[:514]
    tokenized_text = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    input_ids = torch.tensor([input_ids])
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state
    return embeddings.numpy().tolist()[0][0]

In [17]:
df['mathbert_combined_data_embeddings'] = df['combined_data'].apply(get_model_embeddings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mathbert_combined_data_embeddings'] = df['combined_data'].apply(get_model_embeddings)


In [29]:
def create_tooltip_061823(row):
    return f"Gene: {row['gene']}\nRSID: {row['rsid']}"

df['tooltip'] = df.progress_apply(lambda row: create_tooltip_061823(row), axis=1)

  0%|          | 0/276 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tooltip'] = df.progress_apply(lambda row: create_tooltip_061823(row), axis=1)


In [32]:
df.to_csv(file_path(output_file), quoting=csv.QUOTE_MINIMAL)

In [19]:
# sb.glue('status', 'completed')