In [1]:
import os

In [None]:
print(os.environ.get('HF_HUB_CACHE'))
os.environ['HF_HUB_CACHE'] = '~/my_huggingface_cache'
print(os.environ.get('HF_HUB_CACHE'))

/cache/huggingface/hub
~/my_huggingface_cache


In [3]:
import transformers
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

2025-03-24 20:43:33.104657: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742874213.120684  393938 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742874213.125528  393938 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-24 20:43:33.144785: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2', cache_dir="/home/shai/my_huggingface_cache")
model = GPT2LMHeadModel.from_pretrained('nferruz/ProtGPT2', cache_dir="/home/shai/my_huggingface_cache").to(device) 

In [19]:
def protgpt_wrapper(samples):
    ppls = []
    for seq in samples:
        out = tokenizer(seq, return_tensors="pt")
        input_ids = out.input_ids.cuda()

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)

        ppl = (outputs.loss * input_ids.shape[1]).item()
        ppls.append(ppl)
    
    ppls = np.array(ppls)
    return ppls

def extract_ll_distr(df, seq_label):
    sequences = df[seq_label]
    return -1 * protgpt_wrapper(sequences)

def extract_ll_directory(dir_name, seq_label):
    for fn in os.listdir(dir_name):
        file_path = os.path.join(dir_name, fn)
        if fn.lower().endswith(".csv"):
            try:
                df = pd.read_csv(file_path, nrows=0)  # Read only the header
                if 'loglikelihood' in df.columns: # Don't re-compute if already has loglikelihood
                    print(f"{fn} already processed - Skipping...")
                elif seq_label in df.columns:
                    df = pd.read_csv(file_path)
                    ll_distr = extract_ll_distr(df, seq_label)
                    df['loglikelihood'] = ll_distr
                    df.to_csv(file_path, index=False)
            except Exception as e: 
                pass

In [23]:
extract_ll_directory('data/baseline_data/distribution', 'seq')

original_new_10_0.5_0_results_merge.csv already processed - Skipping...
original_old_10_0.5_0_results_merge.csv already processed - Skipping...


In [24]:
extract_ll_directory('data/beam_data/distribution', 'seq')

original_old_10_0.5_0_results_merge_old_7JJK_beam_10_1.csv already processed - Skipping...
original_new_10_0.5_0_results_merge_new_7JJK_scrmsd_beam_5_5.csv already processed - Skipping...
original_new_10_0.5_0_results_merge_new_7JJK_scrmsd_beam_5_10.csv already processed - Skipping...
original_new_10_0.5_0_results_merge_beam_10_1.csv already processed - Skipping...


In [25]:
extract_ll_directory('data/bon_data/distribution', 'seq')

original_new_10_0.5_0_results_merge_bon_10.csv already processed - Skipping...
original_old_10_0.5_0_results_merge_old_7JJK_bon_10.csv already processed - Skipping...
original_new_10_0.5_0_results_merge_new_7JJK_scrmsd_bon_5.csv already processed - Skipping...
