# GPN-MSA Testing with Datasets

## Setup environment 

In [None]:
# only need once for GPN-MSA use
# !pip install git+https://github.com/songlab-cal/gpn.git

In [None]:
# !ldconfig /usr/lib64-nvidia

In [1]:
from gpn.data import GenomeMSA, Tokenizer
import gpn.model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import torch
from transformers import AutoModel, AutoModelForMaskedLM



In [2]:
from datasets import load_dataset, disable_caching
from gpn.data import load_dataset_from_file_or_dir

In [3]:
dataset_prefix = "/expanse/lustre/projects/nji102/sgriesmer/gpn/Datasets/"

In [None]:
input_file =  dataset_prefix + "test.parquet"
dataset = load_dataset_from_file_or_dir(input_file, split="test", is_file=True)

## Create datasets

### Create Pathogenic dataset (only needed once)

In [None]:
dataset_pathogenic = dataset.filter(lambda v: v["source"]=="ClinVar" or (v["label"]=="Common" and "missense" in v["consequence"]))
dataset_pathogenic.shape

### Create Cancer dataset (only needed once)

In [None]:
dataset_cancer = dataset.filter(lambda v: v["source"]=="COSMIC" or (v["label"]=="Common" and "missense" in v["consequence"]))
dataset_cancer.shape

### Create Regulatory dataset (only needed once)

In [None]:
cs = ["5_prime_UTR", "upstream_gene", "intergenic", "3_prime_UTR", "non_coding_transcript_exon"]
dataset_regulatory = dataset.filter(lambda v: v["source"]=="OMIM" or (v["label"]=="Common" and "missense" not in v["consequence"] and any([c in v["consequence"] for c in cs])))
dataset_regulatory.shape

## Save datasets as parquet files

In [None]:
dataset_pathogenic_filename = dataset_prefix + "pathogenic.parquet"
dataset_pathogenic.to_parquet(dataset_pathogenic_filename)

In [None]:
dataset_cancer_filename = dataset_prefix + "cancer.parquet"
dataset_cancer.to_parquet(dataset_cancer_filename)

In [None]:
dataset_regulatory_filename = dataset_prefix + "regulatory.parquet"
dataset_regulatory.to_parquet(dataset_regulatory_filename)

# Test and Score with GPN-MSA

In [22]:
part = ""
dataset_pathogenic_filename = dataset_prefix + "PAT_dataset_XY-named-hg38" + part +".vcf"
dataset_pathogenic_df = pd.read_csv(dataset_pathogenic_filename, sep='\t')
dataset_pathogenic_filename_parquet = dataset_prefix + "PAT_dataset_XY-named-hg38.parquet"
dataset_pathogenic_df.to_parquet(dataset_pathogenic_filename_parquet)
dataset_pathogenic = load_dataset_from_file_or_dir(dataset_pathogenic_filename_parquet, split="test", is_file=True)
dataset_pathogenic.shape

(11436, 5)

In [23]:
dataset_pathogenic.features

{'chrom': Value(dtype='string', id=None),
 'pos': Value(dtype='int64', id=None),
 'name': Value(dtype='string', id=None),
 'ref': Value(dtype='string', id=None),
 'alt': Value(dtype='string', id=None)}

## Load Dataset to be Scored (if previously created)

In [None]:
#dataset_pathogenic_filename = dataset_prefix + "pathogenic.parquet"
#dataset_pathogenic = load_dataset_from_file_or_dir(dataset_pathogenic_filename, split="test", is_file=True)
#dataset_pathogenic.shape

## Load MSA data

In [26]:
msa_path = "zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip"
genome_msa = GenomeMSA(msa_path)  # can take a minute or two

Loading MSA...
Loading MSA... Done


## Load inference model

In [27]:
from gpn.msa.vep import VEPInference

model_path = "songlab/gpn-msa-sapiens"
window_size = 128
vep_inf = VEPInference(model_path, genome_msa, window_size, disable_aux_features=False)

## Pathogenic Dataset

## Subset Dataset to be Scored

In [70]:
set_start = 1000
set_end = 1300
dataset_pathogenic_set = dataset_pathogenic.select(range(set_start, set_end))
df_pathogenic_set = pd.DataFrame(dataset_pathogenic_set)
df_pathogenic_set

Unnamed: 0,chrom,pos,name,ref,alt
0,10,79946452,SNP_PAT_01001,C,T
1,10,80272484,SNP_PAT_01002,A,G
2,10,80272575,SNP_PAT_01003,C,T
3,10,80272799,SNP_PAT_01004,T,A
4,10,80273714,SNP_PAT_01005,G,A
...,...,...,...,...,...
295,11,6610130,SNP_PAT_01296,C,T
296,11,6612918,SNP_PAT_01297,C,T
297,11,6612958,SNP_PAT_01298,G,A
298,11,6613428,SNP_PAT_01299,C,G


## Tokenize Dataset

In [71]:
dataset_pathogenic_set.set_transform(vep_inf.tokenize_function)

## Set Arguments for Testing

In [72]:
from transformers import Trainer, TrainingArguments

output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

batch_size = 100
training_args = TrainingArguments(
  output_dir=output_dir,
  per_device_eval_batch_size=batch_size,
  dataloader_num_workers=0,
  remove_unused_columns=False,
#  torch_compile=True,
#  fp16=True,
)

In [73]:
trainer = Trainer(
    model=vep_inf.model,
    args=training_args
)

## Test and Score Dataset Subset

In [74]:
# for debugging purposes
!TORCH_LOGS="+dynamo"
!TORCHDYNAMO_VERBOSE=1

In [75]:
pred=trainer.predict(test_dataset=dataset_pathogenic_set).predictions

## Add Score to Dataframe

In [76]:
gpn_score = vep_inf.postprocess(pred)
df_pathogenic_set["gpn_score"] = gpn_score
df_pathogenic_set.head()

Unnamed: 0,chrom,pos,name,ref,alt,gpn_score
0,10,79946452,SNP_PAT_01001,C,T,-1.06565
1,10,80272484,SNP_PAT_01002,A,G,-1.708707
2,10,80272575,SNP_PAT_01003,C,T,-0.865141
3,10,80272799,SNP_PAT_01004,T,A,1.565545
4,10,80273714,SNP_PAT_01005,G,A,-0.195607


## Plot shows differentiation between Common and Pathogenic mutations

In [56]:
sns.histplot(data=df_pathogenic_set, x="gpn_score", hue="label")

ValueError: Could not interpret value `label` for parameter `hue`

## Save dataframe as CSV

In [None]:
output_file = output_dir + "/pathogenic_set_hg38_part_" + part + "_" + str(set_start) + "_" + str(set_end) + ".csv"
df_pathogenic_set.to_csv(output_file, index=False, sep=',')

## Load dataframe from CSV (if needed)

In [None]:
output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

input_csv_file = output_dir + "/pathogenic_set_0_39652.csv"
df_pathogenic_set = pd.read_csv(input_csv_file, sep=',')

In [None]:
sns.histplot(data=df_pathogenic_set, x="gpn_score", hue="label")

## Calculate metrics

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
AUROC = roc_auc_score(df_pathogenic_set.label=="Pathogenic", -df_pathogenic_set.gpn_score)
AUPRC = average_precision_score(df_pathogenic_set.label=="Pathogenic", -df_pathogenic_set.gpn_score)
AUROC, AUPRC

## Plot ROC curve

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(df_pathogenic_set.label=="Pathogenic", -df_pathogenic_set.gpn_score)

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'ROC curve (AUC = {AUROC:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

## Cancer Dataset

## Load Dataset to be Scored (if previously created)

In [None]:
#dataset_cancer_filename = dataset_prefix + "cancer.parquet"
#dataset_cancer = load_dataset_from_file_or_dir(dataset_cancer_filename, split="test", is_file=True)
#dataset_cancer.shape

In [None]:
part = "apaaabaaaaadac"
dataset_cancer_filename = dataset_prefix + "CAN_dataset_XY-named-hg38-part-" + part +".vcf"
dataset_cancer = load_dataset_from_file_or_dir(dataset_cancer_filename, split="test", is_file=True)
dataset_cancer.shape

In [None]:
dataset_cancer.features

In [None]:
dataset_cancer[0]

## Load MSA data

In [5]:
msa_path = "zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip"
genome_msa = GenomeMSA(msa_path)  # can take a minute or two

Loading MSA...
Loading MSA... Done


## Load inference model

In [None]:
from gpn.msa.vep import VEPInference

model_path = "songlab/gpn-msa-sapiens"
window_size = 128
vep_inf = VEPInference(model_path, genome_msa, window_size, disable_aux_features=False)

## Subset Dataset to be Scored

In [None]:
set_start = 0
set_end = 2
dataset_cancer_set = dataset_cancer.select(range(set_start, set_end))
df_cancer_set = pd.DataFrame(dataset_cancer_set)
df_cancer_set

## Tokenize Dataset

In [None]:
dataset_cancer_set.set_transform(vep_inf.tokenize_function)

## Set Arguments for Testing

In [None]:
from transformers import Trainer, TrainingArguments

output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

batch_size = 100
training_args = TrainingArguments(
  output_dir=output_dir,
  per_device_eval_batch_size=batch_size,
  dataloader_num_workers=0,
  remove_unused_columns=False,
#  torch_compile=True,
#  fp16=True,
)

In [None]:
trainer = Trainer(
    model=vep_inf.model,
    args=training_args
)

## Test and Score Dataset Subset

In [None]:
# for debugging purposes
!TORCH_LOGS="+dynamo"
!TORCHDYNAMO_VERBOSE=1

In [None]:
pred=trainer.predict(test_dataset=dataset_cancer_set).predictions

## Add Score to Dataframe

In [None]:
gpn_score = vep_inf.postprocess(pred)
df_cancer_set["gpn_score"] = gpn_score
df_cancer_set

In [None]:
df_cancer_set.drop([1], axis=0, inplace=True)
df_cancer_set

## Plot shows differentiation between Common and COSMIC/Frequent mutations

In [None]:
#sns.histplot(data=df_cancer_set, x="gpn_score", hue="label")

## Save dataframe as CSV

In [None]:
output_file = output_dir + "/cancer_set_hg38_part_" + part + "_" + str(set_start) + "_" + str(set_end) + ".csv"
df_cancer_set.to_csv(output_file, index=False, sep=',')

## Calculate metrics

In [None]:
df_cancer_set[17585:17587]

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
AUROC = roc_auc_score(df_cancer_set.label=="Frequent", -gpn_score)
AUPRC = average_precision_score(df_cancer_set.label=="Frequent", -gpn_score)
AUROC, AUPRC

## Plot ROC curve

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(df_cancer_set.label=="Frequent", -gpn_score)

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'ROC curve (AUC = {AUROC:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
## Regulatory Dataset

## Load MSA data

In [None]:
msa_path = "zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip"
genome_msa = GenomeMSA(msa_path)  # can take a minute or two

## Load inference model

In [None]:
from gpn.msa.vep import VEPInference

model_path = "songlab/gpn-msa-sapiens"
window_size = 128
vep_inf = VEPInference(model_path, genome_msa, window_size, disable_aux_features=False)

## Subset Dataset to be Scored

In [None]:
output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"
results_file = output_dir + "/100-REG-results.csv"

f = open(results_file, 'a') 

for part in [ 
  "adafabad"
  ]:

  dataset_regulatory_filename = dataset_prefix + "REG_dataset_XY-named-trunc-hg38-part-" + part +".vcf"
  dataset_regulatory = load_dataset_from_file_or_dir(dataset_regulatory_filename, split="test", is_file=True)

  set_start = 0
  set_end = len(dataset_regulatory)
  dataset_regulatory_set = dataset_regulatory.select(range(set_start, set_end))
  df_regulatory_set = pd.DataFrame(dataset_regulatory_set)
  df_regulatory_set

# tokenize dataset

  dataset_regulatory_set.set_transform(vep_inf.tokenize_function)

# set arguments for testing

  from transformers import Trainer, TrainingArguments

  output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

  batch_size = 500
  training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=0,
    remove_unused_columns=False,
#   torch_compile=True,
#   fp16=True,
)

  trainer = Trainer(
    model=vep_inf.model,
    args=training_args
  )

# Test and score dataset subset

# for debugging purposes
  !TORCH_LOGS="+dynamo"
  !TORCHDYNAMO_VERBOSE=1

  try:
      pred=trainer.predict(test_dataset=dataset_regulatory_set).predictions
      print(part, " - yes")
      #f.write(part + ",yes")
      
      gpn_score = vep_inf.postprocess(pred)
      df_regulatory_set["gpn_score"] = gpn_score
        
      # drop stub if down to 1-result
        
      #df_regulatory_set.drop([1], axis=0, inplace=True)
    
      output_file = output_dir + "/regulatory_set_hg38_part_" + part + "_" + str(set_start) + "_" + str(set_end) + ".csv"
      df_regulatory_set.to_csv(output_file, index=False, sep=',')   
  except:
      print(part + " - no")  
      #f.write(part + ",no")
      continue

#gpn_score = vep_inf.postprocess(pred)
#df_regulatory_set["gpn_score"] = gpn_score
#df_regulatory_set.head()

# store output

#output_file = output_dir + "/regulatory_set_hg38_part_" + "_" + str(set_start) + "_" + str(set_end) + ".csv"
#df_regulatory_set.to_csv(output_file, index=False, sep=',')


## Plot shows differentiation between Common and OMIM/Pathogenic mutations

In [None]:
sns.histplot(data=df_regulatory_set, x="gpn_score", hue="label")

## Save dataframe as CSV

In [None]:
output_file = output_dir + "/regulatory_set" + "_" + str(set_start) + "_" + str(set_end) + ".csv"
df_regulatory_set.to_csv(output_file, index=False, sep=',')

## Load dataframe from CSV (if needed)

In [None]:
output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

input_csv_file = [output_dir + "/regulatory_set_0_100000.csv", 
output_dir + "/regulatory_set_100000_200000.csv",
output_dir + "/regulatory_set_200000_300000.csv",
output_dir + "/regulatory_set_300000_400000.csv",
output_dir + "/regulatory_set_400000_600000.csv",
output_dir + "/regulatory_set_600000_800000.csv"]

df_regulatory_subset = []
for f in input_csv_file:
    df_regulatory_subset.append(pd.read_csv(f, sep=','))
    
df_regulatory_set = pd.concat(df_regulatory_subset, axis=0)

In [None]:
df_regulatory_set

In [None]:
sns.histplot(data=df_regulatory_set, x="gpn_score", hue="label")

## Calculate metrics

In [None]:
df_regulatory_set

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
AUROC = roc_auc_score(df_regulatory_set.label=="Pathogenic", -df_regulatory_set.gpn_score)
AUPRC = average_precision_score(df_regulatory_set.label=="Pathogenic", -df_regulatory_set.gpn_score)
AUROC, AUPRC

## Plot ROC curve

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(df_regulatory_set.label=="Pathogenic", -df_regulatory_set.gpn_score)

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'ROC curve (AUC = {AUROC:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

## CAD Datasets

### Load GPN-MSA Model

In [4]:
msa_path = "zip:///::https://huggingface.co/datasets/songlab/multiz100way/resolve/main/89.zarr.zip"
genome_msa = GenomeMSA(msa_path)  # can take a minute or two

Loading MSA...
Loading MSA... Done


### Load Inference Model

In [5]:
from gpn.msa.vep import VEPInference

model_path = "songlab/gpn-msa-sapiens"
window_size = 128
vep_inf = VEPInference(model_path, genome_msa, window_size, disable_aux_features=False)

### Subset Dataset to be Scored

In [9]:
# keep track of successful and unsuccessful scores in results file

output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"
results_file = output_dir + "/Brown_hg38_gpn_1-ahajah-results.csv"

f = open(results_file, 'a') 

# create parts list

import sre_yield

parts = []
#parts = list(sre_yield.AllStrings(r'[ab]'))
parts = [
  "ahajahae",
  "ahajahaf",
  "ahajahag",
  "ahajahah",
  "ahajahai",
  "ahajahaj"
]
parts.sort()

# parts processed so far

parts_left = parts.copy()

for part in parts:
    
# keep track of parts left to process in case of error
    
  parts_left.remove(part)

  dataset_cad_filename = dataset_prefix + "Brown_eQTL_dataset_XY-named-hg38-part-" + part +".vcf"
  dataset_cad_df = pd.read_csv(dataset_cad_filename, sep='\t')
  print(dataset_cad_df)
  dataset_cad_filename_parquet = dataset_prefix + "Brown_eQTL_dataset_XY-named-hg38-part-" + part +".parquet"
  dataset_cad_df.to_parquet(dataset_cad_filename_parquet)
  dataset_cad = load_dataset_from_file_or_dir(dataset_cad_filename_parquet, split="test", is_file=True)
 


# create dataset and dataframe

  set_start = 0
  set_end = len(dataset_cad)
  dataset_cad_set = dataset_cad.select(range(set_start, set_end))
  df_cad_set = pd.DataFrame(dataset_cad_set)
  print(df_cad_set)

# tokenize dataset

  dataset_cad_set.set_transform(vep_inf.tokenize_function)

# set arguments for testing

  from transformers import Trainer, TrainingArguments

  batch_size = 500
  training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=0,
    remove_unused_columns=False,
#   torch_compile=True,
#   fp16=True,
)

  trainer = Trainer(
    model=vep_inf.model,
    args=training_args
  )

# Test and score dataset subset

# for debugging purposes
  !TORCH_LOGS="+dynamo"
  !TORCHDYNAMO_VERBOSE=1

  try:
    pred=trainer.predict(test_dataset=dataset_cad_set).predictions

      
    gpn_score = vep_inf.postprocess(pred)
    df_cad_set["gpn_score"] = gpn_score
    print(df_cad_set["gpn_score"])
        
# drop stub if down to 1-result
        
    #df_cad_set.drop([1], axis=0, inplace=True)

    
# Write success to terminal and results file
    
    print(part + ',' + str(df_cad_set["gpn_score"][0]))
    f.write(part + ',' + str(df_cad_set["gpn_score"][0]) + "\n")
    
    
# write score output to file
    
    #output_file = output_dir + "/REG_dataset_XY-named-hg38-part_" + part + "_" + str(set_start) + "_" + 1 + ".csv"
    output_file = output_dir + "/Brown_eQTL_dataset_XY-named-hg38-part_" + part + "_" + str(set_start) + "_" + str(set_end) + ".csv"
    df_cad_set.to_csv(output_file, index=False, sep=',')   
  
  except:
        
# Write failure to terminal and results file
        
    print(part + ",no score") 
    f.write(part + ",no score" + "\n")
    continue
    
# Close results file
    
f.close()


   chrom        pos      name ref alt
0      2  110122469  BWN08000   G   A
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110122469  BWN08000   G   A
1     1  101158974  SNP00201   G   A


0    0.433602
1    1.683076
Name: gpn_score, dtype: float32
ahajahae,0.43360215
   chrom        pos      name ref alt
0      2  110126152  BWN08001   G   A
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110126152  BWN08001   G   A
1     1  101158974  SNP00201   G   A


0    4.035152
1    1.683076
Name: gpn_score, dtype: float32
ahajahaf,4.0351524
   chrom        pos      name ref alt
0      2  110127050  BWN08002   C   T
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110127050  BWN08002   C   T
1     1  101158974  SNP00201   G   A


0   -1.740111
1    1.683076
Name: gpn_score, dtype: float32
ahajahag,-1.7401114
   chrom        pos      name ref alt
0      2  110128565  BWN08003   G   A
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110128565  BWN08003   G   A
1     1  101158974  SNP00201   G   A


0    1.347963
1    1.683076
Name: gpn_score, dtype: float32
ahajahah,1.3479633
   chrom        pos      name ref alt
0      2  110135346  BWN08004   A   G
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110135346  BWN08004   A   G
1     1  101158974  SNP00201   G   A


0   -0.272301
1    1.683076
Name: gpn_score, dtype: float32
ahajahai,-0.27230084
   chrom        pos      name ref alt
0      2  110136202  BWN08005   G   C
1      1  101158974  SNP00201   G   A
  chrom        pos      name ref alt
0     2  110136202  BWN08005   G   C
1     1  101158974  SNP00201   G   A


0    1.903942
1    1.683076
Name: gpn_score, dtype: float32
ahajahaj,1.9039416


Load CAD datasets for P < 0.01 and P > 0.5

In [20]:
output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

cad_gt5_csv_file = output_dir + "/cad-gt5-results.csv"
cad_lt01_csv_file = output_dir + "/cad-lt01-results.csv"
df_cad_set_gt5 = pd.read_csv(cad_gt5_csv_file, sep=',')
df_cad_set_lt01 = pd.read_csv(cad_lt01_csv_file, sep=',')

In [21]:
df_cad_set_gt5

Unnamed: 0,file suffix,gpn_score
0,aaaaaaaa,no score
1,aaaaaaab,0.89399433
2,aaaaaaac,1.8475147
3,aaaaaaad,-2.8691573
4,aaaaaaae,-1.3439262
...,...,...
9995,ajajajaf,-1.5116944
9996,ajajajag,-1.8452871
9997,ajajajah,no score
9998,ajajajai,no score


In [22]:
df_cad_set_lt01

Unnamed: 0,file suffix,gpn_score
0,aaaaaaab,0.57608455
1,aaaaaaac,-0.36128122
2,aaaaaaad,no score
3,aaaaaaae,no score
4,aaaaaaaf,no score
...,...,...
9994,ajajajaf,no score
9995,ajajajag,no score
9996,ajajajah,1.2931308
9997,ajajajai,no score


### Drop rows with "No score"

In [17]:
df_cad_set_lt01 = df_cad_set_lt01[df_cad_set_lt01["gpn_score"].str.contains("no score") == False]
df_cad_set_lt01

Unnamed: 0,file suffix,gpn_score
0,aaaaaaab,0.57608455
1,aaaaaaac,-0.36128122
10,aaaaabab,-0.37237144
11,aaaaabac,-0.8319971
17,aaaaabai,-1.4604263
...,...,...
9991,ajajajac,-3.1450398
9992,ajajajad,0.41094077
9993,ajajajae,0.07117581
9996,ajajajah,1.2931308


In [19]:
df_cad_set_gt5 = df_cad_set_gt5[df_cad_set_gt5["gpn_score"].str.contains("no score") == False]
df_cad_set_gt5

Unnamed: 0,file suffix,gpn_score
1,aaaaaaac,-0.36128122
10,aaaaabab,-0.37237144
11,aaaaabac,-0.8319971
17,aaaaabai,-1.4604263
20,aaaaacab,0.10453239
...,...,...
9974,ajajahaf,-0.98141
9977,ajajahai,-1.5508695
9987,ajajaiai,-0.33894324
9991,ajajajac,-3.1450398


In [None]:
part = "afajajag"#  dataset_regulatory_filename = dataset_prefix + "REG_dataset_XY-named-trunc-hg38-part-" + part +".vcf"
dataset_cad_filename = dataset_prefix + "random_sampling_lt01-hg38-part-" + part +".vcf"
dataset_cad = load_dataset_from_file_or_dir(dataset_cad_filename, split="test", is_file=True)
dataset_cad

In [None]:
set_start = 0
#set_end = len(dataset_cad)
set_end = 2
dataset_cad_set = dataset_cad.select(range(set_start, set_end))
df_cad_set = pd.DataFrame(dataset_cad_set)
df_cad_set, dataset_cad_set

In [None]:
dataset_cad_set.set_transform(vep_inf.tokenize_function)
dataset_cad_set.features

In [None]:
from transformers import Trainer, TrainingArguments

output_dir = "/expanse/lustre/projects/nji102/sgriesmer/gpn/output"

batch_size = 500
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=0,
    remove_unused_columns=False,
#   torch_compile=True,
#   fp16=True,
)


In [None]:
trainer = Trainer(
    model=vep_inf.model,
    args=training_args
)

In [None]:
pred=trainer.predict(test_dataset=dataset_cad_set).predictions

In [None]:
gpn_score = vep_inf.postprocess(pred)
df_cad_set["gpn_score"] = gpn_score

In [None]:
df_cad_set

### Cut out stub from set

In [None]:
df_cad_set.drop([1], axis=0, inplace=True)
df_cad_set

In [None]:
print(part + ',' + str(df_cad_set["gpn_score"][0]))

In [None]:
output_file = output_dir + "/cad_set_hg38_part_" + part + "_" + str(set_start) + "_" + str(set_end) + ".csv"
df_cad_set.to_csv(output_file, index=False, sep=',') 