In [1]:
import sys
sys.path.append('/tscc/projects/ps-palmer/gwas/GWAS-pipeline/')
from gwas_class_auto import *
from interactiveqc import interactive_QC

## Let's download a test dataset. In this example we will use the published data from our lab and our collaborators:
 Genome-Wide Association Study in 3,173 Outbred Rats Identifies Multiple Loci for Body Weight, Adiposity, and Fasting Glucose. In The Center for GWAS in Outbred Rats Database (C-GORD). UC San Diego Library Digital Collections. https://doi.org/10.6075/J0Q240F0.
 #### phenotypes https://library.ucsd.edu/dc/object/bb9156620z/_2_1.zip/download
#### genotypes pruned https://library.ucsd.edu/dc/object/bb9156620z/_3_1.zip/download

 
This contains 2 files: one with the raw data and one with a data dictionary, which is used to tell the pipeline which covariate is related to which trait.

Most likely you will not have a data dictionary of your own, so to make the example fairer, let's build it again from scratch.

Second, we need an individual ID column. The pipeline requires it to be labeled 'rfid', and it will be read as a string in all cases. Third, there must be a column called 'sex' with the M|F encoding (because the X and Y chromosomes need some extra treatment that is sex dependent).

In [2]:
### this will donwload the raw data and a data dictionary ## the ! is useful to run bash in a jupyter notebook
!wget https://library.ucsd.edu/dc/object/bb9156620z/_2_1.zip/download
!unzip -o download  

### removing unneeded files
os.remove('Obesity_normalized_phenotypes_n3173.csv')
os.remove('trait_ontology.xlsx')
os.remove('download')

--2024-09-24 08:04:19--  https://library.ucsd.edu/dc/object/bb9156620z/_2_1.zip/download
Resolving library.ucsd.edu (library.ucsd.edu)... 132.239.119.5
Connecting to library.ucsd.edu (library.ucsd.edu)|132.239.119.5|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 424720 (415K) [application/zip]
Saving to: ‘download’


2024-09-24 08:04:19 (123 MB/s) - ‘download’ saved [424720/424720]

Archive:  download
  inflating: Obesity_normalized_phenotypes_n3173.csv  
  inflating: Obesity_published_phenotypes_raw_n3173.csv  
  inflating: trait_ontology.xlsx     


In [3]:
!wget https://library.ucsd.edu/dc/object/bb9156620z/_3_1.zip/download
!unzip -o download
!unzip -o LD_pruned_PLINK.zip

### removing unneeded files
os.remove('LD_pruned_PLINK.zip')
os.remove('LD_pruned_0.95.zip')
os.remove('download')

--2024-09-24 08:04:20--  https://library.ucsd.edu/dc/object/bb9156620z/_3_1.zip/download
Resolving library.ucsd.edu (library.ucsd.edu)... 132.239.119.5
Connecting to library.ucsd.edu (library.ucsd.edu)|132.239.119.5|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 459447050 (438M) [application/zip]
Saving to: ‘download’


2024-09-24 08:04:25 (97.2 MB/s) - ‘download’ saved [459447050/459447050]

Archive:  download
 extracting: LD_pruned_0.95.zip      
 extracting: LD_pruned_PLINK.zip     
Archive:  LD_pruned_PLINK.zip
  inflating: P50_round2_LD_pruned_3473.bed  
  inflating: P50_round2_LD_pruned_3473.bim  
  inflating: P50_round2_LD_pruned_3473.fam  


# Awesome, we have our test data, now we need to check formats and make sure it's all good

In [4]:
# Read the animal ID column explicitly as text: the pipeline treats 'rfid' as a string,
# and letting pandas infer the dtype would turn purely numeric IDs into numbers
# (dropping leading zeros such as the ones in '0007929BF5').
df = pd.read_csv('Obesity_published_phenotypes_raw_n3173.csv', dtype={'rat_rfid': str})
print('rfid' in df.columns , 'sex' in df.columns) ### let's check if the essential columns are there
display(df.columns) ### they are not, but we can see that rfid is rat_rfid and sex is sex_mf
## let's rename them, then drop the animals that have no ID or no sex recorded
## (built as a new frame instead of mutating in place, so the steps read top-down)
df = (
    df.rename(columns={'rat_rfid': 'rfid', 'sex_mf': 'sex'})
      .dropna(subset=['sex', 'rfid'])
)
#df = df.query('center == "MI"') ### lets subset to a single site to lower the number of animals

False False


Index(['rat_rfid', 'sex_mf', 'center', 'bmi_wo_tail', 'bmi_wo_tail_age', 'bmi_wo_tail_technician', 'bmi_w_tail', 'bmi_w_tail_age', 'bmi_w_tail_technician', 'body_weight_g', 'body_weight_g_age', 'body_weight_g_technician', 'epididymis_fat_weight_g', 'epididymis_fat_weight_g_age', 'epididymis_fat_weight_g_technician', 'glucose_reading_mg_dl', 'glucose_reading_mg_dl_age', 'glucose_reading_mg_dl_technician', 'length_wo_tail_cm', 'length_wo_tail_cm_age', 'length_wo_tail_cm_technician', 'length_w_tail_cm', 'length_w_tail_cm_age', 'length_w_tail_cm_technician', 'parametrial_fat_weight_g', 'parametrial_fat_weight_g_age', 'parametrial_fat_weight_g_technician', 'retroperitoneal_fat_weight_g_age', 'retroperitoneal_fat_weight_g_technician', 'tail_length', 'tail_length_age', 'tail_length_technician', 'idx', 'retroperitoneal_fat_weight_g'], dtype='object')

In [5]:
### Build a first draft of the data dictionary from the raw data; this gives a good
### start for the data_dictionary file, which is adjusted further below.
### A broader prefix list that can be used for trait_prefix instead:
### 'bmi,body_weight,length,epididymis,glucose,tail,retroperitoneal'
datadic_options = dict(
    rawdata=df,
    trait_prefix='glucose,body_weight',
    main_cov='sex,center',
    save=False,
    # custom descriptions override the default one for the listed measures
    description_dict={'bmi_wo_tail': 'Example of adding better description to bmi_wo_tail trait'},
)
datadic = generate_datadic(**datadic_options)

In [6]:
#### Looking at the draft, the _age and _technician suffixes mark covariates of their
#### respective traits, so we have to make some adjustments.
#### Remember that this can also be done in Excel if you are not as comfortable with
#### coding; the goal is to be efficient regardless of the skill set.

# fix the technician columns: they are categorical covariates, not traits
is_technician = datadic.measure.str.contains('_technician$')
datadic.loc[is_technician, 'trait_covariate'] = 'covariate_categorical'

# computed after the fix above, so it reflects the corrected trait_covariate column
is_trait = datadic.trait_covariate.str.contains('^trait$')
trait_names = datadic.loc[is_trait, 'measure']


def covariate_suffix(trait, suffix):
    """Return ',<trait><suffix>' when that measure exists in the data dictionary, else ''."""
    candidate = f'{trait}{suffix}'
    return f',{candidate}' if candidate in datadic.measure.values else ''


# append each trait's own age and technician columns to its covariate list
datadic.loc[is_trait, 'covariates'] += (
    trait_names.map(lambda trait: covariate_suffix(trait, '_age'))
    + trait_names.map(lambda trait: covariate_suffix(trait, '_technician'))
)
datadic

Unnamed: 0,measure,count,unique,top,freq,mean,std,min,25%,50%,75%,max,trait_covariate,covariates,description
0,rfid,3193.0,3193.0,0007929BF5,1.0,,,,,,,,metadata,,rfid
1,sex,3193.0,2.0,M,1618.0,,,,,,,,covariate_categorical,,sex
2,center,3193.0,4.0,MI,1038.0,,,,,,,,covariate_categorical,,center
3,bmi_wo_tail,3164.0,,,,6.303489,1.229881,2.770919,5.425,6.065145,7.008404,11.101235,covariate_continuous,,Example of adding better description to bmi_wo...
4,bmi_wo_tail_age,3164.0,,,,123.250316,55.796602,51.0,83.0,92.0,187.0,278.0,covariate_continuous,,bmi_wo_tail_age
5,bmi_wo_tail_technician,3164.0,15.0,TW,934.0,,,,,,,,covariate_categorical,,bmi_wo_tail_technician
6,bmi_w_tail,3161.0,,,,1.762944,0.310829,0.988242,1.524709,1.709009,1.946502,4.106423,covariate_continuous,,bmi_w_tail
7,bmi_w_tail_age,3161.0,,,,123.239165,55.798749,51.0,83.0,92.0,187.0,278.0,covariate_continuous,,bmi_w_tail_age
8,bmi_w_tail_technician,3161.0,15.0,TW,931.0,,,,,,,,covariate_categorical,,bmi_w_tail_technician
9,body_weight_g,3164.0,,,,263.811315,84.739467,121.0,196.0,244.0,313.0,606.0,trait,"sex,center,body_weight_g_age,body_weight_g_tec...",body_weight_g


In [7]:
# Save the data dictionary in the format data_dict_$foldername.csv,
# where $foldername is the name of the current working folder.
project_folder = os.path.basename(os.getcwd())
datadic.to_csv(f'data_dict_{project_folder}.csv', index=False)

In [8]:
# Create the interactive QC object from the raw phenotype frame and the data dictionary.
qc = interactive_QC(raw_data=df, data_dictionary=datadic)

  self.dfog[self.traits] = self.dfog[self.traits].applymap(lambda x: (str(x).replace(' ', '').replace('#DIV/0!', 'nan').replace('#VALUE!', 'nan')
100%|██████████| 2/2 [00:00<00:00, 144.00it/s]


In [9]:
qc.QC() ### this will pop up an interactive QC that you can use to set thresholds, remove individual rats, and do stricter filtering
        ### we make it obligatory to go through all traits before saving the necessary files

interactive(children=(Dropdown(description='single_trait', options=('body_weight_g', 'glucose_reading_mg_dl'),…

In [10]:
# Copy the curated data saved from the interactive QC to raw_data.csv.
# The curated file name appears to carry the animal count and the save date
# (e.g. raw_data_curated_n3193_20240917.csv), so a hard-coded name only works for the
# run that produced it. Pick the most recently written curated file instead.
# NOTE(review): assumes the QC saves into the current folder with this prefix — confirm.
curated_files = [
    name for name in os.listdir('.')
    if name.startswith('raw_data_curated_n') and name.endswith('.csv')
]
if not curated_files:
    raise FileNotFoundError(
        "no 'raw_data_curated_n*.csv' file found; run qc.QC() and save the curated data first"
    )
latest_curated = max(curated_files, key=os.path.getmtime)
# keep rfid as text so the IDs survive the read/write round trip unchanged
pd.read_csv(latest_curated, dtype={'rfid': str}).to_csv('raw_data.csv', index = False)

# Now we can actually run the pipeline

In [11]:
pipeline = gwas_pipe(
    # path to the project folder; we are working in the same folder as the results,
    # so we keep it as an empty string
    path='',
    # path to the PLINK bed/bim/fam files, without the file extension
    all_genotypes='P50_round2_LD_pruned_3473',
    # the dataframe that we have; we could also provide the path to the raw data as a string
    data=df,
    # this has to be the same as the basename of the folder
    project_name='example',
    # you can subset the traits, but most of the time following the data dictionary is best
    traits=[],
    # in this case I know the NCBI genome accession, but if you don't know yours just leave
    # it blank and the pipeline will guide you; otherwise you can search for it on
    # https://www.ncbi.nlm.nih.gov/datasets/genome/
    genome_accession='GCF_000001895.5',
    # here I'm setting a known threshold; if you don't know yours, set the value to 'auto'
    # and it will be estimated for you, but with a pretty high penalty in time
    threshold=5.38,
    # no founder genotypes are passed in this example; per the original note, leave it ''
    # or write 'none' if you don't have them. Example of a founder file path:
    # '/tscc/projects/ps-palmer/gwas/databases/founder_genotypes/founders7.2'
    founderfile=None,
    # the phewas database used for querying traits; most likely you don't have one
    # already, so you can choose something new
    phewas_db='https://palmerlab.s3.sdsc.edu/tsanches_dash_genotypes/phewas/phewasdb_rn6.parquet.gz',
    threads=60,
)


rm -r temp
importing traits from data_dict_example.csv
body_weight_g            3164
glucose_reading_mg_dl    2246
dtype: int64
importing trait descriptions from data_dict_example.csv


In [12]:
%%time
pipeline.run(round_version='genotypes_test', add_sex_specific_traits = True, clear_directories = True,
            gwas_version = '0.3.0', groupby_animals = ['center'], add_latent_space=False,
            researcher = 'user')