In [None]:
import sys
sys.path.append('/tscc/projects/ps-palmer/gwas/GWAS-pipeline/')
from gwas_class_auto import *
from interactiveqc import interactive_QC

## Let's download a test dataset. In this example we will use the published data from our lab and our collaborators:
 Genome-Wide Association Study in 3,173 Outbred Rats Identifies Multiple Loci for Body Weight, Adiposity, and Fasting Glucose. In The Center for GWAS in Outbred Rats Database (C-GORD). UC San Diego Library Digital Collections. https://doi.org/10.6075/J0Q240F0.
 #### phenotypes https://library.ucsd.edu/dc/object/bb9156620z/_2_1.zip/download
#### genotypes (LD-pruned) https://library.ucsd.edu/dc/object/bb9156620z/_3_1.zip/download

 
This contains 2 files: one with the raw data and one with a data dictionary, which is used to tell the pipeline which covariate is related to which trait.

Most likely you will not have a data dictionary for your own data, so to keep this example realistic let's build it again from scratch.

Next, we need an individual ID column. The pipeline requires it to be labeled 'rfid', and it will be read as a string in all cases. There must also be a column called 'sex' with the M|F encoding (because the X and Y chromosomes need some extra treatment that is sex dependent).

In [None]:
### this will donwload the raw data and a data dictionary ## the ! is useful to run bash in a jupyter notebook
!wget https://library.ucsd.edu/dc/object/bb9156620z/_2_1.zip/download
!unzip -o download  

### removing unneeded files
os.remove('Obesity_normalized_phenotypes_n3173.csv')
os.remove('trait_ontology.xlsx')
os.remove('download')

In [None]:
!wget https://library.ucsd.edu/dc/object/bb9156620z/_3_1.zip/download
!unzip -o download
!unzip -o LD_pruned_PLINK.zip

### removing unneeded files
os.remove('LD_pruned_PLINK.zip')
os.remove('LD_pruned_0.95.zip')
os.remove('download')

# Awesome, we have our test data, now we need to check formats and make sure it's all good

In [None]:
### read the animal ID as text from the start: the pipeline treats rfid as a string, and letting
### pandas infer a numeric dtype could drop leading zeros or turn IDs into floats
df_raw = pd.read_csv('Obesity_published_phenotypes_raw_n3173.csv', dtype = {'rat_rfid': str})
print('rfid' in df_raw.columns , 'sex' in df_raw.columns) ### let's check if the essential columns are there
display(df_raw.columns) ### they are not, but we can see that rfid is rat_rfid and sex is sex_mf
## let's rename them then, and drop the animals that are missing either essential column
## (built as a new frame so that re-running this cell always starts from the file on disk)
df = (
    df_raw
    .rename(columns = {'rat_rfid': 'rfid', 'sex_mf': 'sex'})
    .dropna(subset = ['sex', 'rfid'])
)
#df = df.query('center == "MI"') ### lets subset to a single site to lower the number of animals

In [None]:
### this will give a good start for the data_dictionary file; it is adjusted by hand afterwards
### a longer trait_prefix kept from the original example: 'bmi,body_weight,length,epididymis,glucose,tail,retroperitoneal'
datadic = generate_datadic(
    rawdata = df,
    trait_prefix = 'glucose,body_weight',
    main_cov = 'sex,center',
    save = False,
    description_dict = {'bmi_wo_tail': 'Example of adding better description to bmi_wo_tail trait'},
)
#datadic

In [None]:
#### looking at the data dictionary we can notice the suffixes _age and _technician: these columns are
#### covariates of their respective traits, so we have to make some adjustments.
#### remember that this can also be done in excel if you're not as comfortable with coding;
#### the goal is to be efficient regardless of the skillset

# technician columns are categorical covariates, not traits
technician_rows = datadic.measure.str.contains('_technician$')
datadic.loc[technician_rows, 'trait_covariate'] = 'covariate_categorical'

# rows still labeled exactly 'trait' get their own <trait>_age / <trait>_technician columns
# appended to the covariate list, but only when such a column is present in the dictionary
trait_rows = datadic.trait_covariate.str.contains('^trait$')
trait_names = datadic.loc[trait_rows, 'measure']

def covariate_suffix(trait_name, suffix):
    """Return ',<trait_name>_<suffix>' if that measure is in the data dictionary, else ''."""
    candidate = f'{trait_name}_{suffix}'
    return f',{candidate}' if candidate in datadic.measure.values else ''

datadic.loc[trait_rows, 'covariates'] += (
    trait_names.map(lambda name: covariate_suffix(name, 'age'))
    + trait_names.map(lambda name: covariate_suffix(name, 'technician'))
)
datadic

In [None]:
# save data dictionary in the format data_dict_$foldername, where $foldername is the current working directory's name
working_folder_name = os.path.basename(os.getcwd())
datadic.to_csv(f'data_dict_{working_folder_name}.csv', index=False)

In [None]:
# build the interactive QC helper from the phenotype table and its data dictionary
qc = interactive_QC(
    raw_data = df,
    data_dictionary = datadic,
)

In [None]:
qc.QC() ### this will pop up an interactive QC that you can use to set thresholds, remove individual rats, and do stricter filtering
        ### going through all traits is obligatory before the necessary files are saved

In [None]:
# Copy the curated table written by the interactive QC to 'raw_data.csv'.
# The curated file name appears to embed the animal count and the save date
# (e.g. 'raw_data_curated_n3193_20240917.csv' -- TODO confirm against interactive_QC), so a
# hard-coded name only works for one particular run; pick the most recently written match instead.
from pathlib import Path

curated_files = sorted(Path('.').glob('raw_data_curated_n*_*.csv'), key = lambda p: p.stat().st_mtime)
if not curated_files:
    raise FileNotFoundError(
        "No 'raw_data_curated_n*_*.csv' file found in the working folder; "
        "run qc.QC() and save the curated data first."
    )
# keep rfid as text so the round trip through pandas cannot alter the animal IDs
pd.read_csv(curated_files[-1], dtype = {'rfid': str}).to_csv('raw_data.csv', index = False)

# Now we can actually run the pipeline

In [None]:
# NOTE(review): `data` is the in-memory `df`, not the curated 'raw_data.csv' saved in the previous cell --
# confirm whether the pipeline is expected to pick up the curated file on its own.
pipeline = gwas_pipe(
    # path to the project folder; we are working in the same folder as the results, so it stays an empty string
    path = '',
    # path to the genotype files without the file extension (presumably the PLINK .bed/.bim/.fam set)
    all_genotypes = 'P50_round2_LD_pruned_3473',
    # the dataframe that we have; we could also provide the path to the raw data as a string
    data = df,
    # this has to be the same as the basename of the folder
    project_name = 'example',
    # you can subset the traits, but most of the time following the data dictionary is the best
    traits = [],
    # in this case I know the NCBI genome accession, but if you don't know yours just leave it blank and the pipeline will guide you
    # otherwise you can search it on https://www.ncbi.nlm.nih.gov/datasets/genome/
    genome_accession = 'GCF_000001895.5',
    # set to a known threshold here; if you don't know yours, use 'auto' and it will be estimated for you, with a pretty high penalty in time
    threshold = 5.38,
    # no founder genotypes are passed in this example; per the original note you can also leave it '' or write 'none'
    # when you don't have them. Example of a founders file: '/tscc/projects/ps-palmer/gwas/databases/founder_genotypes/founders7.2'
    founderfile = None,
    # the phewas database used for querying traits; most likely you don't have one already, so you can choose something new
    phewas_db = 'https://palmerlab.s3.sdsc.edu/tsanches_dash_genotypes/phewas/phewasdb_rn6.parquet.gz',
    threads = 60,
)


In [None]:
%%time
pipeline.run(round_version='genotypes_test', add_sex_specific_traits = True, clear_directories = True,
            gwas_version = '0.3.0', groupby_animals = ['center'], add_latent_space=False,
            researcher = 'user')