# Calculate the principal components (PCs) for the ARHI phenotypes to use in the RAP system

In [9]:
# Common variables for Columbia's cluster, shared by the cells below

# Base directories
UKBB_PATH="${HOME}/UKBiobank"
UKBB_yale="${HOME}/UKBiobank_Yale_transfer"
USER_PATH="${HOME}/project"

# Container images (.sif files)
container="${HOME}/containers/lmm.sif"
container_marp="${HOME}/containers/marp.sif"
container_annovar="${HOME}/containers/gatk4-annovar.sif"

# Phenotype directory and job-script template
hearing_pheno_path="${UKBB_PATH}/phenotype_files/hearing_impairment"
tpl_file="${USER_PATH}/bioworkflows/admin/csg.yml"

# Per-method format templates
formatFile_fastgwa="${USER_PATH}/UKBB_GWAS_dev/data/fastGWA_template.yml"
formatFile_bolt="${USER_PATH}/UKBB_GWAS_dev/data/boltlmm_template.yml"
formatFile_saige="${USER_PATH}/UKBB_GWAS_dev/data/saige_template.yml"
formatFile_regenie="${USER_PATH}/UKBB_GWAS_dev/data/regenie_template.yml"

# f3393 

## Step 1

In [10]:
## Columbia's cluster
# Step 1 (f3393): generate the job script that runs the `qc:1` workflow step on
# the QC'ed genotype array, passing the f3393 sample list and the LD-pruned
# variant list.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP"
# The job script is written into the results directory, so derive it from $cwd
gwas_sbatch="$cwd/qc1_f3393_qcarray_$(date +"%Y-%m-%d").sbatch"
## Use the QC'ed genotype array
genoFile="$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed"
# NOTE(review): the other phenotypes in this notebook pass their .phenopca file
# here; confirm that the .tsv is intended for f3393
keep_samples=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.tsv
# Keep variants after LD pruning. The glob stays literal (it is quoted here and
# inside gwasqc_args); presumably the workflow resolves it -- TODO confirm
keep_variants="$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in"

# GWAS QC variables: all filters are set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'
job_size=1
gwasqc_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
gwasqc_args="qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
    --mem $mem
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP/qc1_f3393_qcarray_2023-07-25.sbatch[0m
INFO: Workflow csg (ID=w1194fc8aef312acf) is executed successfully with 1 completed step.


In [18]:
# Step 1 (f3393) run directly with PLINK 2: restrict the QC'ed genotype array to
# the f3393 samples (--keep) and the LD-pruned variants (--extract), then write
# a new binary fileset (--make-bed) into the f3393 results directory (--out).
# --memory is given in MiB (per the PLINK 2 documentation).
module load Plink/2.00a
 plink2 --bfile /home/dmc2245/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted \
  --extract /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.in \
  --keep /mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.tsv \
  --make-bed \
  --memory 27000.0 \
  --new-id-max-allele-len 1000 \
  --out /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.filtered.extracted \
  --threads 1

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.filtered.extracted.log.
Options in effect:
  --bfile /home/dmc2245/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted
  --extract /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.in
  --keep /mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.tsv


## Step 2

In [22]:
## Columbia's cluster
# Step 2 (f3393): generate the job script that runs the `flashpca` workflow on
# the bfile produced in step 1.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP"
# This is the bfile obtained in step 1 (the quotes keep the glob literal)
genoFile="$cwd/*.bed"
# Format FID, IID, pop
phenoFile=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.phenopca
label_col=ethnicity
pop_col=ethnicity
# The job script is written into the results directory, so derive it from $cwd
pca_sbatch="$cwd/flashpca_f3393_pc_$(date +"%Y-%m-%d").sbatch"
k=4
# NOTE(review): both are empty, so the matching flags below are passed without
# a value; presumably the workflow falls back to its defaults -- TODO confirm
min_axis=""
max_axis=""
# Set here so this cell does not depend on a step-1 cell having been run in the
# same session; a value that is already set is kept
numThreads=${numThreads:-1}
job_size=1
pca_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
pca_args="flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f3393_pca_500K_imputed_RAP/flashpca_f3393_pc_2023-07-26.sbatch[0m
INFO: Workflow csg (ID=wfce1448e339e5fd7) is executed successfully with 1 completed step.


# f2247

## Step 1

In [16]:
## Columbia's cluster
# Step 1 (f2247): generate the job script that runs the `qc:1` workflow step on
# the QC'ed genotype array, passing the f2247 sample list and the LD-pruned
# variant list.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP"
# The job script is written into the results directory, so derive it from $cwd
gwas_sbatch="$cwd/qc1_f2247_qcarray_$(date +"%Y-%m-%d").sbatch"
## Use the QC'ed genotype array
genoFile="$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed"
keep_samples=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.phenopca
# Keep variants after LD pruning. The glob stays literal (it is quoted here and
# inside gwasqc_args); presumably the workflow resolves it -- TODO confirm
keep_variants="$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in"

# GWAS QC variables: all filters are set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'
job_size=1
gwasqc_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
gwasqc_args="qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
    --mem $mem
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP/qc1_f2247_qcarray_2023-07-25.sbatch[0m
INFO: Workflow csg (ID=w84f413ab96248cc3) is executed successfully with 1 completed step.


In [None]:
#!/bin/sh
#$ -l h_rt=36:00:00
#$ -l h_vmem=3G
#$ -N UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind
#$ -o /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP/qc1_f2247_qcarray-$JOB_ID.out
#$ -e /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP/qc1_f2247_qcarray-$JOB_ID.err
#$ -j y
#$ -q csg.q
#$ -S /bin/bash

# Job script for f2247 step 1. The `#$` lines above look like Grid Engine (qsub)
# directives -- TODO confirm against the cluster's scheduler.
# The plink2 call restricts the QC'ed genotype array to the f2247 samples
# (--keep) and the LD-pruned variants (--extract) and writes a new binary
# fileset (--make-bed) into the f2247 results directory (--out).
# NOTE(review): the directives request h_vmem=3G while plink2 is given
# --memory 27000.0 (MiB per the PLINK 2 documentation); confirm that the
# scheduler limit is large enough for this job.
export PATH=$HOME/miniconda3/bin:$PATH
module load Plink/2.00a
 plink2 --bfile /home/dmc2245/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted \
  --extract /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.in \
  --keep /mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.phenopca \
  --make-bed \
  --memory 27000.0 \
  --new-id-max-allele-len 1000 \
  --out /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.filtered.extracted \
  --threads 1

## Step 2

In [23]:
## Columbia's cluster
# Step 2 (f2247): generate the job script that runs the `flashpca` workflow on
# the bfile produced in step 1.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP"
# The job script is written into the results directory, so derive it from $cwd
pca_sbatch="$cwd/flashpca_f2247_pc_$(date +"%Y-%m-%d").sbatch"
# This is the bfile obtained in step 1 (the quotes keep the glob literal)
genoFile="$cwd/*.bed"
# Format FID, IID, pop
phenoFile=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.phenopca
label_col=ethnicity
pop_col=ethnicity
k=4
# NOTE(review): both are empty, so the matching flags below are passed without
# a value; presumably the workflow falls back to its defaults -- TODO confirm
min_axis=""
max_axis=""
# Set here so this cell does not depend on a step-1 cell having been run in the
# same session; a value that is already set is kept
numThreads=${numThreads:-1}
job_size=1
pca_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
pca_args="flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_pca_500K_imputed_RAP/flashpca_f2247_pc_2023-07-26.sbatch[0m
INFO: Workflow csg (ID=w13b49e1e3aa46d36) is executed successfully with 1 completed step.


# f2257

## Step 1

In [None]:
## Columbia's cluster
# Step 1 (f2257): generate the job script that runs the `qc:1` workflow step on
# the QC'ed genotype array, passing the f2257 sample list and the LD-pruned
# variant list.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP"
# The job script is written into the results directory, so derive it from $cwd
gwas_sbatch="$cwd/qc1_f2257_qcarray_$(date +"%Y-%m-%d").sbatch"
## Use the QC'ed genotype array
genoFile="$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed"
keep_samples=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.phenopca
# Keep variants after LD pruning. The glob stays literal (it is quoted here and
# inside gwasqc_args); presumably the workflow resolves it -- TODO confirm
keep_variants="$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in"

# GWAS QC variables: all filters are set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'
job_size=1
gwasqc_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
gwasqc_args="qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
    --mem $mem
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

In [None]:
#!/bin/sh
#$ -l h_rt=36:00:00
#$ -l h_vmem=3G
#$ -N UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind
#$ -o /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP/qc1_f2257_qcarray-$JOB_ID.out
#$ -e /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP/qc1_f2257_qcarray-$JOB_ID.err
#$ -j y
#$ -q csg.q
#$ -S /bin/bash

# Job script for f2257 step 1. The `#$` lines above look like Grid Engine (qsub)
# directives -- TODO confirm against the cluster's scheduler.
# The plink2 call restricts the QC'ed genotype array to the f2257 samples
# (--keep) and the LD-pruned variants (--extract) and writes a new binary
# fileset (--make-bed) into the f2257 results directory (--out).
# NOTE(review): the directives request h_vmem=3G while plink2 is given
# --memory 27000.0 (MiB per the PLINK 2 documentation); confirm that the
# scheduler limit is large enough for this job.
# NOTE(review): --threads 10 is used here, whereas the f2247 job script uses
# --threads 1 and no multi-slot request appears among the directives; confirm
# which thread count is intended.
export PATH=$HOME/miniconda3/bin:$PATH
module load Plink/2.00a
 plink2 --bfile /home/dmc2245/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted \
  --extract /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.in \
  --keep /mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.phenopca \
  --make-bed \
  --memory 27000.0 \
  --new-id-max-allele-len 1000 \
  --out /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.filtered.extracted \
  --threads 10

## Step 2

In [24]:
## Columbia's cluster
# Step 2 (f2257): generate the job script that runs the `flashpca` workflow on
# the bfile produced in step 1.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP"
# The job script is written into the results directory, so derive it from $cwd
pca_sbatch="$cwd/flashpca_f2257_pc_$(date +"%Y-%m-%d").sbatch"
# This is the bfile obtained in step 1 (the quotes keep the glob literal)
genoFile="$cwd/*.bed"
# Format FID, IID, pop
phenoFile=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.phenopca
label_col=ethnicity
pop_col=ethnicity
k=4
# NOTE(review): both are empty, so the matching flags below are passed without
# a value; presumably the workflow falls back to its defaults -- TODO confirm
min_axis=""
max_axis=""
# Set here so this cell does not depend on a step-1 cell having been run in the
# same session; a value that is already set is kept
numThreads=${numThreads:-1}
job_size=1
pca_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
pca_args="flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2257_pca_500K_imputed_RAP/flashpca_f2257_pc_2023-07-26.sbatch[0m
INFO: Workflow csg (ID=wd41ad62f6549be09) is executed successfully with 1 completed step.


# Combined f.2247 & f.2257

## Step 1

In [12]:
## Columbia's cluster
# Step 1 (combined f2247 & f2257): generate the job script that runs the `qc:1`
# workflow step on the QC'ed genotype array, passing the combined sample list
# and the LD-pruned variant list.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP"
# Alternative job-script location (disabled):
#gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/qc1_f2247_f2257_qcarray_$(date +"%Y-%m-%d").sbatch
# The job script is written into the results directory, so derive it from $cwd
gwas_sbatch="$cwd/qc1_f2247_f2257_qcarray_$(date +"%Y-%m-%d").sbatch"
## Use the QC'ed genotype array
genoFile="$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed"
keep_samples=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.phenopca
# Keep variants after LD pruning. The glob stays literal (it is quoted here and
# inside gwasqc_args); presumably the workflow resolves it -- TODO confirm
keep_variants="$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in"

# GWAS QC variables: all filters are set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'
job_size=1
gwasqc_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
gwasqc_args="qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
    --mem $mem
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/tf2478/hearing/qc1_f2247_f2257_qcarray_2021-10-05.sbatch[0m
INFO: Workflow csg (ID=w8e0d9b17c271ad03) is executed successfully with 1 completed step.


In [None]:
#!/bin/sh
#$ -l h_rt=36:00:00
#$ -l h_vmem=3G
#$ -N UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind
#$ -o /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP/qc1_f2247_f2257_qcarray-$JOB_ID.out
#$ -e /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP/qc1_f2247_f2257_qcarray-$JOB_ID.err
#$ -j y
#$ -q csg.q
#$ -S /bin/bash

# Job script for the combined f2247 & f2257 step 1. The `#$` lines above look
# like Grid Engine (qsub) directives -- TODO confirm against the cluster's
# scheduler.
# The plink2 call restricts the QC'ed genotype array to the combined sample
# list (--keep) and the LD-pruned variants (--extract) and writes a new binary
# fileset (--make-bed) into the combined results directory (--out).
# NOTE(review): the directives request h_vmem=3G while plink2 is given
# --memory 27000.0 (MiB per the PLINK 2 documentation); confirm that the
# scheduler limit is large enough for this job.
# NOTE(review): --threads 10 is used here, whereas the f2247 job script uses
# --threads 1 and no multi-slot request appears among the directives; confirm
# which thread count is intended.
export PATH=$HOME/miniconda3/bin:$PATH
module load Plink/2.00a
 plink2 --bfile /home/dmc2245/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted \
  --extract /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.in \
  --keep /mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.phenopca \
  --make-bed \
  --memory 27000.0 \
  --new-id-max-allele-len 1000 \
  --out /home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.filtered.extracted \
  --threads 10

## Step 2

In [25]:
## Columbia's cluster
# Step 2 (combined f2247 & f2257): generate the job script that runs the
# `flashpca` workflow on the bfile produced in step 1.
cwd="$UKBB_PATH/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP"
# This is the bfile obtained in step 1 (the quotes keep the glob literal)
genoFile="$cwd/*.bed"
# Format FID, IID, pop
phenoFile=/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072423_ARHI_phenotypes_RAP/072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.phenopca
label_col=ethnicity
pop_col=ethnicity
# The job script is written into the results directory, so derive it from $cwd
pca_sbatch="$cwd/flashpca_f2247_f2257_pc_$(date +"%Y-%m-%d").sbatch"
k=4
# NOTE(review): both are empty, so the matching flags below are passed without
# a value; presumably the workflow falls back to its defaults -- TODO confirm
min_axis=""
max_axis=""
# Set here so this cell does not depend on a step-1 cell having been run in the
# same session; a value that is already set is kept
numThreads=${numThreads:-1}
job_size=1
pca_sos="$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb"

# Workflow name and its arguments, forwarded as a single string through --args
pca_args="flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container
"

# Quote every expansion so that paths are passed as single arguments
sos run "$USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb" csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/UKBiobank/results/092821_PCA_results_500K/072523_f2247_f2257_pca_500K_imputed_RAP/flashpca_f2247_f2257_pc_2023-07-26.sbatch[0m
INFO: Workflow csg (ID=w5dc842eb08c3492f) is executed successfully with 1 completed step.
