# <center>PCA analysis</center>

This page documents the PCA analysis for the following groups:<br>&nbsp;&nbsp;&nbsp;&nbsp; 1. All Africans <br>&nbsp;&nbsp;&nbsp;&nbsp; 2. All Europeans <br>&nbsp;&nbsp;&nbsp;&nbsp; 3. SJLIFE Europeans <br>&nbsp;&nbsp;&nbsp;&nbsp; 4. CCSS Europeans 
<p>Inputs<br>&nbsp;&nbsp;&nbsp;&nbsp;1. Informative SNPs used for PCA analysis.<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Four columns:<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1st column is SNP name<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2nd column is chromosome<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3rd column is genetic position (in Morgans).  If unknown, ok to set to 0.0<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4th column is physical position (in bases)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Optional 5th and 6th columns are reference and variant alleles<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;2. bcfDir (The directory where bcf files with GT are stored)<br>&nbsp;&nbsp;&nbsp;&nbsp;3. sample file under working directory for each of the following groups:<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;SJLIFE European (sample.ceu.sjlife.ind) <br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CCSS European (sample.ceu.ccss.ind)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;All European (sample.ceu.sjlife.ccss.ind)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;All African (sample.yri.sjlife.ccss.ind)<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font color = red>3 columns<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1st column is sample ID<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;2nd column is gender (M or F).  If unknown, ok to set to U for Unknown<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3rd column is a label which might refer to Case or Control status, or might be a population group label</font><br>&nbsp;&nbsp;&nbsp;&nbsp;4. 
sampleMasterFile (all samples in the same order as in bcf file)<p>Outputs<br>&nbsp;&nbsp;&nbsp;&nbsp;african.sjlife.ccss<br>&nbsp;&nbsp;&nbsp;&nbsp;european.ccss<br>&nbsp;&nbsp;&nbsp;&nbsp;european.sjlife<br>&nbsp;&nbsp;&nbsp;&nbsp;european.sjlife.ccss

## 1. Software install and setup 

I. Follow the steps from https://github.com/DReichLab/EIG to install smartpca for PCA analysis.<br>II. Add EIG tools to your $PATH <br>&nbsp;&nbsp;&nbsp;&nbsp;<font color = red>Load openblas every time before you start Jupyter on the HPC <br>&nbsp;&nbsp;&nbsp;&nbsp;module load openblas/dynamic/0.2.18</font>

## 2. working directory and required file setup

In [5]:
import os,sys
import subprocess as sp

#shutil is only needed by the smartpca lookup below
import shutil

#working directory
workingDir = '/research/rgs01/resgen/legacy/gb_customTracks/tp/jwang/TASK/survivorship/PCA2' #pragma: allowlist secret

#Informative SNPs used for PCA analysis
snpFile = 'pca.snp'

#bcfDir (The directory where bcf files with GT are stored)
bcfDir = '/research/rgs01/resgen/legacy/gb_customTracks/tp/files/hg38/sjlife/bcf/INFOGT/'

#sample list files, keyed by population ('ceu', 'yri') and then by cohort
samplePops = {'ceu':
              {
                  'sjlife' : os.path.join(workingDir,'sample.ceu.sjlife.ind'),
                  'ccss' : os.path.join(workingDir,'sample.ceu.ccss.ind'),
                  'sjlife.ccss' : os.path.join(workingDir,'sample.ceu.sjlife.ccss.ind')
              },
              'yri': {'sjlife.ccss' : os.path.join(workingDir,'sample.yri.sjlife.ccss.ind')}
             }
#NOTE(review): this assignment replaces the table above, so every loop in the
#later cells only sees the 'yri' group. Presumably left in from a single-group
#rerun; remove it to process all four groups.
samplePops = {'yri': {'sjlife.ccss' : os.path.join(workingDir,'sample.yri.sjlife.ccss.ind')}}

#sampleMasterFile
sampleMasterFile = '/research/rgs01/resgen/legacy/gb_customTracks/tp/jwang/TASK/survivorship/survivorship_portal_sample'

#check if smartpca can be found from $PATH
#shutil.which searches $PATH directly, so the result does not depend on the
#wording of the shell's "not found" message
EIGCK = shutil.which('smartpca') is not None
if not EIGCK:
    print('smartpca can not be found from your $PATH')
    sys.exit(1)
#PCA output file, keyed the same way as samplePops
PCAFile = {'ceu':
              {
                  'sjlife' : os.path.join(workingDir,'european.sjlife'),
                  'ccss' : os.path.join(workingDir,'european.ccss'),
                  'sjlife.ccss' : os.path.join(workingDir,'european.sjlife.ccss')
              },
              'yri': {'sjlife.ccss' : os.path.join(workingDir,'african.sjlife.ccss')}
          }


## smartpca version: 18140


## 3. generate genotype file

In [2]:
import os,re
import subprocess as sp

#every prerequisite must exist before any job is submitted; otherwise the
#submitted jobs could only fail on the cluster
prerequisites = [
    ('pca.snp', 'Cound not find snp file: "pca.snp"'),
    ('genotypeExtract.py', 'run "Tools used for prs computing" to create genotypeExtract.py'),
    ('genotypeGen.py', 'run "Tools used for prs computing" to create genotypeGen.py'),
]
missingFiles = []
for requiredFile, hint in prerequisites:
    if not os.path.isfile(requiredFile):
        print(hint)
        missingFiles.append(requiredFile)
if missingFiles:
    raise FileNotFoundError('missing required file(s): ' + ', '.join(missingFiles))

#bsub reports the new job as "Job <id> is submitted ..."; capture the id
jobIdPattern = re.compile(r'<(\d+?)>')

#extract genotype from bcf file
extGTJob = {}
for p in samplePops:
    for c in samplePops[p]:
        samfile = samplePops[p][c]
        extGTCommand = 'bsub -q standard'
        extGTCommand += ' -eo ' + os.path.join(workingDir,p + '_' + c + '.extGT.elog')
        extGTCommand += ' -oo ' + os.path.join(workingDir,p + '_' + c + '.extGT.log')
        extGTCommand += ' python3 genotypeExtract.py'
        #pca.snp
        extGTCommand += ' pca.snp'
        #sample
        extGTCommand += ' ' + samfile
        #bcf file dir
        extGTCommand += ' ' + bcfDir
        #sample master file
        extGTCommand += ' ' + sampleMasterFile
        #output file
        extGTCommand += ' ' + samfile + '.gt'
        jobRt = sp.run(extGTCommand, shell=True, stdout=sp.PIPE).stdout.decode('utf-8')
        jobMatch = jobIdPattern.search(jobRt)
        if jobMatch is None:
            #without a job id the dependent job below cannot be chained
            raise RuntimeError('bsub did not report a job id for ' + p + ' ' + c + ': ' + jobRt.strip())
        extGTJob.setdefault(p, {})[c] = jobMatch.group(1)

#generate genotype file
for p in samplePops:
    for c in samplePops[p]:
        samfile = samplePops[p][c]
        genoCommand = 'bsub -q standard'
        genoCommand += ' -eo ' + os.path.join(workingDir,p + '_' + c + '.geno.elog')
        genoCommand += ' -oo ' + os.path.join(workingDir,p + '_' + c + '.geno.log')
        #start only after the matching extraction job has finished successfully
        genoCommand += ' -w "done('+extGTJob[p][c]+')"'
        genoCommand += ' python3 genotypeGen.py'
        #pca.snp
        genoCommand += ' pca.snp'
        #gt file
        genoCommand += ' ' + samfile + '.gt'
        #sample
        genoCommand += ' ' + samfile
        #output file
        genoCommand += ' ' + samfile + '.geno'
        #os.system returns the exit status; non-zero means the bsub command failed
        if os.system(genoCommand) != 0:
            print('job submission FAILED for genotype file: '+p+' '+c)

        

      A default memory request of 2.50 GB has been placed for this job
      The job will be killed if   2.50 GB of memory is used
      A default memory request of 2.50 GB has been placed for this job
      The job will be killed if   2.50 GB of memory is used


Job <174250754> is submitted to queue <standard>.


## 4. run smartpca 

In [3]:
import os,sys

#EIGCK is set by the setup cell (section 2); stop here when smartpca was not found on $PATH
if not EIGCK:
    print('smartpca can not be found from your $PATH')
    sys.exit(1)
    
def OUTPAR(samf,snpf):
    """Write the smartpca parameter file for one sample group.

    samf: path of the sample (.ind) file; every other file name in the
          parameter file is derived from it (<samf>.geno, <samf>.pca.evec,
          <samf>.eval)
    snpf: path of the SNP file

    Returns the path of the parameter file, <samf>.pca.par.
    """
    parfile = samf+'.pca.par'
    #the with-block closes the file even if one of the writes fails
    with open(parfile,'w') as out:
        out.write('genotypename: '+samf+'.geno\n')
        out.write('snpname: ' + snpf + '\n')
        out.write('indivname: ' + samf + '\n')
        out.write('evecoutname: ' + samf + '.pca.evec\n')
        out.write('evaloutname: ' + samf + '.eval\n')
        out.write('altnormstyle: NO\nnumoutevec: 10\nnumoutlieriter: 0\nnumoutlierevec: 10\noutliersigmathresh: 6\nqtmode: 0\n')
    return parfile
#write a parameter file and submit one smartpca job per sample group
for p in samplePops:
    for c in samplePops[p]:
        samfile = samplePops[p][c]
        parfile = OUTPAR(samfile,snpFile)
        pcaCommand = 'bsub -q standard -R "rusage[mem=20000]"'
        pcaCommand += ' -eo ' +samfile+'.pca.elog'
        pcaCommand += ' -oo ' +samfile+'.pca.log'
        pcaCommand += ' -J PCA_'+p+'_'+c
        pcaCommand += ' smartpca -p '+parfile
        #os.system returns the exit status; non-zero means the bsub command
        #failed, so do not report the job as submitted
        if os.system(pcaCommand) != 0:
            print('job submission FAILED for PCA analysis: '+p+' '+c)
            continue
        print('job submmited for PCA analysis: '+p+' '+c)



Job <174273450> is submitted to queue <standard>.
job submmited for PCA analysis: yri sjlife.ccss


## 5. generate PCA file with integer sample ID

In [6]:
import os

if not os.path.isfile('genPCA.py'):
    print('run "Tools used for prs computing" to create genPCA.py')

#submit one genPCA.py job per sample group
for p in samplePops:
    for c in samplePops[p]:
        #the log names and the smartpca output both derive from this group's
        #sample file; it has to be looked up here rather than inherited from
        #whatever an earlier cell last left in samfile
        samfile = samplePops[p][c]
        pcaReformCommand = 'bsub -q standard -R "rusage[mem=20000]"'
        pcaReformCommand += ' -eo ' +samfile+'.pcaReform.elog'
        pcaReformCommand += ' -oo ' +samfile+'.pcaReform.log'
        pcaReformCommand += ' -J PCA_reform_'+p+'_'+c
        #smartpca output
        pcaReformCommand += ' python3 genPCA.py ' + samfile+'.pca.evec'
        #PCA file
        pcaReformCommand += ' ' + PCAFile[p][c]
        #sample master file
        pcaReformCommand += ' ' + sampleMasterFile
        #bcf file directory
        pcaReformCommand += ' ' + bcfDir
        #os.system returns the exit status; non-zero means the bsub command failed
        if os.system(pcaReformCommand) != 0:
            print('job submission FAILED for generating PCA file: '+p+' '+c)
            continue
        print('job submmited for generating PCA file: '+p+' '+c)

Job <174336285> is submitted to queue <standard>.
job submmited for generating PCA file: yri sjlife.ccss


## Tools used for prs computing

In [18]:
%%writefile genotypeGen.py
#!/usr/bin/python3
"""Convert extracted genotypes into the genotype file read by smartpca.

Usage:
    genotypeGen.py <snpfile> <gtfile> <samplefile> <outfile>

snpfile     tab-separated SNP list; column 1 is the SNP name, column 2 the
            chromosome and column 4 the physical position
gtfile      whitespace-separated genotype table: chromosome, position, two
            columns that are ignored here, then one genotype per sample
samplefile  tab-separated sample list; column 1 is the sample ID.  Genotype
            column i of gtfile is attributed to sample i of this file
outfile     one line per called genotype: SNP name, sample ID and number of
            reference alleles, tab-separated
"""

import sys
import re


def load_snp_names(snpfile):
    """Return {"<chromosome>.<position>": SNP name} for every row of snpfile."""
    snpPos2RSID = {}
    with open(snpfile) as snpfh:
        for line in snpfh:
            l = line.replace('\n', '').split('\t')
            snpPos2RSID['.'.join([l[1], l[3]])] = l[0]
    return snpPos2RSID


def ref_allele_count(gt):
    """Return the number of reference ('0') alleles in one genotype call.

    Unphased ("0/1") and phased ("0|1") calls are both accepted.  None is
    returned when any allele is missing ("."), so the caller can leave the
    genotype out instead of recording it as zero reference alleles.
    """
    alleles = re.split(r'[/|]', gt)
    if '.' in alleles:
        return None
    return alleles.count('0')


def main(argv):
    """Read the inputs named in argv and write the genotype file."""
    snpfile = argv[1]
    gtfile = argv[2]
    samplefile = argv[3]
    outfile = argv[4]

    snpPos2RSID = load_snp_names(snpfile)
    with open(samplefile) as samfh:
        samples = [x.strip().split('\t')[0] for x in samfh]

    with open(gtfile) as gtfh, open(outfile, 'w') as out:
        for line in gtfh:
            l = re.split(r'\s+', line.strip())
            # chromosome and position identify the SNP; columns 3-4 are skipped
            rsid = snpPos2RSID['.'.join(l[0:2])]
            for idx, gt in enumerate(l[4:]):
                refCount = ref_allele_count(gt)
                if refCount is None:
                    # a missing genotype is expressed by having no line at all
                    continue
                out.write('\t'.join([rsid, samples[idx], str(refCount)]) + '\n')


if __name__ == '__main__':
    main(sys.argv)

Overwriting genotypeGen.py


In [19]:
%%writefile genotypeExtract.py
#!/usr/bin/python3
"""Extract the genotypes of the PCA SNPs for one sample group from the BCF files.

Usage:
    genotypeExtract.py <snpfile> <samplefile> <bcfDir> <sampleMasterFile> <out>

snpfile           tab-separated SNP list; column 2 is the chromosome
                  ("1" .. "22") and column 4 the physical position
samplefile        tab-separated sample list; column 1 is the sample name
bcfDir            directory holding chr<N>_SJLIFE_CCSS.GT.bcf.gz
sampleMasterFile  one sample name per line, in the same order as the sample
                  columns of the BCF header
out               genotype table written by bcftools query, all chromosomes
                  concatenated

Side files: <samplefile>.intid (kept) and <samplefile>.snppos.<N> (removed
once the chromosome has been processed).
"""

import sys
import os
import subprocess as sp

#chromosomes handled here; a SNP on any other chromosome raises KeyError
CHR = [str(x+1) for x in range(22)]

#one output row per SNP: chromosome, position, REF, ALT, then every sample's genotype
QUERY_FORMAT = '%CHROM\t%POS\t%REF\t%ALT\t[ %GT ]\n'


def write_snp_positions(snpfile, prefix):
    """Write one "<chromosome>\\t<position>" file per chromosome for bcftools -R.

    Returns {chromosome: path}, with paths named <prefix>.snppos.<chromosome>.
    A file is created for every chromosome in CHR, even when it stays empty.
    """
    snpPos = {x: prefix + '.snppos.' + x for x in CHR}
    snpPosOut = {x: open(snpPos[x], 'w') for x in snpPos}
    try:
        with open(snpfile) as snpfh:
            for line in snpfh:
                l = line.replace('\n', '').split('\t')
                snpPosOut[l[1]].write('\t'.join([l[1], l[3]]) + '\n')
    finally:
        for fh in snpPosOut.values():
            fh.close()
    return snpPos


##convert sample name to intID
def SAM2ID(bcf_dir, samfile):
    """Return {sample name: integer ID used in the BCF files}.

    Names come from samfile (one per line) and IDs from the sample columns of
    the chr1 BCF header; the two are paired by position.
    """
    with open(samfile) as fh:
        samples = [x.strip() for x in fh]
    chr1bcf = os.path.join(bcf_dir, 'chr1_SJLIFE_CCSS.GT.bcf.gz')
    header = sp.run(['bcftools', 'view', '-h', chr1bcf],
                    stdout=sp.PIPE, check=True).stdout.decode('utf-8')
    columns = []
    for hline in header.splitlines():
        if hline.startswith('#CHROM'):
            columns = hline.strip().split('\t')
    # the first nine columns of the #CHROM line are fixed VCF fields
    intID = columns[9:]
    if not intID:
        raise RuntimeError('no sample columns found in the header of ' + chr1bcf)
    return dict(zip(samples, intID))


def main(argv):
    """Run the extraction for the files named in argv."""
    snpfile = argv[1]
    samplefile = argv[2]
    bcfDir = argv[3]
    sampleMasterFile = argv[4]
    out = argv[5]

    #generate snp position file for bcftools query
    snpPos = write_snp_positions(snpfile, samplefile)
    for e in snpPos:
        print(e, snpPos[e])

    name2id = SAM2ID(bcfDir, sampleMasterFile)
    with open(samplefile) as samfh:
        samples = [x.strip().split('\t')[0] for x in samfh]
    intidfile = samplefile + '.intid'
    with open(intidfile, 'w') as samout:
        for sample in samples:
            samout.write(name2id[sample] + '\n')

    # open in write mode so a rerun replaces the old table instead of
    # appending duplicate rows to it
    with open(out, 'wb') as outfh:
        for e in snpPos:
            # nothing to query for a chromosome without SNPs
            if os.path.getsize(snpPos[e]) > 0:
                bcffile = os.path.join(bcfDir, 'chr' + e + '_SJLIFE_CCSS.GT.bcf.gz')
                # check=True: a failed query stops the script, so the LSF job
                # fails rather than leaving a silently truncated table
                sp.run(['bcftools', 'query', '-S', intidfile, '-R', snpPos[e],
                        '-f', QUERY_FORMAT, bcffile],
                       stdout=outfh, check=True)
            os.remove(snpPos[e])


if __name__ == '__main__':
    main(sys.argv)
    

Overwriting genotypeExtract.py


In [4]:
%%writefile genPCA.py
#!/usr/bin/python3
"""Rewrite a smartpca eigenvector file so samples carry their integer BCF IDs.

Usage:
    genPCA.py <smartpcaOutFile> <outFile> <sampleMasterFile> <bcfDir>

smartpcaOutFile   eigenvector (.evec) file written by smartpca
outFile           tab-separated result: integer sample ID followed by the
                  eigenvector values
sampleMasterFile  one sample name per line, in the same order as the sample
                  columns of the BCF header
bcfDir            directory holding chr1_SJLIFE_CCSS.GT.bcf.gz
"""


import sys
import os
import re
import subprocess as sp


def SAM2ID(bcf_dir, samfile):
    """Return {sample name: integer ID used in the BCF files}.

    Names come from samfile (one per line) and IDs from the sample columns of
    the chr1 BCF header; the two are paired by position.
    """
    with open(samfile) as fh:
        samples = [x.strip() for x in fh]
    chr1bcf = os.path.join(bcf_dir, 'chr1_SJLIFE_CCSS.GT.bcf.gz')
    header = sp.run(['bcftools', 'view', '-h', chr1bcf],
                    stdout=sp.PIPE, check=True).stdout.decode('utf-8')
    columns = []
    for hline in header.splitlines():
        if hline.startswith('#CHROM'):
            columns = hline.strip().split('\t')
    # the first nine columns of the #CHROM line are fixed VCF fields
    intID = columns[9:]
    if not intID:
        raise RuntimeError('no sample columns found in the header of ' + chr1bcf)
    return dict(zip(samples, intID))


def reformat_evec_line(line, name2id):
    """Return one output row for one data line of the eigenvector file.

    The first field (sample name) is replaced by its integer ID and the last
    field is dropped; the remaining fields are joined with tabs.
    """
    l = re.split(r'\s+', line.strip())
    samID = name2id[l[0]]
    return '\t'.join([samID] + l[1:-1])


def main(argv):
    """Convert the eigenvector file named in argv."""
    smartpcaOutFile = argv[1]
    outFile = argv[2]
    sampleMasterFile = argv[3]
    bcfDir = argv[4]

    name2id = SAM2ID(bcfDir, sampleMasterFile)
    with open(smartpcaOutFile) as fh, open(outFile, 'w') as out:
        # the first line is a header, not a sample row
        fh.readline()
        for line in fh:
            out.write(reformat_evec_line(line, name2id) + '\n')


if __name__ == '__main__':
    main(sys.argv)

Writing genPCA.py
