Skip to content

Commit

Permalink
Daily build
Browse files Browse the repository at this point in the history
  • Loading branch information
scikal committed Aug 29, 2021
1 parent 545aa86 commit d02ffd7
Show file tree
Hide file tree
Showing 8 changed files with 610 additions and 315 deletions.
178 changes: 95 additions & 83 deletions ANEUPLOIDY_TEST.py

Large diffs are not rendered by default.

63 changes: 35 additions & 28 deletions DISTANT_ADMIXTURE_MODELS.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
# -*- coding: utf-8 -*-

"""
COMPLEX_ADMIXTURE_MODELS
DISTANT_ADMIXTURE_MODELS
Given reads that originated from the same genomic window and a reference panel
of two populations, the likelihood of observed reads under four scenarios,
namely, monosomy, disomy, SPH and BPH is calculated. This module is for complex
namely, monosomy, disomy, SPH and BPH is calculated. This module is for distant
admixtures, where each descendant haplotype has a certain probability to
originate from one of two ancestral populations.
Expand All @@ -18,7 +18,7 @@
Aug 10, 2021
"""

import pickle, os, sys, bz2, collections
import pickle, os, sys, bz2, collections, gzip

from functools import reduce
from operator import and_, itemgetter
Expand All @@ -37,12 +37,11 @@ def popcount(x):
""" Counts non-zero bits in positive integer. """
return bin(x).count('1')

class complex_admixture:
""" Based on two IMPUTE2 arrays, which contain the legend and haplotypes,
and a dictionary with statisitcal models (models_dict), it allows to
calculate the likelihoods of observed alleles under various statistical
models (monosomy, disomy, SPH and BPH). """

class distant_admixture:
""" Based on the statistical models (models_dict) and the reference panel
(leg_tab, hap_tab and sam_tab), it allows to calculate the likelihoods of
observed alleles under various statistical models (monosomy, disomy, SPH
and BPH). """

def __init__(self, obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture):
""" Initialize the attributes of the class. """
Expand Down Expand Up @@ -101,7 +100,7 @@ def build_hap_dict(self, obs_tab, leg_tab, hap_tab):

fraction_of_matches = 1-mismatches/len(obs_tab)

print('Algorithm for complex admixtures: %.2f%% of the observed alleles matched the reference panel.' % (100*fraction_of_matches))
print('Algorithm for distant admixtures: %.2f%% of the observed alleles matched the reference panel.' % (100*fraction_of_matches))

return hap_dict, fraction_of_matches

Expand Down Expand Up @@ -288,34 +287,42 @@ def get_likelihoods(self, *x):
result = self.likelihoods(*x)
return result

def wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename,admixture):
""" Wrapper function of the class complex_admixture. It receives an observations
file, IMPUTE2 legend file, IMPUTE2 haplotypes file, IMPUTE2 samples file,
and a file with four statistical models. Based on the given data it creates
and returns an instance of the class. """

from MAKE_OBS_TAB import read_impute2
def wrapper_of_distant_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename,admixture):
""" Wrapper function of the class 'distant_admixture'. It receives an
observations file, legend file, haplotypes file, samples file and a file
with the statistical models. Based on the given data it creates and returns
an instance of the class. """

if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.')
if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.')
if not os.path.isfile(hap_filename): raise Exception('Error: HAP file does not exist.')
if not os.path.isfile(sample_filename): raise Exception('Error: SAMPLE file does not exist.')
if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.')

leg_tab = read_impute2(leg_filename, filetype='leg')
hap_tab, total_number_of_haplotypes = read_impute2(hap_filename, filetype='hap')
sam_tab = read_impute2(sample_filename, filetype='sam')
load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension.

open_hap = load(hap_filename)
with open_hap(hap_filename,'rb') as hap_in:
hap_tab, total_number_of_haplotypes = pickle.load(hap_in)

open_leg = load(leg_filename)
with open_leg(leg_filename,'rb') as leg_in:
leg_tab = pickle.load(leg_in)

open_samp = load(sample_filename)
with open_samp(sample_filename,'rb') as samp_in:
sam_tab = pickle.load(samp_in)

load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open
with load_obs(obs_filename, 'rb') as f:
obs_tab = pickle.load(f)
open_obs = load(obs_filename)
with open_obs(obs_filename, 'rb') as obs_in:
obs_tab = pickle.load(obs_in)
#info = pickle.load(f)

load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open
with load_model(models_filename, 'rb') as f:
models_dict = pickle.load(f)
open_model = load(models_filename)
with open_model(models_filename, 'rb') as model_in:
models_dict = pickle.load(model_in)

return complex_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture)
return distant_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes, admixture)

if __name__ != "__main__":
print('The module COMPLEX_ADMIXTURE_MODELS was imported.')
Expand All @@ -341,7 +348,7 @@ def wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_fil
models_filename = 'MODELS/MODELS16.p'
admixture = admix_tuple('EUR',0.8)
A = wrapper_of_complex_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename,admixture)
A = wrapper_of_distant_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename,admixture)
alleles = tuple(A.hap_dict.keys())
Expand Down
119 changes: 119 additions & 0 deletions EXTRACT_GENOTYPES.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
EXTRACT_GENOTYPES
Simulates an observation table, obs_tab of a haploid, using phased genotypes from a LD-PGTA reference panel.
Daniel Ariad (daniel@ariad.org)
Jan 13, 2021
"""
import pickle, os, sys, time, argparse, random, gzip, collections

# Row type of the legend table.
leg_tuple = collections.namedtuple('leg_tuple', 'chr_id pos ref alt')

# Row type of the samples table.
sam_tuple = collections.namedtuple('sam_tuple', 'sample_id group1 group2 sex')

# Row type of the observations table.
obs_tuple = collections.namedtuple('obs_tuple', 'pos read_id base')

def get_haplotypes(sample_filename, hap_filename, sample_id):
    """ Extracts the pair of haplotypes that corresponds to a specific sample ID.

    Parameters
    ----------
    sample_filename : str
        Gzip-compressed pickle that holds a sequence of records, each with a
        `sample_id` attribute.
    hap_filename : str
        Gzip-compressed pickle that holds the pair (hap_tab, number_of_haplotypes).
        Each row of hap_tab is a non-negative integer in which every sample
        occupies two consecutive bits; the last sample occupies the two
        lowest bits.
    sample_id : str
        The sample to look up. When the ID appears more than once, the last
        occurrence is used.

    Returns
    -------
    list of (int, int)
        One (allele1, allele2) tuple of 0/1 values per row of hap_tab.

    Raises
    ------
    Exception
        If sample_id is absent from the samples table.

    NOTE: pickle.load can execute arbitrary code; only load reference panels
    that come from a trusted source.
    """

    with gzip.open(sample_filename, 'rb') as sam_in:
        SAM = pickle.load(sam_in)

    samples = [s.sample_id for s in SAM]

    # Position of the sample counted from the END of the samples table, since
    # the last sample is stored in the least significant bits of each row.
    try:
        ind = samples[::-1].index(sample_id)
    except ValueError:
        raise Exception('Error: sample_id not found.') from None

    with gzip.open(hap_filename, 'rb') as hap_in:
        hap_tab, _ = pickle.load(hap_in)

    # Shifting and masking reads the two bits directly. Unlike slicing the
    # string returned by bin(), it is not affected by the leading zeros that
    # bin() strips and does not build a long string for every row.
    low_bit = 2 * ind
    high_bit = low_bit + 1

    return [((h >> high_bit) & 1, (h >> low_bit) & 1) for h in hap_tab]

def extract(leg_filename,hap_filename,samp_filename,chr_id,sample_id,**kwargs):
    """ Builds observation tables of effective haploids by extracting
    phased genotypes from a LD-PGTA reference panel.

    Parameters
    ----------
    leg_filename : str
        Gzip-compressed pickle of the legend table, whose rows unpack as
        (chr_id, pos, ref, alt).
    hap_filename, samp_filename : str
        Haplotypes and samples files, passed on to get_haplotypes.
    chr_id : str
        Only legend rows with this chromosome ID are used.
    sample_id : str
        The sample whose haplotypes are extracted.
    genotypes : str, optional keyword
        'A', 'B' or 'AB' (default); selects which haplotypes are written.
        Any other value results in no output file.
    output_dir : str, optional keyword
        Directory of the output files; created when missing. The default,
        an empty string, means the current working directory.

    Each output file holds two consecutive pickles: the observation table
    (a tuple of obs_tuple) followed by an info dictionary.

    Returns
    -------
    int
        Always 0, so the value can be handed to sys.exit.
    """

    time0 = time.time()

    # NOTE(review): nothing below draws random numbers; the reseeding is kept
    # only so that the state of the global generator after the call is
    # unchanged with respect to previous versions.
    random.seed(None,version=2)

    genotypes = kwargs.get('genotypes', 'AB')

    output_dir = kwargs.get('output_dir', '')
    if output_dir != '':
        os.makedirs(output_dir, exist_ok=True)

    haplotypes = get_haplotypes(samp_filename, hap_filename, sample_id)

    with gzip.open(leg_filename,'rb') as leg_in:
        legend = pickle.load(leg_in)

    info = {'chr_id': chr_id,
            'depth': 1,
            'read_length': 1,
            'sample_id': sample_id}

    # Haplotype 'A' is the first allele of every genotype and 'B' the second.
    for column, label in enumerate('AB'):
        if genotypes not in (label, 'AB'):
            continue

        obs_tab = tuple(obs_tuple(pos, 'XXX', alt if alleles[column] else ref)
                        for (chrID,pos,ref,alt), alleles in zip(legend,haplotypes)
                        if chr_id==chrID)

        # os.path.join leaves the bare filename untouched when output_dir is
        # empty; appending '/' to an empty string pointed at the root directory.
        obs_filename = os.path.join(output_dir, sample_id + '%s.%s.hg38.obs.p' % (label, chr_id))

        with open(obs_filename, 'wb') as binfile:
            pickle.dump(obs_tab, binfile, protocol=4)
            pickle.dump({**info, 'haplotype': label}, binfile, protocol=4)

    time1 = time.time()
    print('Done in %.3f sec.' % ((time1-time0)))

    return 0

if __name__ == "__main__":

    # Command-line entry point: every parsed option is forwarded to extract()
    # as a keyword argument, and its return value becomes the exit status.
    parser = argparse.ArgumentParser( description='Simulates two observation tables of haploids, using phased genotypes from a LD-PGTA reference panel. ')

    parser.add_argument('leg_filename', metavar='legend_filename', type=str,
                        help='IMPUTE2 legend file')
    parser.add_argument('hap_filename', metavar='haplotypes_filename', type=str,
                        help='IMPUTE2 haplotypes file')
    parser.add_argument('samp_filename', metavar='samples_filename', type=str,
                        help='IMPUTE2 samples file')
    parser.add_argument('chr_id', metavar='chromosomeID', type=str,
                        help='Chromosome ID')
    parser.add_argument('sample_id', metavar='sampleID', type=str,
                        help='Sample ID')
    parser.add_argument('-g', '--genotypes', metavar='A/B/AB', type=str, default='AB',
                        help='Which of the individual\'s haplotypes should be used. For each specified haplotype, one haploid would be generated. Default is both (AB).')
    # extract() already accepts output_dir as a keyword; the empty default is
    # the same value extract() falls back to when the keyword is absent.
    parser.add_argument('-o', '--output-dir', dest='output_dir', metavar='PATH', type=str, default='',
                        help='The directory in which the output files would be placed.')

    args = parser.parse_args()
    sys.exit(extract(**vars(args)))


def test():
    """ Runs extract for sample HG00097 on chr21 of the EUR panel, placing
    the output in the directory results_TEMP. Returns whatever extract returns. """
    chr_id = 'chr21'
    panel_files = {
        'leg_filename': f'EUR_panel.hg38/{chr_id:s}_EUR_panel.legend.gz',
        'hap_filename': f'EUR_panel.hg38/{chr_id:s}_EUR_panel.hap.gz',
        'samp_filename': 'EUR_panel.hg38/EUR_panel.samples.gz',
    }
    return extract(**panel_files,
                   chr_id=chr_id,
                   sample_id='HG00097',
                   output_dir='results_TEMP')
Loading

0 comments on commit d02ffd7

Please sign in to comment.