From d02ffd7a4522a94c3556001ef92998e82cce99bc Mon Sep 17 00:00:00 2001
From: Daniel Ariad <daniel@ariad.org>
Date: Sun, 29 Aug 2021 00:11:29 -0400
Subject: [PATCH] Daily build

---
 ANEUPLOIDY_TEST.py          | 178 +++++++++++++++++++-----------------
 DISTANT_ADMIXTURE_MODELS.py |  63 +++++++------
 EXTRACT_GENOTYPES.py        | 119 ++++++++++++++++++++++++
 F1_ADMIXTURE_MODELS.py      | 157 ++++++++++++++++---------------
 HOMOGENOUES_MODELS.py       |  38 ++++----
 MAKE_OBS_TAB.py             |  51 ++---------
 MAKE_REF_PANEL.py           | 175 +++++++++++++++++++++++++++++++++++
 MIX_HAPLOIDS.py             | 144 ++++++++++++++---------------
 8 files changed, 610 insertions(+), 315 deletions(-)
 create mode 100644 EXTRACT_GENOTYPES.py
 create mode 100644 MAKE_REF_PANEL.py

diff --git a/ANEUPLOIDY_TEST.py b/ANEUPLOIDY_TEST.py
index b34cda2..7a53cde 100644
--- a/ANEUPLOIDY_TEST.py
+++ b/ANEUPLOIDY_TEST.py
@@ -15,10 +15,9 @@
 """
 import collections, time, pickle, argparse, re, sys, random, os, bz2, gzip
-from MAKE_OBS_TAB import read_impute2
 from HOMOGENOUES_MODELS import homogeneous
 from F1_ADMIXTURE_MODELS import f1_admixture
-from COMPLEX_ADMIXTURE_MODELS import complex_admixture
+from DISTANT_ADMIXTURE_MODELS import distant_admixture
 
 from itertools import product, starmap
@@ -53,20 +52,20 @@ def comb(n, k):
     for t in range(min(k, n-k)):
         b *= n
         b //= t+1
-        n -= 1
+        n -= 1
     return b
 
 def mean_and_var(data):
     """ Calculates the mean and variance. """
     m = mean(data)
     var = variance(data, xbar=m)
-    return m, var
+    return m, var
 
 def mean_and_std(data):
     """ Calculates the mean and population standard deviation. """
     m = mean(data)
     std = pstdev(data, mu=m)
-    return m, std
+    return m, std
 
 def summarize(M,V):
     """ Calculates chromosome-wide statistics of the LLRs """
@@ -80,13 +79,13 @@ def LLR(y,x):
     if x and y:
         result = log(y/x)
     elif x and not y:
-        result = -1.23456789
+        result = -1.23456789
     elif not x and y:
-        result = +1.23456789
+        result = +1.23456789
     elif not x and not y:
-        result = 0
+        result = 0
     else:
-        result = None
+        result = None
     return result
 
 def invert(x,n):
@@ -99,24 +98,24 @@ def build_combined(leg_tab,hap_tab):
     panel. """
     combined = {pos: comb_tuple(ref,alt,hap) for (chr_id,pos,ref,alt),hap in zip(leg_tab, hap_tab)}
     return combined
-    
+
 def build_reads_dict(obs_tab,combined_dict):
     """ Returns a dictionary that lists read IDs of reads that overlap with
         SNPs and gives the alleles in each read. """
 
     reads = collections.defaultdict(list)
-    
+
     for pos, read_id, base in obs_tab:
         if pos in combined_dict and (base==combined_dict[pos].ref or base==combined_dict[pos].alt):
             reads[read_id].append((pos,base))
-    
+
     return reads
-    
+
 def build_score_dict(reads_dict,combined_dict,number_of_haplotypes,min_HF):
     """ Returns a dictionary that lists read IDs and gives their score. The scoring
     algorithm scores each read according to the number of different haplotypes
     that the reference panel supports at the chromosomal region that overlaps
-    with the read. Only biallelic SNPs with a minor allele frequency above
+    with the read. Only biallelic SNPs with a minor allele frequency above 
     0.01 are considered for the calculation, since they are unlikely to affect
     the score. In addition, only haplotypes with a frequency between min_HF
     and 1-min_HF add to the score of a read. """
@@ -128,8 +127,8 @@ def build_score_dict(reads_dict,combined_dict,number_of_haplotypes,min_HF):
 
     for read_id in reads_dict:
         haplotypes = ((combined_dict[pos].hap, combined_dict[pos].hap ^ b)
                          for pos,base in reads_dict[read_id]
-                         if 0.01 <= popcount(combined_dict[pos].hap)/N <= 0.99) #Include only biallelic SNPs with MAF of at least 0.01. Also, ^b flips all bits of the binary number hap_tab[ind] using the bitwise XOR operator.
-    
+                         if 0.01 <= popcount(combined_dict[pos].hap)/N <= 0.99) #Include only biallelic SNPs with MAF of at least 0.01. Also, ^b flips all bits of the binary number hap_tab[ind] using the bitwise XOR operator.
+
         score_dict[read_id] = sum(min_HF <= popcount(reduce(and_,hap))/N <= (1-min_HF)
                                       for hap in product(*haplotypes) if len(hap)!=0)
 
@@ -138,7 +137,7 @@ def build_aux_dict(obs_tab,combined_dict):
     """ Returns a dictionary that lists chromosome positions of SNPs and gives
         a list of read IDs for all the reads that overlap with the SNP. """
-    aux_dict = collections.defaultdict(list) ### aux_dict is a
+    aux_dict = collections.defaultdict(list) ### aux_dict maps each SNP position to the read IDs of the overlapping reads.
     for pos, read_id, base in obs_tab:
         if pos in combined_dict and (base==combined_dict[pos].ref or base==combined_dict[pos].alt):
             aux_dict[pos].append(read_id)
@@ -153,16 +152,16 @@
     max_dist = 100000 #maximal distance between consecutive observed alleles.
     max_win_size = 350000 #maximal genomic window size
     initial_win_size = 50000 #initial genomic window size
-    
+
     adaptive, window_size = (False, int(window_size)) if window_size else (True, initial_win_size)
     offset = int(offset)
 
     aux_dict = build_aux_dict(obs_tab,combined_dict)
-    
+
     first, last = obs_tab[0].pos + offset, obs_tab[-1].pos + window_size
     a, b, readIDs_in_window = first, first+window_size, set()
-    
+
     for pos, overlapping_reads in aux_dict.items():
-        if pos max_reads:
        eff_subsamples = min(comb(num_of_reads,max_reads),subsamples)
    elif min_reads <= num_of_reads <= max_reads:
        eff_subsamples = min(num_of_reads,subsamples)
    else:
        eff_subsamples = 0
-    
+
    return eff_subsamples
-    
+
def bootstrap(obs_tab, leg_tab, hap_tab, sam_tab, number_of_haplotypes, models_dict, window_size, subsamples, offset, min_reads, max_reads, minimal_score, min_HF, ancestral_proportion):
    """ Applies a bootstrap approach in which: (i) the resample size is
        smaller than the sample size and (ii) resampling is done without
        replacement. """
-    
+
    min_reads = max(min_reads,3) # Due to the bootstrap approach, min_reads must be at least 3.
-    max_reads = max(max_reads,2) # Our statistical models require at least 2 reads.
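[Editor's sketch.] The scoring machinery above leans on a bit-packed representation of the reference panel: for every SNP, combined_dict stores an integer whose i-th bit says whether haplotype i carries the alternative allele, so popcount(hap)/N is an allele frequency and AND-ing the masks of several SNPs gives the frequency of a joint haplotype. A minimal, self-contained illustration of that idea (hap_A, hap_B and N are toy values, not taken from the patch):

from functools import reduce
from operator import and_

def popcount(x):
    """ Counts non-zero bits in a positive integer. """
    return bin(x).count('1')

N = 8               # number of haplotypes in the toy panel
hap_A = 0b10110100  # haplotypes that carry the ALT allele at SNP A
hap_B = 0b10100110  # haplotypes that carry the ALT allele at SNP B

freq_A = popcount(hap_A) / N                           # allele frequency: 4/8
joint_AB = popcount(reduce(and_, (hap_A, hap_B))) / N  # joint haplotype frequency: 3/8
print(freq_A, joint_AB)

This is why build_score_dict can apply its MAF filter with a single popcount per SNP and score a read with one AND per candidate haplotype combination.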
+ combined_dict = build_combined(leg_tab, hap_tab) reads_dict = build_reads_dict(obs_tab, combined_dict) score_dict = build_score_dict(reads_dict, combined_dict, number_of_haplotypes, min_HF) - windows_dict = dict(iter_windows(obs_tab, combined_dict, score_dict, window_size, offset, min_reads, max_reads, minimal_score)) - + windows_dict = dict(iter_windows(obs_tab, combined_dict, score_dict, window_size, offset, min_reads, max_reads, minimal_score)) + ancestry = {row.group2 for row in sam_tab} if len(ancestry)==2 and 016 else ('MODELS16.p' if max_reads>12 else 'MODELS12.p'))) ancestral_proportion = (lambda x,y: admix_tuple(str(x),float(y)))(*kwargs.get('ancestral_proportion',('None','-1'))) - Open = {'bz2': bz2.open, 'gzip': gzip.open}.get(obs_filename.rpartition('.')[-1], open) - with Open(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) - info = pickle.load(f) - - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, number_of_haplotypes = read_impute2(hap_filename, filetype='hap') - sam_tab = read_impute2(sam_filename, filetype='sam') - + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + + open_hap = load(hap_filename) + with open_hap(hap_filename,'rb') as hap_in: + hap_tab, number_of_haplotypes = pickle.load(hap_in) + + open_leg = load(leg_filename) + with open_leg(leg_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) + + + open_samp = load(samp_filename) + with open_samp(samp_filename,'rb') as samp_in: + sam_tab = pickle.load(samp_in) + + open_obs = load(obs_filename) + with open_obs(obs_filename, 'rb') as obs_in: + obs_tab = pickle.load(obs_in) + info = pickle.load(obs_in) + + open_model = load(models_filename) + with open_model(models_filename, 'rb') as model_in: + models_dict = pickle.load(model_in) + + ancestry = {row.group2 for row in sam_tab} if len(ancestry)>2: print('warning: individuals in the sample file are associated with more than two populations.') - - load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open - with load_model(models_filename, 'rb') as f: - models_dict = pickle.load(f) + likelihoods, windows_dict, matched_alleles = bootstrap(obs_tab, leg_tab, hap_tab, sam_tab, number_of_haplotypes, models_dict, window_size, subsamples, offset, min_reads, max_reads, minimal_score, min_HF, ancestral_proportion) - + some_statistics = {'matched_alleles': matched_alleles, 'runtime': time.time()-time0} - + info.update({'ancestry': ancestry, 'window_size': window_size, 'subsamples': subsamples, @@ -334,18 +346,18 @@ def aneuploidy_test(obs_filename,leg_filename,hap_filename,sam_filename, 'min_HF': min_HF, 'statistics': {**statistics(likelihoods,windows_dict), **some_statistics} }) - + if len(ancestry)==2 and 02: - BPH += sum( sum(F[B0] * sum(( (F[B1] * G[B2] + G[B1] * F[B2]) / M + G[B1] * G[B2] / N) + BPH += sum( sum(F[B0] * sum(( (F[B1] * G[B2] + G[B1] * F[B2]) / M + G[B1] * G[B2] / N) for (B1, B2) in C[B0]) for B0 in C) * A0 / A1 for (A0, A1), C in models['BPH'][3].items()) / (6 * M * N) - - BPH += sum( sum(G[B0] * sum(((F[B1] * G[B2] + G[B1] * F[B2]) / N + F[B1] * F[B2] / M) + + BPH += sum( sum(G[B0] * sum(((F[B1] * G[B2] + G[B1] * F[B2]) / N + F[B1] * F[B2] / M) for (B1, B2) in C[B0]) for B0 in C) * A0 / A1 for (A0, A1), C in models['BPH'][3].items()) / (6 * M * N) ### SPH ### (((A0, A1),((B0,),)),) = models['SPH'][1].items() - SPH = (F[B0] / M + G[B0] / N) * A0 / ( 2 * A1 ) + SPH = (F[B0] / M + G[B0] / N) * A0 / ( 2 * A1 ) SPH += 
sum( sum((F[B0] * G[B1] + G[B0] * F[B1]) for (B0, B1) in C) * A0 / A1
                   for (A0, A1), C in models['SPH'][2].items()) / ( 2 * M * N)
-        
+
         ### DIPLOIDY ###
         (((A0, A1),((B0,),)),) = models['DISOMY'][1].items()
         DISOMY = ( F[B0] / M + G[B0] / N ) * A0 / ( 2 * A1 )
@@ -213,18 +213,18 @@ def likelihoods(self, *alleles):
 
         ### MONOSOMY ###
         ((B0,),) = models['MONOSOMY'][1][(1,1)]
-        MONOSOMY = ( F[B0] / M + G[B0] / N ) / 2
-        
+        MONOSOMY = ( F[B0] / M + G[B0] / N ) / 2
+
         result = (MONOSOMY, DISOMY, SPH, BPH)
         return result
-    
+
     def likelihoods2(self, *alleles):
         """ Calculates the likelihood to observe two alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, ab = F[1], F[2], F[3]
         A, B, AB = G[1], G[2], G[3]
         BPH = (2*(b*a+B*A)+3*(AB+ab)+4*(b*A+B*a))/18 #The likelihood of three unmatched haplotypes. #V
@@ -236,9 +236,9 @@ def likelihoods2(self, *alleles):
     def likelihoods3(self, *alleles):
         """ Calculates the likelihood to observe three alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, ab, c, ac, bc, abc = F[1], F[2], F[3], F[4], F[5], F[6], F[7]
         A, B, AB, C, AC, BC, ABC = G[1], G[2], G[3], G[4], G[5], G[6], G[7]
 
@@ -248,23 +248,23 @@ def likelihoods3(self, *alleles):
         DISOMY = (abc+ab*C+ac*B+bc*A+ABC+AB*c+AC*b+BC*a)/8 #The likelihood of diploidy. #V
         MONOSOMY = (abc+ABC)/2 #The likelihood of monosomy. #V
         return MONOSOMY, DISOMY, SPH, BPH
-    
+
     def likelihoods4(self, *alleles):
         """ Calculates the likelihood to observe four alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, c, d = F[1], F[2], F[4], F[8]
         ab, ac, ad, bc, bd, cd = F[3], F[5], F[9], F[6], F[10], F[12]
         abc, abd, acd, bcd = F[7], F[11], F[13], F[14]
         abcd = F[15]
-        
+
         A, B, C, D = G[1], G[2], G[4], G[8]
         AB, AC, AD, BC, BD, CD = G[3], G[5], G[9], G[6], G[10], G[12]
         ABC, ABD, ACD, BCD = G[7], G[11], G[13], G[14]
         ABCD = G[15]
-        
+
         BPH = (2*(AB*CD+AC*BD+AD*BC+
                A*(BCD+B*cd+b*CD+b*cd+C*bd+c*BD+c*bd+D*bc+d*BC+d*bc)+
                B*(ACD+C*ad+c*AD+c*ad+D*ac+d*AC+d*ac)+
@@ -280,12 +280,12 @@ def likelihoods4(self, *alleles):
         DISOMY = (abcd+abc*D+bcd*A+acd*B+abd*C+ab*CD+ad*BC+ac*BD+ABCD+ABC*d+BCD*a+ACD*b+ABD*c+AB*cd+AD*bc+AC*bd)/16 #The likelihood of diploidy. #V
         MONOSOMY = (abcd+ABCD)/2 #The likelihood of monosomy. #V
         return MONOSOMY, DISOMY, SPH, BPH
-    
+
     def get_likelihoods(self, *x):
-        """ Uses the optimal function to calculate the likelihoods.
         In general, self.likelihoods can accept fewer than five alleles, but the
         dedicated functions are optimized for a specific number of alleles.
""" - + l = len(x) if l==2: result = self.likelihoods2(*x) @@ -293,17 +293,15 @@ def get_likelihoods(self, *x): result = self.likelihoods3(*x) elif l==4: result = self.likelihoods4(*x) - else: + else: result = self.likelihoods(*x) return result - -def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename): - """ Wrapper function of the class f1_admixture. It receives an observations - file, IMPUTE2 legend file, IMPUTE2 haplotypes file, IMPUTE2 samples file, - and a file with four statistical models. Based on the given data it creates - and returns an instance of the class. """ - from MAKE_OBS_TAB import read_impute2 +def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename): + """ Wrapper function of the class 'f1_admixture'. It receives an observations + file, legend file, haplotypes file, samples file and a file with the + statistical models. Based on the given data it creates and returns an + instance of the class. """ if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.') if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.') @@ -311,18 +309,29 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename if not os.path.isfile(sample_filename): raise Exception('Error: SAMPLE file does not exist.') if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.') - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, total_number_of_haplotypes = read_impute2(hap_filename, filetype='hap') - sam_tab = read_impute2(sample_filename, filetype='sam') - - load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open - with load_obs(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) - #info = pickle.load(f) + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. 
+
+    open_hap = load(hap_filename)
+    with open_hap(hap_filename,'rb') as hap_in:
+        hap_tab, total_number_of_haplotypes = pickle.load(hap_in)
+
+    open_leg = load(leg_filename)
+    with open_leg(leg_filename,'rb') as leg_in:
+        leg_tab = pickle.load(leg_in)
+
 
-    load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open
-    with load_model(models_filename, 'rb') as f:
-        models_dict = pickle.load(f)
+    open_samp = load(sample_filename)
+    with open_samp(sample_filename,'rb') as samp_in:
+        sam_tab = pickle.load(samp_in)
+
+    open_obs = load(obs_filename)
+    with open_obs(obs_filename, 'rb') as obs_in:
+        obs_tab = pickle.load(obs_in)
+        #info = pickle.load(obs_in)
+
+    open_model = load(models_filename)
+    with open_model(models_filename, 'rb') as model_in:
+        models_dict = pickle.load(model_in)
 
     return f1_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes)
 
@@ -348,7 +357,7 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     leg_filename = '../build_reference_panel/EAS_EUR_panel.hg38.BCFtools/chr6_EAS_EUR_panel.legend.gz'
     sam_filename = '../build_reference_panel/samples_per_panel/EAS_EUR_panel.samples'
     models_filename = 'MODELS/MODELS16.p'
-    
+
     A = wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename)
 
     alleles = tuple(A.hap_dict.keys())
@@ -365,7 +374,7 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     x = random.randrange(len(alleles)-16)
     haplotypes = (alleles[x:x+4],alleles[x+4:x+8],alleles[x+8:x+12],alleles[x+12:x+16])
 
-    print('-----joint_frequencies_combo-----')
+    print('-----joint_frequencies_combo-----')
     print(frequencies0(alleles[x+0]))
     print(frequencies0(*alleles[x:x+4]))
     print(frequencies1(alleles[x+0]))
@@ -399,4 +408,4 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     print('Done in %.3f sec.' % ((t1-t0)))
 
-"""
\ No newline at end of file
+"""
diff --git a/HOMOGENOUES_MODELS.py b/HOMOGENOUES_MODELS.py
index 68c341e..1c3393b 100644
--- a/HOMOGENOUES_MODELS.py
+++ b/HOMOGENOUES_MODELS.py
@@ -16,7 +16,7 @@
 Dec 21, 2020
 """
 
-import pickle, os, sys, bz2, collections
+import pickle, os, sys, bz2, collections, gzip
 
 from functools import reduce
 from operator import and_, itemgetter
@@ -35,10 +35,9 @@ def popcount(x):
     return bin(x).count('1')
 
 class homogeneous:
-    """ Based on two IMPUTE2 arrays, which contain the legend and haplotypes,
-    and a dictionary with statistical models (models_dict), it allows to
-    calculate the likelihoods of observed alleles under various statistical
-    models (monosomy, disomy, SPH and BPH). """
+    """ Based on the statistical models (models_dict) and the reference panel
+    (leg_tab, hap_tab and sam_tab), it allows calculating the likelihoods of
+    observed alleles under various statistical models (monosomy, disomy, SPH and BPH). """
 
     def __init__(self, obs_tab, leg_tab, hap_tab, sam_tab, models_dict, number_of_haplotypes):
@@ -241,29 +240,34 @@ def get_likelihoods(self, *x):
         return result
 
 def wrapper_of_homogenoues_for_debugging(obs_filename,leg_filename,hap_filename,models_filename):
-    """ Wrapper function of the class homogeneous. It receives an observations
-    file, IMPUTE2 legend file, IMPUTE2 haplotypes file, and a file with four
+    """ Wrapper function of the class 'homogeneous'. It receives an observations
+    file, legend file, haplotypes file and a file with the
     statistical models. Based on the given data it creates and returns an
     instance of the class. """
""" - from MAKE_OBS_TAB import read_impute2 - if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.') if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.') if not os.path.isfile(hap_filename): raise Exception('Error: HAP file does not exist.') if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.') - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, number_of_haplotypes = read_impute2(hap_filename, filetype='hap') + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + + open_hap = load(hap_filename) + with open_hap(hap_filename,'rb') as hap_in: + hap_tab, number_of_haplotypes = pickle.load(hap_in) + + open_leg = load(leg_filename) + with open_leg(leg_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) - load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open - with load_obs(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) + open_obs = load(obs_filename) + with open_obs(obs_filename, 'rb') as obs_in: + obs_tab = pickle.load(obs_in) #info = pickle.load(f) - load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open - with load_model(models_filename, 'rb') as f: - models_dict = pickle.load(f) + open_model = load(models_filename) + with open_model(models_filename, 'rb') as model_in: + models_dict = pickle.load(model_in) return homogeneous(obs_tab, leg_tab, hap_tab, None, models_dict, number_of_haplotypes) diff --git a/MAKE_OBS_TAB.py b/MAKE_OBS_TAB.py index 95e9c30..19626da 100644 --- a/MAKE_OBS_TAB.py +++ b/MAKE_OBS_TAB.py @@ -4,8 +4,8 @@ MAKE_OBS_TAB This script extracts single base observations at SNP positions from a given -sequence. It requires an aligned and sorted BAM file with the sequence, as well -as an IMPUTE2 legend format, which contains the SNPs positions. +sequence. It requires a BAM file that is sorted according to position, as well +as a legend format, which contains the SNPs positions. The observed alleles, together with their associated read ID, chromosome position and line number in the legend file, are organized in a table. @@ -24,42 +24,6 @@ except ModuleNotFoundError: print('Caution: The module pysam is missing.') -def read_impute2(filename,**kwargs): - """ Reads an IMPUTE2 file format (LEGEND/HAPLOTYPE/SAMPLE) and builds a list - of lists, containing the dataset. 
""" - - filetype = kwargs.get('filetype', None) - - def leg_format(line): - rs_id, pos, ref, alt = line.strip().split() - return leg_tuple('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt) - - def sam_format(line): - sample_id, group1, group2, sex = line.strip().split(' ') - return sam_tuple(sample_id, group1, group2, int(sex)) - - with (gzip.open(filename,'rt') if filename[-3:]=='.gz' else open(filename, 'r')) as impute2_in: - if filetype == 'leg': - impute2_in.readline() # Bite off the header - result = tuple(map(leg_format,impute2_in)) - - elif filetype == 'hap': - firstline = impute2_in.readline() # Get first line - a0 = int(firstline.replace(' ', ''), 2) - a1 = (int(line.replace(' ', ''), 2) for line in impute2_in) - hap_tab = (a0, *a1) - number_of_haplotypes = len(firstline.strip().split()) - result = hap_tab, number_of_haplotypes - - elif filetype == 'sam': - impute2_in.readline() # Bite off the header - result = tuple(map(sam_format,impute2_in)) - - else: - result = tuple(line.strip().split() for line in impute2_in) - - return result - def save_obs(obs_tab,info,compress,bam_filename,output_filename,output_dir): """ Saves the observations table together with information about the chromosome number, depth of coverage, and flags that were used. @@ -92,7 +56,12 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob try: genome_reference = pysam.FastaFile(fasta_filename) if fasta_filename!='' else None samfile = pysam.AlignmentFile(bam_filename, 'rb' ) - leg_tab = read_impute2(legend_filename, filetype='leg') + + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + open_leg = load(legend_filename) + with open_leg(legend_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) + if next(zip(*leg_tab)).count(leg_tab[0][0])!=len(leg_tab): raise Exception('Error: Unsuitable legend file. All SNP positions should refer to the same chr_id.') @@ -163,9 +132,9 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob parser = argparse.ArgumentParser( description='Builds a table of single base observations at known SNP positions.') parser.add_argument('bam_filename', metavar='BAM_FILENAME', type=str, - help='BAM file') + help='A BAM file sorted by position.') parser.add_argument('legend_filename', metavar='LEG_FILENAME', type=str, - help='IMPUTE2 legend file') + help='A legend file of reference panel.') parser.add_argument('-f','--fasta_filename', type=str,metavar='FASTA_FILENAME', default='', help='The faidx-indexed reference file in the FASTA format. ' 'Supplying a reference file will reduce false SNPs caused by misalignments using the Base Alignment Quality (BAQ) method described in the paper “Improving SNP discovery by base alignment quality”, Heng Li, Bioinformatics, Volume 27, Issue 8.') diff --git a/MAKE_REF_PANEL.py b/MAKE_REF_PANEL.py new file mode 100644 index 0000000..51e6939 --- /dev/null +++ b/MAKE_REF_PANEL.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +MAKE_REF_PANEL + +This script creates reference panels for LD-PGTA, using genotype calls in VCF +files. The reference panels of LD-PGTA have a similar structure to the IMPUTE2 +format. 
+ +Daniel Ariad (daniel@ariad.org) +AUG 27, 2022 +""" + +import sys, os, time, random, argparse, re, pickle, gzip, bz2, collections, itertools + +leg_tuple = collections.namedtuple('leg_tuple', ('chr_id', 'pos', 'ref', 'alt')) #Encodes the rows of the legend table +sam_tuple = collections.namedtuple('sam_tuple', ('sample_id', 'group1', 'group2', 'sex')) #Encodes the rows of the samples table +obs_tuple = collections.namedtuple('obs_tuple', ('pos', 'read_id', 'base')) #Encodes the rows of the observations table + +try: + import pysam +except ModuleNotFoundError: + print('Caution: The module pysam is missing.') + +def read_impute2(filename,**kwargs): + """ Reads an IMPUTE2 file format (LEGEND/HAPLOTYPE/SAMPLE) and builds a list + of lists, containing the dataset. """ + + filetype = kwargs.get('filetype', None) + + def leg_format(line): + rs_id, pos, ref, alt = line.strip().split() + return leg_tuple('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt) + + def sam_format(line): + sample_id, group1, group2, sex = line.strip().split(' ') + return sam_tuple(sample_id, group1, group2, int(sex)) + + with (gzip.open(filename,'rt') if filename[-3:]=='.gz' else open(filename, 'r')) as impute2_in: + if filetype == 'leg': + impute2_in.readline() # Bite off the header + result = tuple(map(leg_format,impute2_in)) + + elif filetype == 'hap': + firstline = impute2_in.readline() # Get first line + a0 = int(firstline.replace(' ', ''), 2) + a1 = (int(line.replace(' ', ''), 2) for line in impute2_in) + hap_tab = (a0, *a1) + number_of_haplotypes = len(firstline.strip().split()) + result = hap_tab, number_of_haplotypes + + elif filetype == 'sam': + impute2_in.readline() # Bite off the header + result = tuple(map(sam_format,impute2_in)) + + else: + result = tuple(line.strip().split() for line in impute2_in) + + return result + +def test_module(impute2_leg_filename, impute2_hap_filename, legend, haplotypes): + """ Compares the IMPUTE2 reference panels to LD-PGTA reference panels. """ + impute2_leg = read_impute2(impute2_leg_filename,filetype='leg') + impute2_hap = read_impute2(impute2_hap_filename,filetype='hap') + print('Legend:', all(a==b for a,b in zip(impute2_leg,legend))) + print('Haplotypes:', all(a==b for a,b in zip(impute2_hap[0],haplotypes[0]))) + return 0 + +def build_ref_panel(samp_filename,vcf_filename): + """ Builds a reference panel with similar structure to the IMPUTE2 format. + The reference panel is encoded for efficient storage and retrieval. """ + + time0 = time.time() + + def sam_format(line): + sample_id, group1, group2, sex = line.strip().split(' ') + return sam_tuple(sample_id, group1, group2, int(sex)) + + with (gzip.open(samp_filename,'rt') if samp_filename[-3:]=='.gz' else open(samp_filename, 'r')) as impute2_in: + impute2_in.readline() # Bite off the header + SAMPLES = tuple(map(sam_format,impute2_in)) + + vcf_in = pysam.VariantFile(vcf_filename,'r') # auto-detect input format + print(vcf_in.description) ### Based on the VCF header, prints a description of the VCF file. + + SAM = [s.sample_id for s in SAMPLES if s.sample_id in vcf_in.header.samples] + + lenSAM = len(SAM) ### The number of samples that are also included in the VCF. + + vcf_in.subset_samples(SAM) ### Read only a subset of samples to reduce processing time and memory. Must be called prior to retrieving records. 
+
+    HAPLOTYPES = []
+    LEGEND = []
+
+    for record in vcf_in.fetch():
+        if record.info["VT"]==('SNP',): ### Only encode SNPs.
+
+            phased = all((record.samples[sample].phased for sample in SAM))
+            if not phased: continue ### Only encode phased SNPs.
+
+            ALLELES = tuple(itertools.chain.from_iterable((record.samples[sample].allele_indices for sample in SAM)))
+            an = ALLELES.count(1)
+            if an==2*lenSAM or an==0: continue ### Only encode SNPs with a non-zero minor allele count.
+
+            LEGEND.append(leg_tuple('chr'+record.contig, record.pos, *record.alleles)) ### Add the record to the legend list.
+            binary = sum(v<
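[Editor's sketch.] The final line of this hunk is cut off in this copy of the patch, so the exact packing expression is not recoverable here. Judging from the hap encoding used by read_impute2 above (int(line.replace(' ', ''), 2)), the intent is presumably to pack the flattened ALLELES tuple into one integer per SNP. A hedged sketch of such a step, with a toy ALLELES tuple (an assumption about the intent, not a quote of the missing code):

ALLELES = (0, 1, 1, 0, 1, 0)  # flattened allele_indices of all samples at one record

# Set one bit per haplotype that carries the ALT allele, so that
# popcount(binary) equals the record's ALT allele count.
binary = sum(1 << e for e, v in enumerate(reversed(ALLELES)) if v)
print(bin(binary))  # 0b11010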