From d02ffd7a4522a94c3556001ef92998e82cce99bc Mon Sep 17 00:00:00 2001
From: Daniel Ariad <daniel@ariad.org>
Date: Sun, 29 Aug 2021 00:11:29 -0400
Subject: [PATCH] Daily build

---
 ANEUPLOIDY_TEST.py          | 178 +++++++++++++++++++-----------------
 DISTANT_ADMIXTURE_MODELS.py |  63 +++++++------
 EXTRACT_GENOTYPES.py        | 119 ++++++++++++++++++++++++
 F1_ADMIXTURE_MODELS.py      | 157 ++++++++++++++++---------------
 HOMOGENOUES_MODELS.py       |  38 ++++----
 MAKE_OBS_TAB.py             |  51 ++---------
 MAKE_REF_PANEL.py           | 175 +++++++++++++++++++++++++++++++++++
 MIX_HAPLOIDS.py             | 144 ++++++++++++++---------------
 8 files changed, 610 insertions(+), 315 deletions(-)
 create mode 100644 EXTRACT_GENOTYPES.py
 create mode 100644 MAKE_REF_PANEL.py

diff --git a/ANEUPLOIDY_TEST.py b/ANEUPLOIDY_TEST.py
index b34cda2..7a53cde 100644
--- a/ANEUPLOIDY_TEST.py
+++ b/ANEUPLOIDY_TEST.py
@@ -15,10 +15,9 @@
 """
 import collections, time, pickle, argparse, re, sys, random, os, bz2, gzip
-from MAKE_OBS_TAB import read_impute2
 from HOMOGENOUES_MODELS import homogeneous
 from F1_ADMIXTURE_MODELS import f1_admixture
-from COMPLEX_ADMIXTURE_MODELS import complex_admixture
+from DISTANT_ADMIXTURE_MODELS import distant_admixture
 
 from itertools import product, starmap
@@ -53,20 +52,20 @@ def comb(n, k):
     for t in range(min(k, n-k)):
         b *= n
         b //= t+1
-        n -= 1
+        n -= 1
     return b
 
 def mean_and_var(data):
     """ Calculates the mean and variance. """
     m = mean(data)
     var = variance(data, xbar=m)
-    return m, var
+    return m, var
 
 def mean_and_std(data):
     """ Calculates the mean and population standard deviation. """
     m = mean(data)
     std = pstdev(data, mu=m)
-    return m, std
+    return m, std
 
 def summarize(M,V):
     """ Calculates chromosome-wide statistics of the LLRs """
@@ -80,13 +79,13 @@ def LLR(y,x):
     if x and y:
         result = log(y/x)
     elif x and not y:
-        result = -1.23456789
+        result = -1.23456789
     elif not x and y:
-        result = +1.23456789
+        result = +1.23456789
     elif not x and not y:
-        result = 0
+        result = 0
     else:
-        result = None
+        result = None
     return result
 
 def invert(x,n):
@@ -99,24 +98,24 @@ def build_combined(leg_tab,hap_tab):
     panel. """
     combined = {pos: comb_tuple(ref,alt,hap) for (chr_id,pos,ref,alt),hap in zip(leg_tab, hap_tab)}
     return combined
-    
+
 def build_reads_dict(obs_tab,combined_dict):
     """ Returns a dictionary that lists read IDs of reads that overlap with
         SNPs and gives the alleles in each read. """
 
     reads = collections.defaultdict(list)
-    
+
     for pos, read_id, base in obs_tab:
         if pos in combined_dict and (base==combined_dict[pos].ref or base==combined_dict[pos].alt):
             reads[read_id].append((pos,base))
-    
+
     return reads
-    
+
 def build_score_dict(reads_dict,combined_dict,number_of_haplotypes,min_HF):
     """ Returns a dictionary that lists read IDs and gives their score. The scoring
     algorithm scores each read according to the number of different haplotypes
     that the reference panel supports at the chromosomal region that overlaps
-    with the read. Only biallelic SNPs with a minor allele frequency above
+    with the read. Only biallelic SNPs with a minor allele frequency above 
     0.01 are considered for the calculation, since they are unlikely to affect
     the score. In addition, only haplotypes with a frequency between min_HF
     and 1-min_HF add to the score of a read. """
@@ -128,8 +127,8 @@ def build_score_dict(reads_dict,combined_dict,number_of_haplotypes,min_HF):
 
     for read_id in reads_dict:
         haplotypes = ((combined_dict[pos].hap, combined_dict[pos].hap ^ b)
                          for pos,base in reads_dict[read_id]
-                         if 0.01 <= popcount(combined_dict[pos].hap)/N <= 0.99) #Include only biallelic SNPs with MAF of at least 0.01. Also, ^b flips all bits of the binary number hap_tab[ind] using the bitwise XOR operator.
-    
+                         if 0.01 <= popcount(combined_dict[pos].hap)/N <= 0.99) #Include only biallelic SNPs with MAF of at least 0.01. Also, ^b flips all bits of the binary number hap_tab[ind] using the bitwise XOR operator.
+
         score_dict[read_id] = sum(min_HF <= popcount(reduce(and_,hap))/N <= (1-min_HF)
                                       for hap in product(*haplotypes) if len(hap)!=0)
 
@@ -138,7 +137,7 @@ def build_aux_dict(obs_tab,combined_dict):
     """ Returns a dictionary that lists chromosome positions of SNPs and gives
         a list of read IDs for all the reads that overlap with the SNP. """
-    aux_dict = collections.defaultdict(list) ### aux_dict is a
+    aux_dict = collections.defaultdict(list) ### aux_dict maps each SNP position to the read IDs of the overlapping reads.
     for pos, read_id, base in obs_tab:
         if pos in combined_dict and (base==combined_dict[pos].ref or base==combined_dict[pos].alt):
             aux_dict[pos].append(read_id)
@@ -153,16 +152,16 @@
     max_dist = 100000 #maximal distance between consecutive observed alleles.
     max_win_size = 350000 #maximal genomic window size
     initial_win_size = 50000 #initial genomic window size
-    
+
     adaptive, window_size = (False, int(window_size)) if window_size else (True, initial_win_size)
     offset = int(offset)
 
     aux_dict = build_aux_dict(obs_tab,combined_dict)
-    
+
     first, last = obs_tab[0].pos + offset, obs_tab[-1].pos + window_size
     a, b, readIDs_in_window = first, first+window_size, set()
-    
+
     for pos, overlapping_reads in aux_dict.items():
-        if pos max_reads:
        eff_subsamples = min(comb(num_of_reads,max_reads),subsamples)
    elif min_reads <= num_of_reads <= max_reads:
        eff_subsamples = min(num_of_reads,subsamples)
    else:
        eff_subsamples = 0
-    
+
    return eff_subsamples
-    
+
def bootstrap(obs_tab, leg_tab, hap_tab, sam_tab, number_of_haplotypes, models_dict, window_size, subsamples, offset, min_reads, max_reads, minimal_score, min_HF, ancestral_proportion):
    """ Applies a bootstrap approach in which: (i) the resample size is
        smaller than the sample size and (ii) resampling is done without
        replacement. """
-    
+
    min_reads = max(min_reads,3) # Due to the bootstrap approach, min_reads must be at least 3.
-    max_reads = max(max_reads,2) # Our statistical models require at least 2 reads.
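[Editor's sketch.] The scoring machinery above leans on a bit-packed representation of the reference panel: for every SNP, combined_dict stores an integer whose i-th bit says whether haplotype i carries the alternative allele, so popcount(hap)/N is an allele frequency and AND-ing the masks of several SNPs gives the frequency of a joint haplotype. A minimal, self-contained illustration of that idea (hap_A, hap_B and N are toy values, not taken from the patch):

from functools import reduce
from operator import and_

def popcount(x):
    """ Counts non-zero bits in a positive integer. """
    return bin(x).count('1')

N = 8               # number of haplotypes in the toy panel
hap_A = 0b10110100  # haplotypes that carry the ALT allele at SNP A
hap_B = 0b10100110  # haplotypes that carry the ALT allele at SNP B

freq_A = popcount(hap_A) / N                           # allele frequency: 4/8
joint_AB = popcount(reduce(and_, (hap_A, hap_B))) / N  # joint haplotype frequency: 3/8
print(freq_A, joint_AB)

This is why build_score_dict can apply its MAF filter with a single popcount per SNP and score a read with one AND per candidate haplotype combination.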
+ combined_dict = build_combined(leg_tab, hap_tab) reads_dict = build_reads_dict(obs_tab, combined_dict) score_dict = build_score_dict(reads_dict, combined_dict, number_of_haplotypes, min_HF) - windows_dict = dict(iter_windows(obs_tab, combined_dict, score_dict, window_size, offset, min_reads, max_reads, minimal_score)) - + windows_dict = dict(iter_windows(obs_tab, combined_dict, score_dict, window_size, offset, min_reads, max_reads, minimal_score)) + ancestry = {row.group2 for row in sam_tab} if len(ancestry)==2 and 016 else ('MODELS16.p' if max_reads>12 else 'MODELS12.p'))) ancestral_proportion = (lambda x,y: admix_tuple(str(x),float(y)))(*kwargs.get('ancestral_proportion',('None','-1'))) - Open = {'bz2': bz2.open, 'gzip': gzip.open}.get(obs_filename.rpartition('.')[-1], open) - with Open(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) - info = pickle.load(f) - - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, number_of_haplotypes = read_impute2(hap_filename, filetype='hap') - sam_tab = read_impute2(sam_filename, filetype='sam') - + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + + open_hap = load(hap_filename) + with open_hap(hap_filename,'rb') as hap_in: + hap_tab, number_of_haplotypes = pickle.load(hap_in) + + open_leg = load(leg_filename) + with open_leg(leg_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) + + + open_samp = load(samp_filename) + with open_samp(samp_filename,'rb') as samp_in: + sam_tab = pickle.load(samp_in) + + open_obs = load(obs_filename) + with open_obs(obs_filename, 'rb') as obs_in: + obs_tab = pickle.load(obs_in) + info = pickle.load(obs_in) + + open_model = load(models_filename) + with open_model(models_filename, 'rb') as model_in: + models_dict = pickle.load(model_in) + + ancestry = {row.group2 for row in sam_tab} if len(ancestry)>2: print('warning: individuals in the sample file are associated with more than two populations.') - - load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open - with load_model(models_filename, 'rb') as f: - models_dict = pickle.load(f) + likelihoods, windows_dict, matched_alleles = bootstrap(obs_tab, leg_tab, hap_tab, sam_tab, number_of_haplotypes, models_dict, window_size, subsamples, offset, min_reads, max_reads, minimal_score, min_HF, ancestral_proportion) - + some_statistics = {'matched_alleles': matched_alleles, 'runtime': time.time()-time0} - + info.update({'ancestry': ancestry, 'window_size': window_size, 'subsamples': subsamples, @@ -334,18 +346,18 @@ def aneuploidy_test(obs_filename,leg_filename,hap_filename,sam_filename, 'min_HF': min_HF, 'statistics': {**statistics(likelihoods,windows_dict), **some_statistics} }) - + if len(ancestry)==2 and 02: - BPH += sum( sum(F[B0] * sum(( (F[B1] * G[B2] + G[B1] * F[B2]) / M + G[B1] * G[B2] / N) + BPH += sum( sum(F[B0] * sum(( (F[B1] * G[B2] + G[B1] * F[B2]) / M + G[B1] * G[B2] / N) for (B1, B2) in C[B0]) for B0 in C) * A0 / A1 for (A0, A1), C in models['BPH'][3].items()) / (6 * M * N) - - BPH += sum( sum(G[B0] * sum(((F[B1] * G[B2] + G[B1] * F[B2]) / N + F[B1] * F[B2] / M) + + BPH += sum( sum(G[B0] * sum(((F[B1] * G[B2] + G[B1] * F[B2]) / N + F[B1] * F[B2] / M) for (B1, B2) in C[B0]) for B0 in C) * A0 / A1 for (A0, A1), C in models['BPH'][3].items()) / (6 * M * N) ### SPH ### (((A0, A1),((B0,),)),) = models['SPH'][1].items() - SPH = (F[B0] / M + G[B0] / N) * A0 / ( 2 * A1 ) + SPH = (F[B0] / M + G[B0] / N) * A0 / ( 2 * A1 ) SPH += 
sum( sum((F[B0] * G[B1] + G[B0] * F[B1]) for (B0, B1) in C) * A0 / A1
                   for (A0, A1), C in models['SPH'][2].items()) / ( 2 * M * N)
-        
+
         ### DIPLOIDY ###
         (((A0, A1),((B0,),)),) = models['DISOMY'][1].items()
         DISOMY = ( F[B0] / M + G[B0] / N ) * A0 / ( 2 * A1 )
@@ -213,18 +213,18 @@ def likelihoods(self, *alleles):
 
         ### MONOSOMY ###
         ((B0,),) = models['MONOSOMY'][1][(1,1)]
-        MONOSOMY = ( F[B0] / M + G[B0] / N ) / 2
-        
+        MONOSOMY = ( F[B0] / M + G[B0] / N ) / 2
+
         result = (MONOSOMY, DISOMY, SPH, BPH)
         return result
-    
+
     def likelihoods2(self, *alleles):
         """ Calculates the likelihood to observe two alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, ab = F[1], F[2], F[3]
         A, B, AB = G[1], G[2], G[3]
         BPH = (2*(b*a+B*A)+3*(AB+ab)+4*(b*A+B*a))/18 #The likelihood of three unmatched haplotypes. #V
@@ -236,9 +236,9 @@ def likelihoods2(self, *alleles):
     def likelihoods3(self, *alleles):
         """ Calculates the likelihood to observe three alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, ab, c, ac, bc, abc = F[1], F[2], F[3], F[4], F[5], F[6], F[7]
         A, B, AB, C, AC, BC, ABC = G[1], G[2], G[3], G[4], G[5], G[6], G[7]
 
@@ -248,23 +248,23 @@ def likelihoods3(self, *alleles):
         DISOMY = (abc+ab*C+ac*B+bc*A+ABC+AB*c+AC*b+BC*a)/8 #The likelihood of diploidy. #V
         MONOSOMY = (abc+ABC)/2 #The likelihood of monosomy. #V
         return MONOSOMY, DISOMY, SPH, BPH
-    
+
     def likelihoods4(self, *alleles):
         """ Calculates the likelihood to observe four alleles/haplotypes
         under four scenarios, namely, monosomy, disomy, SPH and BPH. """
-        
-        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True)
-        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True)
+
+        F = self.joint_frequencies_combo(*alleles, group2_id=0, normalize=True) 
+        G = self.joint_frequencies_combo(*alleles, group2_id=1, normalize=True) 
         a, b, c, d = F[1], F[2], F[4], F[8]
         ab, ac, ad, bc, bd, cd = F[3], F[5], F[9], F[6], F[10], F[12]
         abc, abd, acd, bcd = F[7], F[11], F[13], F[14]
         abcd = F[15]
-        
+
         A, B, C, D = G[1], G[2], G[4], G[8]
         AB, AC, AD, BC, BD, CD = G[3], G[5], G[9], G[6], G[10], G[12]
         ABC, ABD, ACD, BCD = G[7], G[11], G[13], G[14]
         ABCD = G[15]
-        
+
         BPH = (2*(AB*CD+AC*BD+AD*BC+
                A*(BCD+B*cd+b*CD+b*cd+C*bd+c*BD+c*bd+D*bc+d*BC+d*bc)+
                B*(ACD+C*ad+c*AD+c*ad+D*ac+d*AC+d*ac)+
@@ -280,12 +280,12 @@ def likelihoods4(self, *alleles):
         DISOMY = (abcd+abc*D+bcd*A+acd*B+abd*C+ab*CD+ad*BC+ac*BD+ABCD+ABC*d+BCD*a+ACD*b+ABD*c+AB*cd+AD*bc+AC*bd)/16 #The likelihood of diploidy. #V
         MONOSOMY = (abcd+ABCD)/2 #The likelihood of monosomy. #V
         return MONOSOMY, DISOMY, SPH, BPH
-    
+
     def get_likelihoods(self, *x):
-        """ Uses the optimal function to calculate the likelihoods.
         In general, self.likelihoods can accept fewer than five alleles, but the
         dedicated functions are optimized for a specific number of alleles.
""" - + l = len(x) if l==2: result = self.likelihoods2(*x) @@ -293,17 +293,15 @@ def get_likelihoods(self, *x): result = self.likelihoods3(*x) elif l==4: result = self.likelihoods4(*x) - else: + else: result = self.likelihoods(*x) return result - -def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename): - """ Wrapper function of the class f1_admixture. It receives an observations - file, IMPUTE2 legend file, IMPUTE2 haplotypes file, IMPUTE2 samples file, - and a file with four statistical models. Based on the given data it creates - and returns an instance of the class. """ - from MAKE_OBS_TAB import read_impute2 +def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sample_filename,models_filename): + """ Wrapper function of the class 'f1_admixture'. It receives an observations + file, legend file, haplotypes file, samples file and a file with the + statistical models. Based on the given data it creates and returns an + instance of the class. """ if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.') if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.') @@ -311,18 +309,29 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename if not os.path.isfile(sample_filename): raise Exception('Error: SAMPLE file does not exist.') if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.') - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, total_number_of_haplotypes = read_impute2(hap_filename, filetype='hap') - sam_tab = read_impute2(sample_filename, filetype='sam') - - load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open - with load_obs(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) - #info = pickle.load(f) + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. 
+
+    open_hap = load(hap_filename)
+    with open_hap(hap_filename,'rb') as hap_in:
+        hap_tab, total_number_of_haplotypes = pickle.load(hap_in)
+
+    open_leg = load(leg_filename)
+    with open_leg(leg_filename,'rb') as leg_in:
+        leg_tab = pickle.load(leg_in)
+
 
-    load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open
-    with load_model(models_filename, 'rb') as f:
-        models_dict = pickle.load(f)
+    open_samp = load(sample_filename)
+    with open_samp(sample_filename,'rb') as samp_in:
+        sam_tab = pickle.load(samp_in)
+
+    open_obs = load(obs_filename)
+    with open_obs(obs_filename, 'rb') as obs_in:
+        obs_tab = pickle.load(obs_in)
+        #info = pickle.load(obs_in)
+
+    open_model = load(models_filename)
+    with open_model(models_filename, 'rb') as model_in:
+        models_dict = pickle.load(model_in)
 
     return f1_admixture(obs_tab, leg_tab, hap_tab, sam_tab, models_dict, total_number_of_haplotypes)
 
@@ -348,7 +357,7 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     leg_filename = '../build_reference_panel/EAS_EUR_panel.hg38.BCFtools/chr6_EAS_EUR_panel.legend.gz'
     sam_filename = '../build_reference_panel/samples_per_panel/EAS_EUR_panel.samples'
     models_filename = 'MODELS/MODELS16.p'
-    
+
     A = wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename,sam_filename,models_filename)
 
     alleles = tuple(A.hap_dict.keys())
@@ -365,7 +374,7 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     x = random.randrange(len(alleles)-16)
     haplotypes = (alleles[x:x+4],alleles[x+4:x+8],alleles[x+8:x+12],alleles[x+12:x+16])
 
-    print('-----joint_frequencies_combo-----')
+    print('-----joint_frequencies_combo-----')
     print(frequencies0(alleles[x+0]))
     print(frequencies0(*alleles[x:x+4]))
     print(frequencies1(alleles[x+0]))
@@ -399,4 +408,4 @@ def wrapper_of_f1_admixture_for_debugging(obs_filename,leg_filename,hap_filename
     print('Done in %.3f sec.' % ((t1-t0)))
 
-"""
\ No newline at end of file
+"""
diff --git a/HOMOGENOUES_MODELS.py b/HOMOGENOUES_MODELS.py
index 68c341e..1c3393b 100644
--- a/HOMOGENOUES_MODELS.py
+++ b/HOMOGENOUES_MODELS.py
@@ -16,7 +16,7 @@
 Dec 21, 2020
 """
 
-import pickle, os, sys, bz2, collections
+import pickle, os, sys, bz2, collections, gzip
 
 from functools import reduce
 from operator import and_, itemgetter
@@ -35,10 +35,9 @@ def popcount(x):
     return bin(x).count('1')
 
 class homogeneous:
-    """ Based on two IMPUTE2 arrays, which contain the legend and haplotypes,
-    and a dictionary with statistical models (models_dict), it allows to
-    calculate the likelihoods of observed alleles under various statistical
-    models (monosomy, disomy, SPH and BPH). """
+    """ Based on the statistical models (models_dict) and the reference panel
+    (leg_tab, hap_tab and sam_tab), it allows calculating the likelihoods of
+    observed alleles under various statistical models (monosomy, disomy, SPH and BPH). """
 
     def __init__(self, obs_tab, leg_tab, hap_tab, sam_tab, models_dict, number_of_haplotypes):
@@ -241,29 +240,34 @@ def get_likelihoods(self, *x):
         return result
 
 def wrapper_of_homogenoues_for_debugging(obs_filename,leg_filename,hap_filename,models_filename):
-    """ Wrapper function of the class homogeneous. It receives an observations
-    file, IMPUTE2 legend file, IMPUTE2 haplotypes file, and a file with four
+    """ Wrapper function of the class 'homogeneous'. It receives an observations
+    file, legend file, haplotypes file and a file with the
     statistical models. Based on the given data it creates and returns an
     instance of the class. """
""" - from MAKE_OBS_TAB import read_impute2 - if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.') if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.') if not os.path.isfile(hap_filename): raise Exception('Error: HAP file does not exist.') if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.') - leg_tab = read_impute2(leg_filename, filetype='leg') - hap_tab, number_of_haplotypes = read_impute2(hap_filename, filetype='hap') + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + + open_hap = load(hap_filename) + with open_hap(hap_filename,'rb') as hap_in: + hap_tab, number_of_haplotypes = pickle.load(hap_in) + + open_leg = load(leg_filename) + with open_leg(leg_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) - load_obs = bz2.BZ2File if obs_filename[-6:]=='.p.bz2' else open - with load_obs(obs_filename, 'rb') as f: - obs_tab = pickle.load(f) + open_obs = load(obs_filename) + with open_obs(obs_filename, 'rb') as obs_in: + obs_tab = pickle.load(obs_in) #info = pickle.load(f) - load_model = bz2.BZ2File if models_filename[-6:]=='.p.bz2' else open - with load_model(models_filename, 'rb') as f: - models_dict = pickle.load(f) + open_model = load(models_filename) + with open_model(models_filename, 'rb') as model_in: + models_dict = pickle.load(model_in) return homogeneous(obs_tab, leg_tab, hap_tab, None, models_dict, number_of_haplotypes) diff --git a/MAKE_OBS_TAB.py b/MAKE_OBS_TAB.py index 95e9c30..19626da 100644 --- a/MAKE_OBS_TAB.py +++ b/MAKE_OBS_TAB.py @@ -4,8 +4,8 @@ MAKE_OBS_TAB This script extracts single base observations at SNP positions from a given -sequence. It requires an aligned and sorted BAM file with the sequence, as well -as an IMPUTE2 legend format, which contains the SNPs positions. +sequence. It requires a BAM file that is sorted according to position, as well +as a legend format, which contains the SNPs positions. The observed alleles, together with their associated read ID, chromosome position and line number in the legend file, are organized in a table. @@ -24,42 +24,6 @@ except ModuleNotFoundError: print('Caution: The module pysam is missing.') -def read_impute2(filename,**kwargs): - """ Reads an IMPUTE2 file format (LEGEND/HAPLOTYPE/SAMPLE) and builds a list - of lists, containing the dataset. 
""" - - filetype = kwargs.get('filetype', None) - - def leg_format(line): - rs_id, pos, ref, alt = line.strip().split() - return leg_tuple('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt) - - def sam_format(line): - sample_id, group1, group2, sex = line.strip().split(' ') - return sam_tuple(sample_id, group1, group2, int(sex)) - - with (gzip.open(filename,'rt') if filename[-3:]=='.gz' else open(filename, 'r')) as impute2_in: - if filetype == 'leg': - impute2_in.readline() # Bite off the header - result = tuple(map(leg_format,impute2_in)) - - elif filetype == 'hap': - firstline = impute2_in.readline() # Get first line - a0 = int(firstline.replace(' ', ''), 2) - a1 = (int(line.replace(' ', ''), 2) for line in impute2_in) - hap_tab = (a0, *a1) - number_of_haplotypes = len(firstline.strip().split()) - result = hap_tab, number_of_haplotypes - - elif filetype == 'sam': - impute2_in.readline() # Bite off the header - result = tuple(map(sam_format,impute2_in)) - - else: - result = tuple(line.strip().split() for line in impute2_in) - - return result - def save_obs(obs_tab,info,compress,bam_filename,output_filename,output_dir): """ Saves the observations table together with information about the chromosome number, depth of coverage, and flags that were used. @@ -92,7 +56,12 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob try: genome_reference = pysam.FastaFile(fasta_filename) if fasta_filename!='' else None samfile = pysam.AlignmentFile(bam_filename, 'rb' ) - leg_tab = read_impute2(legend_filename, filetype='leg') + + load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension. + open_leg = load(legend_filename) + with open_leg(legend_filename,'rb') as leg_in: + leg_tab = pickle.load(leg_in) + if next(zip(*leg_tab)).count(leg_tab[0][0])!=len(leg_tab): raise Exception('Error: Unsuitable legend file. All SNP positions should refer to the same chr_id.') @@ -163,9 +132,9 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob parser = argparse.ArgumentParser( description='Builds a table of single base observations at known SNP positions.') parser.add_argument('bam_filename', metavar='BAM_FILENAME', type=str, - help='BAM file') + help='A BAM file sorted by position.') parser.add_argument('legend_filename', metavar='LEG_FILENAME', type=str, - help='IMPUTE2 legend file') + help='A legend file of reference panel.') parser.add_argument('-f','--fasta_filename', type=str,metavar='FASTA_FILENAME', default='', help='The faidx-indexed reference file in the FASTA format. ' 'Supplying a reference file will reduce false SNPs caused by misalignments using the Base Alignment Quality (BAQ) method described in the paper “Improving SNP discovery by base alignment quality”, Heng Li, Bioinformatics, Volume 27, Issue 8.') diff --git a/MAKE_REF_PANEL.py b/MAKE_REF_PANEL.py new file mode 100644 index 0000000..51e6939 --- /dev/null +++ b/MAKE_REF_PANEL.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +MAKE_REF_PANEL + +This script creates reference panels for LD-PGTA, using genotype calls in VCF +files. The reference panels of LD-PGTA have a similar structure to the IMPUTE2 +format. 
+ +Daniel Ariad (daniel@ariad.org) +AUG 27, 2022 +""" + +import sys, os, time, random, argparse, re, pickle, gzip, bz2, collections, itertools + +leg_tuple = collections.namedtuple('leg_tuple', ('chr_id', 'pos', 'ref', 'alt')) #Encodes the rows of the legend table +sam_tuple = collections.namedtuple('sam_tuple', ('sample_id', 'group1', 'group2', 'sex')) #Encodes the rows of the samples table +obs_tuple = collections.namedtuple('obs_tuple', ('pos', 'read_id', 'base')) #Encodes the rows of the observations table + +try: + import pysam +except ModuleNotFoundError: + print('Caution: The module pysam is missing.') + +def read_impute2(filename,**kwargs): + """ Reads an IMPUTE2 file format (LEGEND/HAPLOTYPE/SAMPLE) and builds a list + of lists, containing the dataset. """ + + filetype = kwargs.get('filetype', None) + + def leg_format(line): + rs_id, pos, ref, alt = line.strip().split() + return leg_tuple('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt) + + def sam_format(line): + sample_id, group1, group2, sex = line.strip().split(' ') + return sam_tuple(sample_id, group1, group2, int(sex)) + + with (gzip.open(filename,'rt') if filename[-3:]=='.gz' else open(filename, 'r')) as impute2_in: + if filetype == 'leg': + impute2_in.readline() # Bite off the header + result = tuple(map(leg_format,impute2_in)) + + elif filetype == 'hap': + firstline = impute2_in.readline() # Get first line + a0 = int(firstline.replace(' ', ''), 2) + a1 = (int(line.replace(' ', ''), 2) for line in impute2_in) + hap_tab = (a0, *a1) + number_of_haplotypes = len(firstline.strip().split()) + result = hap_tab, number_of_haplotypes + + elif filetype == 'sam': + impute2_in.readline() # Bite off the header + result = tuple(map(sam_format,impute2_in)) + + else: + result = tuple(line.strip().split() for line in impute2_in) + + return result + +def test_module(impute2_leg_filename, impute2_hap_filename, legend, haplotypes): + """ Compares the IMPUTE2 reference panels to LD-PGTA reference panels. """ + impute2_leg = read_impute2(impute2_leg_filename,filetype='leg') + impute2_hap = read_impute2(impute2_hap_filename,filetype='hap') + print('Legend:', all(a==b for a,b in zip(impute2_leg,legend))) + print('Haplotypes:', all(a==b for a,b in zip(impute2_hap[0],haplotypes[0]))) + return 0 + +def build_ref_panel(samp_filename,vcf_filename): + """ Builds a reference panel with similar structure to the IMPUTE2 format. + The reference panel is encoded for efficient storage and retrieval. """ + + time0 = time.time() + + def sam_format(line): + sample_id, group1, group2, sex = line.strip().split(' ') + return sam_tuple(sample_id, group1, group2, int(sex)) + + with (gzip.open(samp_filename,'rt') if samp_filename[-3:]=='.gz' else open(samp_filename, 'r')) as impute2_in: + impute2_in.readline() # Bite off the header + SAMPLES = tuple(map(sam_format,impute2_in)) + + vcf_in = pysam.VariantFile(vcf_filename,'r') # auto-detect input format + print(vcf_in.description) ### Based on the VCF header, prints a description of the VCF file. + + SAM = [s.sample_id for s in SAMPLES if s.sample_id in vcf_in.header.samples] + + lenSAM = len(SAM) ### The number of samples that are also included in the VCF. + + vcf_in.subset_samples(SAM) ### Read only a subset of samples to reduce processing time and memory. Must be called prior to retrieving records. 
+
+    HAPLOTYPES = []
+    LEGEND = []
+
+    for record in vcf_in.fetch():
+        if record.info["VT"]==('SNP',): ### Only encode SNPs.
+
+            phased = all((record.samples[sample].phased for sample in SAM))
+            if not phased: continue ### Only encode phased SNPs.
+
+            ALLELES = tuple(itertools.chain.from_iterable((record.samples[sample].allele_indices for sample in SAM)))
+            an = ALLELES.count(1)
+            if an==2*lenSAM or an==0: continue ### Only encode SNPs with a non-zero minor allele count.
+
+            LEGEND.append(leg_tuple('chr'+record.contig, record.pos, *record.alleles)) ### Add the record to the legend list.
+            binary = sum(v<
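[Editor's sketch.] The final line of this hunk is cut off in this copy of the patch, so the exact packing expression is not recoverable here. Judging from the hap encoding used by read_impute2 above (int(line.replace(' ', ''), 2)), the intent is presumably to pack the flattened ALLELES tuple into one integer per SNP. A hedged sketch of such a step, with a toy ALLELES tuple (an assumption about the intent, not a quote of the missing code):

ALLELES = (0, 1, 1, 0, 1, 0)  # flattened allele_indices of all samples at one record

# Set one bit per haplotype that carries the ALT allele, so that
# popcount(binary) equals the record's ALT allele count.
binary = sum(1 << e for e, v in enumerate(reversed(ALLELES)) if v)
print(bin(binary))  # 0b11010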