In [2]:
import os
import subprocess

import pandas as pd

In [79]:
import time
def str_time():
    """Return the current UTC time as a string in the "%X %x %Z" strftime layout.

    The three fields are the locale's time representation, the locale's date
    representation, and the time zone name of the UTC struct_time.
    """
    utc_now = time.gmtime()
    stamp_layout = " ".join(("%X", "%x", "%Z"))
    return time.strftime(stamp_layout, utc_now)

In [46]:
# Locations of the external tools that the imputation cell launches via subprocess.
# Both paths are relative to the notebook's working directory.
PLINK_EXEC_PATH = "../plink_linux_x86_64_20220402/plink"  # PLINK executable
BEAGLE_JAR_PATH = "../beagle/beagle.22Jul22.46e.jar"  # Beagle jar, run with `java -jar`

In [6]:
# Build the list of all patient eids from the column header of the smallest
# extracted VCF (approach copied from diabetes_logreg.ipynb).

smallest_vcf_location = "../biobank_extracted/extracted_snps_c6_w500.vcf"

# Only the column names are needed, so the file is streamed one row at a time
# and a single chunk is pulled instead of loading every variant.
r = pd.read_csv(
    smallest_vcf_location,
    # Skip the first 6 lines so that the next line is parsed as the header
    # (presumably the VCF "##" meta lines -- verify if the input file changes).
    skiprows=range(6),
    sep='\t',
    chunksize=1,
)
try:
    one_row = next(r)
finally:
    # A chunked reader keeps the underlying file open until it is closed
    # explicitly; previously the handle was leaked for the kernel's lifetime.
    r.close()

# Columns from index 9 onwards are treated as per-sample columns. Their names
# look like '5825360_5825360' (the eid twice, joined by '_'), so the first half
# of each name is the eid. The set removes duplicates; the resulting order is
# the set's iteration order, which is kept as-is because the patient splits
# built from this list depend on it.
vcf_all_eids = list({int(s[:len(s)//2]) for s in one_row.columns[9:]})
n_patients = len(vcf_all_eids)
n_patients

488377

In [7]:
# Partition the eid list into n_splits contiguous, near-equal slices.
n_splits = 1000

# Boundary k is floor(k * n_patients / n_splits); consecutive boundaries
# delimit one split, so every eid lands in exactly one slice.
split_bounds = [k * n_patients // n_splits for k in range(n_splits + 1)]
patient_splits = [vcf_all_eids[lo:hi]
                  for lo, hi in zip(split_bounds, split_bounds[1:])]

In [None]:
# Impute chromosome 22 with Beagle against the 1000 Genomes reference panel,
# one patient split at a time:
#   1. make sure the reference VCF without the 'chr' prefix exists,
#   2. for every split, extract its patients from the UKB bed/bim/fam with PLINK,
#   3. impute the extracted split with Beagle.
print(f'Started chrom 22 at {str_time()}')

ref_vcf_filename = ('../1000genomes/ftp.1000genomes.ebi.ac.uk/vol1/ftp/'
                    'data_collections/1000G_2504_high_coverage/working/'
                    '20220422_3202_phased_SNV_INDEL_SV/'
                    f'1kGP_high_coverage_Illumina.chr{22}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz')
ref_vcf_gunzipped_filename = f'../imputed_beagle/1kGPref.chr{22}.vcf'
ref_vcf_recoded_filename = f'../imputed_beagle/1kGPref.chr{22}.nochrpref.vcf'

# The reference preparation used to be disabled by wrapping it in a string
# literal. That also hid the assignment of ref_vcf_recoded_filename, so the
# Beagle call below raised NameError on a fresh kernel. The names are now always
# defined and the preparation is skipped only when its output already exists.
if not os.path.exists(f'{ref_vcf_recoded_filename}.gz'):
    # gunzip vcf.gz, delete chr prefix in chromosome names, and gzip it back.
    # check=True: every split depends on this file, so fail immediately.
    with open(ref_vcf_gunzipped_filename, 'w') as ref_vcf_gunzipped_handle:
        subprocess.run(['gunzip', '-kc', ref_vcf_filename],
                       stdout=ref_vcf_gunzipped_handle, check=True)
    with open(ref_vcf_recoded_filename, 'w') as ref_vcf_recoded_handle:
        subprocess.run(['awk', '{gsub(/^chr/,""); print}', ref_vcf_gunzipped_filename],
                       stdout=ref_vcf_recoded_handle, check=True)
    subprocess.run(['gzip', '-f', ref_vcf_recoded_filename], check=True)

    print(f'Recoded chrom 22 VCF at {str_time()}')


def _run_step(cmd, step, split_number):
    """Run one external command for a split and report whether it succeeded.

    The command's output is captured. On a non-zero exit code the failure,
    the exit code and the tail of stderr are printed and False is returned;
    otherwise True is returned.
    """
    finished = subprocess.run(cmd, capture_output=True, text=True, errors='replace')
    if finished.returncode != 0:
        print(f'FAILED {step} split {split_number} at {str_time()} '
              f'(exit code {finished.returncode})')
        # Only the tail is shown so one failure cannot flood the cell output.
        print(finished.stderr[-2000:])
        return False
    return True


# Splits for which PLINK or Beagle exited with an error. The loop keeps going so
# that one bad split does not abort the whole multi-hour run.
failed_splits = []

for split_number, patient_split in enumerate(patient_splits):
    print(f'Started split {split_number} at {str_time()}')

    # PLINK --keep expects two tab-separated columns per line; the eid is
    # written in both.
    keep_list_filename = 'keep_list_tmp'
    with open(keep_list_filename, 'w') as keep_list_handle:
        for patient_id in patient_split:
            print(patient_id, patient_id, sep='\t', file=keep_list_handle)

    # cutting into splits
    split_vcf_prefix = f'../imputed_beagle/c22_split{split_number}'
    plink_cmd = [PLINK_EXEC_PATH,
                 '--bfile', '../ukbiobank_link/raw_data/ukb22418_c22_b0_v2',
                 '--keep', keep_list_filename,
                 '--recode', 'vcf', 'bgz',
                 '--out', split_vcf_prefix,
                 ]
    if not _run_step(plink_cmd, 'splitting', split_number):
        failed_splits.append(split_number)
        continue  # nothing to impute without the split VCF

    print(f'Finished splitting split {split_number} at {str_time()}')

    # imputing each
    imputted_split_vcf_prefix = f'../imputed_beagle/c22_split{split_number}_imputted'
    beagle_cmd = ['java', f'-Xmx{50}g', '-jar', BEAGLE_JAR_PATH,
                  f'gt={split_vcf_prefix}.vcf.gz',
                  f'ref={ref_vcf_recoded_filename}.gz',
                  f'out={imputted_split_vcf_prefix}',
                  f'map=../beagle/plink_GRCh38_map/plink.chr{22}.GRCh38.map',
                  f'nthreads={24}',
                  ]
    if not _run_step(beagle_cmd, 'imputing', split_number):
        failed_splits.append(split_number)
        continue

    print(f'Finished imputing split {split_number} at {str_time()}')

    # The intermediate split VCFs are left on disk. NOTE: the previously
    # commented-out ['rm', f'{split_vcf_prefix}.*'] would not work as written,
    # because subprocess.run without a shell does not expand the glob.

if failed_splits:
    print(f'Splits with failed steps: {failed_splits}')
    

Started chrom 22 at 17:27:31 12/29/22 GMT
Started split 0 at 17:27:31 12/29/22 GMT
Finished splitting split 0 at 17:27:34 12/29/22 GMT
Finished imputing split 0 at 17:31:34 12/29/22 GMT
Started split 1 at 17:31:34 12/29/22 GMT
Finished splitting split 1 at 17:31:37 12/29/22 GMT
Finished imputing split 1 at 17:35:09 12/29/22 GMT
Started split 2 at 17:35:09 12/29/22 GMT
Finished splitting split 2 at 17:35:11 12/29/22 GMT
Finished imputing split 2 at 17:39:54 12/29/22 GMT
Started split 3 at 17:39:54 12/29/22 GMT
Finished splitting split 3 at 17:39:56 12/29/22 GMT
Finished imputing split 3 at 17:44:51 12/29/22 GMT
Started split 4 at 17:44:51 12/29/22 GMT
Finished splitting split 4 at 17:44:53 12/29/22 GMT
Finished imputing split 4 at 17:49:01 12/29/22 GMT
Started split 5 at 17:49:01 12/29/22 GMT
Finished splitting split 5 at 17:49:04 12/29/22 GMT
Finished imputing split 5 at 17:54:29 12/29/22 GMT
Started split 6 at 17:54:29 12/29/22 GMT
Finished splitting split 6 at 17:54:32 12/29/22 GMT
F

In [21]:
# List the files in ../ukbiobank_link/raw_data whose names contain "c22".
!ls ../ukbiobank_link/raw_data/*c22*

../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.bed
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.bim
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.fam
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.log
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.nosex
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2_s488195.fam
../ukbiobank_link/raw_data/ukb22418_c22_b0_v2.vcf


In [None]:
'''UKB chromosome names are a bare number (e.g. 22), while the 1000 Genomes
VCFs additionally carry a 'chr' prefix (e.g. chr22).
'''