In [1]:
import pandas as pd
import os
import gzip

In [2]:
def reformat(vcf_file, pop, suffix='formatted.tsv', genotypes=False):
    """Convert a single-sample gzip-compressed VCF into a tab-separated SNP table.

    The table is written into the same directory as ``vcf_file``. Its name is the
    input basename with the last three dot-separated parts (for example
    ``genotypes.vcf.gz``) replaced by ``suffix``. The columns are
    ``POS  A1  A2  POP1  POP2`` and only records whose REF and ALT are both a
    single character are kept (indels and multi-allelic sites are skipped).

    Parameters
    ----------
    vcf_file : str
        Path to a gzip/bgzip-compressed VCF whose data lines have exactly ten
        tab-separated fields (the nine fixed VCF columns plus one sample).
    pop : str
        Ancestry of the sample: ``"AFR"`` is written as ``"0"`` and ``"EUR"`` as
        ``"1"``, for both chromosome copies (POP1 and POP2).
    suffix : str, optional
        Trailing part of the output file name.
    genotypes : bool, optional
        If False (default) A1/A2 hold the record's ALT and REF alleles.
        If True they hold the allele carried on the sample's first and second
        haplotype: REF when the allele code is ``0``, ALT for any other code
        (a missing call ``.`` is therefore reported as ALT).

    Raises
    ------
    ValueError
        If ``pop`` is not ``"AFR"``/``"EUR"`` or a data line does not have
        exactly ten fields.
    """
    # Validate up front so an unsupported label cannot leave a half-written file.
    pop_codes = {"AFR": "0", "EUR": "1"}
    if pop not in pop_codes:
        raise ValueError("pop must be 'AFR' or 'EUR', got {!r}".format(pop))
    # Both chromosome copies get the ancestry of the sample.
    pop1 = pop2 = pop_codes[pop]

    out_tsv = '.'.join(os.path.basename(vcf_file).split('.')[0:-3] + [suffix])
    # os.path.join keeps a bare filename in the current directory instead of
    # turning an empty dirname into the filesystem root.
    out_path = os.path.join(os.path.dirname(vcf_file), out_tsv)

    out_header = ['POS', 'A1', 'A2', 'POP1', 'POP2']

    # Context managers guarantee both handles are flushed and closed.
    with gzip.open(vcf_file, 'rt', encoding='utf-8') as vcf, open(out_path, 'w') as out:
        out.write('\t'.join(out_header) + '\n')

        for line in vcf:
            line = line.rstrip()
            # Skip blank lines and the '##' meta / '#CHROM' header lines.
            if not line or line.startswith('#'):
                continue

            # VCF columns: #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT <sample>
            fields = line.split('\t')
            if len(fields) != 10:
                raise ValueError(
                    "expected 10 tab-separated fields (single-sample VCF), got {}".format(len(fields))
                )
            pos, a2, a1, gt = fields[1], fields[3], fields[4], fields[9]

            # Only keep SNPs: REF and ALT must both have length 1.
            if len(a2) != 1 or len(a1) != 1:
                continue

            if genotypes:
                # The sample field may carry extra sub-fields after GT ("0|1:...");
                # only the GT part (e.g. "0|1") is inspected. Allele codes are
                # characters, so they must be compared with '0', not the int 0.
                calls = gt.split(':')[0]
                g1 = a2 if calls[0] == '0' else a1
                g2 = a2 if calls[-1] == '0' else a1
                out.write('\t'.join([pos, g1, g2, pop1, pop2]) + '\n')
            else:
                out.write('\t'.join([pos, a1, a2, pop1, pop2]) + '\n')
                

## Extract a single EUR and single AFR sample from the VCF

### Chromosome 21

In [11]:
# Directory that holds every chromosome 21 input used below.
# (An earlier location was /home/lbruce/project_files/chromosome_21_files/.)
chr21_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"

# The ALL.chr21 phase 3 VCF from which individual samples are extracted.
chr21_vcf = os.path.join(
    chr21_dir,
    "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",
)

# 1000 Genomes sample IDs for the AFR and EUR populations (labels taken from
# ps2_reference_labels.csv). The files have no header row; the first column is
# turned into a plain Python list.
afr_samples, eur_samples = (
    pd.read_csv(os.path.join(chr21_dir, label_file), header=None).iloc[:, 0].tolist()
    for label_file in ("AFR_labels.csv", "EUR_labels.csv")
)

In [12]:
# Match the sample names in the VCF header against the two population lists.
# VCF file: ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
# VCF '#CHROM' line file: chr21_vcf_header.txt (first row is used as the list of column names)
chr21_header = pd.read_csv(chr21_dir + "chr21_vcf_header.txt", sep='\t', header=None).values.tolist()[0]

# Collect the header names belonging to each population, in header order.
# Sets give constant-time membership checks; a name that appears in both label
# lists is counted as AFR only (same precedence as an if/elif chain).
_afr_lookup = set(afr_samples)
_eur_lookup = set(eur_samples)
afr_columns = [name for name in chr21_header if name in _afr_lookup]
eur_columns = [name for name in chr21_header if name not in _afr_lookup and name in _eur_lookup]

# Report both counts on a single line: AFR first, then EUR.
print(len(afr_columns), len(eur_columns))

# The first AFR sample is needed below, so fail clearly if nothing matched.
if not afr_columns:
    raise ValueError("no AFR sample names were found in chr21_vcf_header.txt")

# Comma-separated sample lists in the form taken by
#   bcftools view -Ou -s sample1,sample2 file.vcf
afr_columns_formatted = ','.join(afr_columns)
eur_columns_formatted = ','.join(eur_columns)

# The commands below are only printed; they were executed manually in a terminal
# (os.system(command) would also work). Each one extracts a single sample,
# bgzip-compresses it and builds a tabix index for the result.
chr21_afr_vcf = chr21_dir + afr_columns[0] + ".AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# Extract first AFR sample
command = "bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(afr_columns[0], chr21_vcf, chr21_afr_vcf)
print(command)

# Extract NA12878 EUR sample
eur_sample = "NA12878"
chr21_eur_vcf = chr21_dir + eur_sample + ".EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
command2 = "bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(eur_sample, chr21_vcf, chr21_eur_vcf)
print(command2)

502
560 None
bcftools view -s HG01879 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz && tabix -s1 -b2 -e2 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
bcftools view -s NA12878 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz && tabix -s1 -b2 -e2 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz


In [14]:
# Show the path of the single-sample AFR VCF that is passed to reformat() next.
print(chr21_afr_vcf)

/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz


#### Reformat into POS A1 (Alt) A2 (Ref) POP1 POP2

In [22]:
# Default suffix ('formatted.tsv') and genotypes=False: the a1/a2 (ALT/REF) branch is written, labelled as AFR.
reformat(chr21_afr_vcf, "AFR")

In [23]:
# Same conversion for the NA12878 extract, labelled as EUR.
reformat(chr21_eur_vcf, "EUR")

#### Reformat with genotypes rather than ref/alt

In [10]:
# NOTE(review): rebinds chr21_afr_vcf to a path relative to the notebook's working
# directory, replacing the absolute path built earlier — presumably the same file; confirm.
chr21_afr_vcf = "../../chromosome_21_files/HG01879.AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# genotypes=True selects reformat()'s g1/g2 output branch; the 'formatted_geno.tsv'
# suffix keeps this output separate from the default 'formatted.tsv' file.
reformat(chr21_afr_vcf, "AFR", 'formatted_geno.tsv', genotypes=True)

In [12]:
# NOTE(review): rebinds chr21_eur_vcf to a path relative to the notebook's working
# directory, replacing the absolute path built earlier — presumably the same file; confirm.
chr21_eur_vcf = "../../chromosome_21_files/NA12878.EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# genotypes=True selects reformat()'s g1/g2 output branch; the 'formatted_geno.tsv'
# suffix keeps this output separate from the default 'formatted.tsv' file.
reformat(chr21_eur_vcf, "EUR", 'formatted_geno.tsv', genotypes=True)

### Chromosome 14

In [4]:
# Directory that holds every chromosome 14 input used below.
chr14_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/"

# The ALL.chr14 phase 3 VCF from which individual samples are extracted.
chr14_vcf = os.path.join(
    chr14_dir,
    "ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",
)

# 1000 Genomes sample IDs for the AFR and EUR populations (labels taken from
# ps2_reference_labels.csv). The files have no header row; the first column is
# turned into a plain Python list.
afr_samples, eur_samples = (
    pd.read_csv(os.path.join(chr14_dir, label_file), header=None).iloc[:, 0].tolist()
    for label_file in ("AFR_labels.csv", "EUR_labels.csv")
)

In [10]:
# Match the sample names in the VCF header against the two population lists.
# VCF file: ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
# VCF '#CHROM' line file: chr14_vcf_header.txt (first row is used as the list of column names)
chr14_header = pd.read_csv(chr14_dir + "chr14_vcf_header.txt", sep='\t', header=None).values.tolist()[0]

# Collect the header names belonging to each population, in header order.
# Sets give constant-time membership checks; a name that appears in both label
# lists is counted as AFR only (same precedence as an if/elif chain).
_afr_lookup = set(afr_samples)
_eur_lookup = set(eur_samples)
afr_columns = [name for name in chr14_header if name in _afr_lookup]
eur_columns = [name for name in chr14_header if name not in _afr_lookup and name in _eur_lookup]

# Report both counts on a single line: AFR first, then EUR.
print(len(afr_columns), len(eur_columns))

# The first AFR sample is needed below, so fail clearly if nothing matched.
if not afr_columns:
    raise ValueError("no AFR sample names were found in chr14_vcf_header.txt")

# Comma-separated sample lists in the form taken by
#   bcftools view -Ou -s sample1,sample2 file.vcf
afr_columns_formatted = ','.join(afr_columns)
eur_columns_formatted = ','.join(eur_columns)

# The commands below are only printed; they were executed manually in a terminal
# (os.system(command) would also work). Each one extracts a single sample,
# bgzip-compresses it and builds a tabix index for the result.
chr14_afr_vcf = chr14_dir + afr_columns[0] + ".AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# Extract first AFR sample
command = "bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(afr_columns[0], chr14_vcf, chr14_afr_vcf)
print(command)

# Extract NA12878 EUR sample
eur_sample = "NA12878"
chr14_eur_vcf = chr14_dir + eur_sample + ".EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
command2 = "bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(eur_sample, chr14_vcf, chr14_eur_vcf)
print(command2)

502
560 None
bcftools view -s HG01879 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/HG01879.AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz && tabix -s1 -b2 -e2 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/HG01879.AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
bcftools view -s NA12878 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | bgzip -c > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/NA12878.EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz && tabix -s1 -b2 -e2 /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/NA12878.EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz


In [15]:
# Show the path of the single-sample AFR VCF that is passed to reformat() next.
print(chr14_afr_vcf)

/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/HG01879.AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz


#### Reformat into POS A1 (Alt) A2 (Ref) POP1 POP2

In [24]:
# Default suffix ('formatted.tsv') and genotypes=False: the a1/a2 (ALT/REF) branch is written, labelled as AFR.
reformat(chr14_afr_vcf, "AFR")

In [25]:
# Same conversion for the NA12878 extract, labelled as EUR.
reformat(chr14_eur_vcf, "EUR")

#### Reformat with genotypes rather than ref/alt

In [8]:
# NOTE(review): rebinds chr14_afr_vcf to a path relative to the notebook's working
# directory, replacing the absolute path built earlier — presumably the same file; confirm.
chr14_afr_vcf = "../../chromosome_14_files/HG01879.AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# genotypes=True selects reformat()'s g1/g2 output branch; the "formatted_geno.tsv"
# suffix keeps this output separate from the default 'formatted.tsv' file.
reformat(chr14_afr_vcf, "AFR", "formatted_geno.tsv", genotypes=True)

In [9]:
# NOTE(review): rebinds chr14_eur_vcf to a path relative to the notebook's working
# directory, replacing the absolute path built earlier — presumably the same file; confirm.
chr14_eur_vcf = "../../chromosome_14_files/NA12878.EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# genotypes=True selects reformat()'s g1/g2 output branch; the "formatted_geno.tsv"
# suffix keeps this output separate from the default 'formatted.tsv' file.
reformat(chr14_eur_vcf, "EUR", "formatted_geno.tsv", genotypes=True)