In [2]:
import os
import pandas as pd
import numpy as np

In [4]:
# Get list of 1000 Genome samples in the AFR and EUR populations
chr14_dir="/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/"

# AFR Labels from ps2_reference_labels.csv
afr_samples = pd.read_csv(chr14_dir + "AFR_labels.csv", header=None)[0].tolist()
# EUR Labels from ps2_reference_labels.csv
eur_samples = pd.read_csv(chr14_dir + "EUR_labels.csv", header=None)[0].tolist()

chr14_vcf = chr14_dir + "ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"

In [5]:
%%bash
# Extracted chr pos rsid to map to plink output
# Ran in terminal
# MB File
#zcat /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | grep -v '#' | awk -F'\t' '{print $1"\t"$2"\t"$3}' > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_chr_pos_rsid.txt &

In [7]:
chr14_pos_rsid_df=pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_chr_pos_rsid.txt", header=None, sep='\t')
chr14_pos_rsid_df.columns=["CHR","POS","SNP"]
chr14_pos_rsid_df.head()

Unnamed: 0,CHR,POS,SNP
0,14,19000017,rs375700886
1,14,19000050,rs543746158
2,14,19000056,rs561973970
3,14,19000059,rs201622908
4,14,19000060,rs28973059


In [10]:
# Get List of column indexes in the vcf file for the two populations
# VCF file: ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 
# VCF '#CHROM' line file: chr14_vcf_header.txt
chr14_header = pd.read_csv(chr14_dir + "chr14_vcf_header.txt", sep='\t', header=None).values.tolist()[0]

# Get sample names for the two populations
afr_columns = []
eur_columns = []
for idx,c in enumerate(chr14_header):
    if c in afr_samples:
        afr_columns.append(c)
    elif c in eur_samples:
        eur_columns.append(c)
print(len(afr_columns), print(len(eur_columns)))

# Create bcftools command to extract samples by ancestry population
# command  bcftools view -Ou -s sample1,sample2 file.vcf
#print(afr_columns, eur_columns)
afr_columns_formatted = ','.join(afr_columns)
eur_columns_formatted = ','.join(eur_columns)

# Executed bcftools commands in terminal but could also use os.system()
afr_vcf = chr14_dir + "AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
#print("bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(afr_columns_formatted, chr14_vcf, afr_vcf))

eur_vcf = chr14_dir + "EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
#print("bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(eur_columns_formatted, chr14_vcf, eur_vcf))

502
560 None


In [13]:
%%bash

# Calculate Minor Allele frequencies for each population
#plink --vcf /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freq --out /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/afr_genotype_maf
#plink --vcf /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freq --out /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/eur_genotype_maf

# Get Counts of genotype for each population
#plink --vcf /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freqx --out /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/afr_genotype_counts
#plink --vcf /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/EUR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freqx --out /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/eur_genotype_counts

# Extract population allele frequencies:  
# VCF: 21      9411245 rs181691356     C       A  EAS_AF=0;AMR_AF=0.0014;AFR_AF=0.0008;EUR_AF=0.002;SAS_AF=0
# Plink output run on AFR.vcf: rs181691356    A    C    0.0008929     1120
# MAF calculations match :)

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/afr_genotype_maf.log.
Options in effect:
  --freq
  --out /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/afr_genotype_maf
  --vcf /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/AFR.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

385381 MB RAM detected; reserving 192690 MB for main workspace.
--vcf: 1k variants complete.--vcf: 2k variants complete.--vcf: 3k variants complete.--vcf: 4k variants complete.--vcf: 5k variants complete.--vcf: 6k variants complete.--vcf: 7k variants complete.--vcf: 8k variants complete.--vcf: 9k variants complete.--vcf: 10k variants complete.--vcf: 11k variants complete.--vcf: 12k variants complete.--vcf: 13k variants complete.--vcf: 14k variants complete.--vcf: 15k va

In [14]:
# Plink format: Forces Major allele = A2, Minor allele = A1

# Extract AFR minor allele freqs
afr_genotype_maf_df = pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/afr_genotype_maf.frq", delim_whitespace=True)
afr_genotype_maf_df.rename(columns={"MAF":"MAF_AFR"}, inplace=True)
display(afr_genotype_maf_df.head())

# Extract EUR minor allele freqs
eur_genotype_maf_df = pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/eur_genotype_maf.frq", delim_whitespace=True)
eur_genotype_maf_df.rename(columns={"MAF":"MAF_EUR"}, inplace=True)
display(eur_genotype_maf_df.head())

Unnamed: 0,CHR,SNP,A1,A2,MAF_AFR,NCHROBS
0,14,rs375700886,T,C,0.0,1120
1,14,rs543746158,A,G,0.0,1120
2,14,rs561973970,T,A,0.0,1120
3,14,rs201622908,T,G,0.0,1120
4,14,rs28973059,G,C,0.1188,1120


Unnamed: 0,CHR,SNP,A1,A2,MAF_EUR,NCHROBS
0,14,rs375700886,T,C,0.0,1004
1,14,rs543746158,A,G,0.0,1004
2,14,rs561973970,T,A,0.0,1004
3,14,rs201622908,T,G,0.00498,1004
4,14,rs28973059,G,C,0.3884,1004


In [15]:
# P("TT" at SNPi | AB) = freq(T at SNPi in pop A) * freq(T at SNPi in pop B)
genotype_maf_df = pd.merge(afr_genotype_maf_df, eur_genotype_maf_df, how='inner', on=["CHR", "SNP", "A1", "A2"])
genotype_maf_df.drop(columns=["NCHROBS_x", "NCHROBS_y"], inplace=True)

genotype_maf_df = pd.merge(genotype_maf_df,chr14_pos_rsid_df,how='inner', on=["CHR", "SNP"])
display(genotype_maf_df.shape)


(2605637, 7)

In [16]:
display(genotype_maf_df.head())
display(genotype_maf_df.shape)

Unnamed: 0,CHR,SNP,A1,A2,MAF_AFR,MAF_EUR,POS
0,14,rs375700886,T,C,0.0,0.0,19000017
1,14,rs543746158,A,G,0.0,0.0,19000050
2,14,rs561973970,T,A,0.0,0.0,19000056
3,14,rs201622908,T,G,0.0,0.00498,19000059
4,14,rs28973059,G,C,0.1188,0.3884,19000060


(2605637, 7)

In [17]:
genotype_maf_df.to_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.csv", header=True, index=False)

## Calculate BP Probabilities for both AFR and EUR Populations
Put in Python Script because it was too much for jupyter notebooks: /home/lbruce/teams/CSE284_SP21_A00/team3/Git/cse284_project/parse_genotype_mafs.py 

Input: /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.csv
Output: /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.bybp.csv

In [5]:
genotype_maf_bybp_df = pd.read_csv('/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.bybp.csv')

In [6]:
genotype_maf_bybp_df.head(20)

Unnamed: 0,CHR,POS,SNP,A2,A1,MAF_AFR,MAF_EUR,AFR_A,AFR_C,AFR_G,AFR_T,EUR_A,EUR_C,EUR_G,EUR_T
0,14,19000017,rs375700886,C,T,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,14,19000050,rs543746158,G,A,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,14,19000056,rs561973970,A,T,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,14,19000059,rs201622908,G,T,0.0,0.00498,0.0,0.0,1.0,0.0,0.0,0.0,0.99502,0.00498
4,14,19000060,rs28973059,C,G,0.1188,0.3884,0.0,0.8812,0.1188,0.0,0.0,0.6116,0.3884,0.0
5,14,19000079,rs558975054,C,A,0.01964,0.0,0.01964,0.98036,0.0,0.0,0.0,1.0,0.0,0.0
6,14,19000096,rs532880059,G,C,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,14,19000105,rs28971053,A,G,0.0125,0.0,0.9875,0.0,0.0125,0.0,1.0,0.0,0.0,0.0
8,14,19000115,rs569534039,T,G,0.000893,0.005976,0.0,0.0,0.000893,0.999107,0.0,0.0,0.005976,0.994024
9,14,19000134,rs537379767,C,T,0.000893,0.0,0.0,0.999107,0.0,0.000893,0.0,1.0,0.0,0.0
