In [1]:
import os
import pandas as pd
import numpy as np

In [37]:
# Get list of 1000 Genome samples in the AFR and EUR populations
#chr21_dir="/home/lbruce/project_files/chromosome_21_files/"
chr21_dir="/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"

# AFR Labels from ps2_reference_labels.csv
afr_samples = pd.read_csv(chr21_dir + "AFR_labels.csv", header=None)[0].tolist()
# EUR Labels from ps2_reference_labels.csv
eur_samples = pd.read_csv(chr21_dir + "EUR_labels.csv", header=None)[0].tolist()

chr21_vcf = chr21_dir + "ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"

In [None]:
%%bash
# Extracted chr pos rsid to map to plink output
# Ran in terminal
#zcat /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz | grep -v '#' | awk -F'\t' '{print $1"\t"$2"\t"$3}' > /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_chr_pos_rsid.txt &

In [48]:
chr21_pos_rsid_df=pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_chr_pos_rsid.txt", header=None, sep='\t')
chr21_pos_rsid_df.columns=["CHR","POS","SNP"]
chr21_pos_rsid_df.head()

Unnamed: 0,CHR,POS,SNP
0,21,9411239,rs559462325
1,21,9411245,rs181691356
2,21,9411264,rs548263598
3,21,9411267,rs561987868
4,21,9411302,rs531010746


In [47]:
# Get List of column indexes in the vcf file for the two populations
# VCF file: ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz 
# VCF '#CHROM' line file: chr21_vcf_header.txt
chr21_header = pd.read_csv(chr21_dir + "chr21_vcf_header.txt", sep='\t', header=None).values.tolist()[0]

# Get sample names for the two populations
afr_columns = []
eur_columns = []
for idx,c in enumerate(chr21_header):
    if c in afr_samples:
        afr_columns.append(c)
    elif c in eur_samples:
        eur_columns.append(c)
print(len(afr_columns), print(len(eur_columns)))

# Create bcftools command to extract samples by ancestry population
# command  bcftools view -Ou -s sample1,sample2 file.vcf
#print(afr_columns, eur_columns)
afr_columns_formatted = ','.join(afr_columns)
eur_columns_formatted = ','.join(eur_columns)

# Executed bcftools commands in terminal but could also use os.system()
chr21_afr_vcf = chr21_dir + "AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
#print("bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(afr_columns_formatted, chr21_vcf, chr21_afr_vcf))

chr21_eur_vcf = chr21_dir + "EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
#print("bcftools view -s {0} {1} | bgzip -c > {2} && tabix -s1 -b2 -e2 {2}".format(eur_columns_formatted, chr21_vcf, chr21_eur_vcf))

502
560 None


In [48]:
%%bash

# Calculate Minor Allele frequencies for each population
plink --vcf /home/lbruce/project_files/chromosome_21_files/AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freq --out /home/lbruce/project_files/chromosome_21_files/afr_genotype_maf
plink --vcf /home/lbruce/project_files/chromosome_21_files/EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freq --out /home/lbruce/project_files/chromosome_21_files/eur_genotype_maf

# Get Counts of genotype for each population
plink --vcf /home/lbruce/project_files/chromosome_21_files/AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freqx --out /home/lbruce/project_files/chromosome_21_files/afr_genotype_counts
plink --vcf /home/lbruce/project_files/chromosome_21_files/EUR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz --freqx --out /home/lbruce/project_files/chromosome_21_files/eur_genotype_counts

# Extract population allele frequencies:  
# VCF: 21      9411245 rs181691356     C       A  EAS_AF=0;AMR_AF=0.0014;AFR_AF=0.0008;EUR_AF=0.002;SAS_AF=0
# Plink output run on AFR.vcf: rs181691356    A    C    0.0008929     1120
# MAF calculations match :)

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /home/lbruce/project_files/chromosome_21_files/afr_genotype_maf.log.
Options in effect:
  --freq
  --out /home/lbruce/project_files/chromosome_21_files/afr_genotype_maf
  --vcf /home/lbruce/project_files/chromosome_21_files/AFR.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

385381 MB RAM detected; reserving 192690 MB for main workspace.
--vcf: 1k variants complete.--vcf: 2k variants complete.--vcf: 3k variants complete.--vcf: 4k variants complete.--vcf: 5k variants complete.--vcf: 6k variants complete.--vcf: 7k variants complete.--vcf: 8k variants complete.--vcf: 9k variants complete.--vcf: 10k variants complete.--vcf: 11k variants complete.--vcf: 12k variants complete.--vcf: 13k variants complete.--vcf: 14k variants complete.--vcf: 15k variants complete.--vcf: 16k variants compl

In [30]:
# Plink format: Forces Major allele = A2, Minor allele = A1

# Extract AFR minor allele freqs
afr_genotype_maf_df = pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/afr_genotype_maf.frq", delim_whitespace=True)
afr_genotype_maf_df.rename(columns={"MAF":"MAF_AFR"}, inplace=True)
display(afr_genotype_maf_df.head())

# Extract EUR minor allele freqs
eur_genotype_maf_df = pd.read_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/eur_genotype_maf.frq", delim_whitespace=True)
eur_genotype_maf_df.rename(columns={"MAF":"MAF_EUR"}, inplace=True)
display(eur_genotype_maf_df.head())

Unnamed: 0,CHR,SNP,A1,A2,MAF_AFR,NCHROBS
0,21,rs559462325,A,G,0.0,1120
1,21,rs181691356,A,C,0.000893,1120
2,21,rs548263598,C,A,0.000893,1120
3,21,rs561987868,T,G,0.0,1120
4,21,rs531010746,T,G,0.01161,1120


Unnamed: 0,CHR,SNP,A1,A2,MAF_EUR,NCHROBS
0,21,rs559462325,A,G,0.0,1004
1,21,rs181691356,A,C,0.001992,1004
2,21,rs548263598,C,A,0.0,1004
3,21,rs561987868,T,G,0.0,1004
4,21,rs531010746,T,G,0.0,1004


In [49]:
# P("TT" at SNPi | AB) = freq(T at SNPi in pop A) * freq(T at SNPi in pop B)
genotype_maf_df = pd.merge(afr_genotype_maf_df, eur_genotype_maf_df, how='inner', on=["CHR", "SNP", "A1", "A2"])
genotype_maf_df.drop(columns=["NCHROBS_x", "NCHROBS_y"], inplace=True)

genotype_maf_df = pd.merge(genotype_maf_df,chr21_pos_rsid_df,how='inner', on=["CHR", "SNP"])
display(genotype_maf_df.shape)


(1083666, 7)

In [50]:
display(genotype_maf_df.head())
display(genotype_maf_df.shape)

Unnamed: 0,CHR,SNP,A1,A2,MAF_AFR,MAF_EUR,POS
0,21,rs559462325,A,G,0.0,0.0,9411239
1,21,rs181691356,A,C,0.000893,0.001992,9411245
2,21,rs548263598,C,A,0.000893,0.0,9411264
3,21,rs561987868,T,G,0.0,0.0,9411267
4,21,rs531010746,T,G,0.01161,0.0,9411302


(1083666, 7)

In [51]:
genotype_maf_df.to_csv("/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.csv", header=True, index=False)

## Calculate BP Probabilities for both AFR and EUR Populations
Put in Python Script because it was too much for jupyter notebooks: parse_genotype_mafs.py 

Input: /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.csv
Output: /home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.bybp.csv

In [52]:
genotype_maf_bybp_df = pd.read_csv('/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.bybp.csv')

In [53]:
genotype_maf_bybp_df.head(20)

Unnamed: 0,CHR,POS,SNP,REF,ALT,MAF_AFR,MAF_EUR,AFR_A,AFR_C,AFR_G,AFR_T,EUR_A,EUR_C,EUR_G,EUR_T
0,21,9411239,rs559462325,G,A,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,21,9411245,rs181691356,C,A,0.000893,0.001992,0.000893,0.999107,0.0,0.0,0.001992,0.998008,0.0,0.0
2,21,9411264,rs548263598,A,C,0.000893,0.0,0.999107,0.000893,0.0,0.0,1.0,0.0,0.0,0.0
3,21,9411267,rs561987868,G,T,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,21,9411302,rs531010746,G,T,0.01161,0.0,0.0,0.0,0.98839,0.01161,0.0,0.0,1.0,0.0
5,21,9411313,rs550852792,G,A,0.01339,0.0,0.01339,0.0,0.98661,0.0,0.0,0.0,1.0,0.0
6,21,9411332,rs571137411,G,T,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,21,9411347,rs539713234,G,C,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,21,9411356,rs552178335,G,A,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,21,9411358,rs565663130,C,T,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


## OLD REMOVE LATER

In [None]:
#genotype_maf_df['P_AFRmin_EURmin'] = genotype_maf_df['MAF_AFR'] * genotype_maf_df['MAF_EUR']
#genotype_maf_df['P_AFRmaj_EURmin'] = (1-genotype_maf_df['MAF_AFR']) * genotype_maf_df['MAF_EUR']
#genotype_maf_df['P_AFRmin_EURmaj'] = genotype_maf_df['MAF_AFR'] * (1 - genotype_maf_df['MAF_EUR'])
#genotype_maf_df['P_AFRmaj_EURmaj'] = (1- genotype_maf_df['MAF_AFR']) * (1 - genotype_maf_df['MAF_EUR'])

#genotype_maf_df['P_EURmaj_EURmaj'] = 2 * (1 - genotype_maf_df['MAF_EUR'])
#genotype_maf_df['P_AFRmaj_AFRmaj'] = 2 * (1 - genotype_maf_df['MAF_AFR'])
#genotype_maf_df['P_EURmin_EURmin'] = 2 * (genotype_maf_df['MAF_EUR'])
#genotype_maf_df['P_AFRmin_AFRmin'] = 2 * (genotype_maf_df['MAF_AFR'])

In [None]:
"""
genotype_maf_df['AFR_A'] = np.nan
genotype_maf_df['AFR_C'] = np.nan
genotype_maf_df['AFR_G'] = np.nan
genotype_maf_df['AFR_T'] = np.nan

genotype_maf_df['EUR_A'] = np.nan
genotype_maf_df['EUR_C'] = np.nan
genotype_maf_df['EUR_G'] = np.nan
genotype_maf_df['EUR_T'] = np.nan
"""

In [None]:

# Add empty columns that will contain probability by base pair


from collections import OrderedDict

def calculate_bp_prob(row, genotype_maf_df):
    
    # Start with all bps having a 0 probability 
    # and only update if bp is in A1 or A2
    afr_prob_dict = OrderedDict([('A',0), ('C',0), ('G',0), ('T',0)])
    eur_prob_dict = OrderedDict([('A',0), ('C',0), ('G',0), ('T',0)])
    
    # A1 = Alternate allele, P = MAF
    # A2 = Reference allele, P = 1-MAF
    
    # Set reference alelle probability
    afr_prob_dict[row['A2']] = 1 - float(row['MAF_AFR'])
    eur_prob_dict[row['A2']] = 1 - float(row['MAF_AFR'])
      
    # Set alternate alelle probability
    afr_prob_dict[row['A1']] = float(row['MAF_AFR'])
    eur_prob_dict[row['A1']] = float(row['MAF_AFR'])
    
    #print(afr_prob_dict)
    #print(eur_prob_dict)
    return(afr_prob_dict, eur_prob_dict)


# Loop through all SNPs and calculate the Values
i = 0
AFR_A =[]
AFR_C =[]
AFR_G =[]
AFR_T =[]
EUR_A =[]
EUR_C =[]
EUR_G =[]
EUR_T =[]
for index, row in genotype_maf_df.iterrows():
    #print(row)
    (afr_prob_dict, eur_prob_dict) = calculate_bp_prob(row, genotype_maf_df)
    AFR_A.append(afr_prob_dict['A'])
    AFR_C.append(afr_prob_dict['C'])
    AFR_G.append(afr_prob_dict['G'])
    AFR_T.append(afr_prob_dict['T'])
    
    EUR_A.append(eur_prob_dict['A'])
    EUR_C.append(eur_prob_dict['C'])
    EUR_G.append(eur_prob_dict['G'])
    EUR_T.append(eur_prob_dict['T'])
        
    #break

print(AFR_A)
print(len(AFR_A))
