## Produce a PCA of the real data

In [1]:
import numpy as np
import allel
import seaborn as sns
import pandas as pd
import sys
sys.path.append('../../../')
from mxbgenomes.utils import load_populations_info

In [2]:
# Restrict the population table to the five groups included in the PCA
# and collect the names of their samples.
popinfo = load_populations_info('../../../')
in_pca_populations = popinfo.Subpopulation.isin(['MXL', 'MXB', 'CHB', 'YRI', 'IBS'])
popinfo = popinfo[in_pca_populations]
samples = popinfo.Samplename.tolist()

In [3]:
# Read the chr22 VCF file
vcf_path = '../../../results/data/210713-HardyW-filters/1TGP_and_50MXB-chr22-snps-vep-mask-HW-GRCh38.vcf.gz'
vcf = allel.read_vcf(vcf_path)


In [4]:
# Subset the samples in the VCF, keeping the VCF column order.
# A set gives constant-time membership tests (the list was scanned once per
# VCF sample, twice over), and `samples` is no longer deleted so this cell
# can be re-run without first re-running the cell that defines it.
wanted_samples = set(samples)
samples_vcf_indicator = [x in wanted_samples for x in vcf['samples']]
samples_vcf = [x for x, keep in zip(vcf['samples'], samples_vcf_indicator) if keep]

In [5]:
# Genotype calls of the selected samples only (second axis = samples)
g = allel.GenotypeArray(
    vcf['calldata/GT'][:, samples_vcf_indicator, :]
)

In [6]:
# Count how many times each allele is observed at every variant
# (one row per variant, one column per allele index).
ac = g.count_alleles()
ac

Unnamed: 0,0,1,2,Unnamed: 4
0,861,3,0,
1,864,0,0,
2,862,2,0,
...,...,...,...,...
351863,863,1,0,
351864,862,2,0,
351865,864,0,0,


In [7]:
# Keep variants whose highest observed allele index is 1 (drops multiallelic
# sites) and whose reference and first alternate allele are each seen more
# than once (drops singletons).
highest_allele_is_first_alt = ac.max_allele() == 1
both_alleles_seen_twice = ac[:, :2].min(axis=1) > 1
flt = highest_allele_is_first_alt & both_alleles_seen_twice
gf = g.compress(flt, axis=0)
gf

Unnamed: 0,0,1,2,3,4,...,427,428,429,430,431,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
199503,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
199504,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
199505,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


In [8]:
# Transform the genotype data into a 2-dimensional matrix where
# each cell holds the number of non-reference alleles per call.
# This matrix is the input to the PCA.

gn = gf.to_n_alt()
gn

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [9]:
# Removing correlated features (LD pruning)
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Iteratively remove variants flagged as linked by ``allel.locate_unlinked``.

    Each pass asks ``allel.locate_unlinked`` which rows of the current matrix
    are unlinked, prints how many variants are retained and removed, and keeps
    only the unlinked rows. Pruning stops early once a pass removes nothing,
    because every later pass would see the same matrix and give the same answer.

    Parameters
    ----------
    gn : array_like
        Alternate-allele count matrix with variants on the first axis. It must
        provide ``shape`` and ``compress(mask, axis=0)``.
    size : int
        Window size forwarded to ``allel.locate_unlinked``.
    step : int
        Window step forwarded to ``allel.locate_unlinked``.
    threshold : float, optional
        LD threshold forwarded to ``allel.locate_unlinked``.
    n_iter : int, optional
        Maximum number of pruning passes.

    Returns
    -------
    array_like
        The matrix restricted to the variants that survived every pass
        (``gn`` itself when ``n_iter`` is 0).
    """
    for i in range(n_iter):
        loc_unlinked = allel.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
        if n_remove == 0:
            # Converged: the matrix did not change, so further passes are no-ops.
            break
    return gn

# Up to three pruning passes using windows of 500 variants advanced 200
# variants at a time, with an r**2 threshold of 0.1.
gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=3)


iteration 1 retaining 35457 removing 164049 variants
iteration 2 retaining 14640 removing 20817 variants
iteration 3 retaining 9472 removing 5168 variants


In [10]:
# PCA on the LD-pruned genotypes: 10 components, Patterson scaling
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')

In [11]:
# Percentage of variance explained by the first component (real data)
model1.explained_variance_ratio_[0]*100

2.6630904525518417

In [12]:
# Percentage of variance explained by the second component (real data)
model1.explained_variance_ratio_[1]*100

1.987387239933014

In [13]:
# One label per component returned by the PCA (PC_1, PC_2, ...). The count is
# taken from coords1 so the labels stay in sync with n_components instead of
# relying on a hard-coded 10.
colnames = ['PC_' + str(x) for x in range(1, coords1.shape[1] + 1)]
coord_pca = pd.DataFrame(coords1, columns=colnames)
# samples_vcf follows the same order as the genotype columns used for the PCA
coord_pca['Samplename'] = samples_vcf

In [14]:
# Attach the population information to each sample, then write the table
pca_with_popinfo = coord_pca.merge(popinfo)
pca_with_popinfo.to_csv('results/PCA-real-data.csv', index=False)

## PCA from simulated data

In [18]:
# Read the simulated-genomes VCF (rebinds `vcf`, replacing the real-data VCF)
vcf = allel.read_vcf('data/simulated-genomes-chr22.vcf')

In [19]:
# Same procedure as for the real data, applied to every sample in the
# simulated VCF (no sample subsetting here).
# NOTE: this rebinds g, ac, flt, gf, gn, gnu, coords1, model1, colnames and
# coord_pca, so the real-data objects are no longer available afterwards.
g = allel.GenotypeArray(vcf['calldata/GT'])
ac = g.count_alleles()

# keep biallelic, non-singleton variants
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)

# alternate-allele counts -> LD pruning -> PCA
gn = gf.to_n_alt()
gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=3)
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')

# Component labels are derived from the PCA output rather than a hard-coded
# 10, so they stay in sync with n_components.
colnames = ['PC_' + str(x) for x in range(1, coords1.shape[1] + 1)]
coord_pca = pd.DataFrame(coords1, columns=colnames)
coord_pca['Samplename'] = vcf['samples']


iteration 1 retaining 52728 removing 210994 variants
iteration 2 retaining 19613 removing 33115 variants
iteration 3 retaining 10887 removing 8726 variants


In [20]:
# Write the simulated-data PCA coordinates and sample names to disk
coord_pca.to_csv('results/PCA-simulated-data.csv', index=False)

In [22]:
# Percentage of variance explained by the first component
# (model1 is the simulated-data fit when cells are run top to bottom)
model1.explained_variance_ratio_[0]*100

2.6914456859230995

In [21]:


# Percentage of variance explained by the second component
# (model1 is the simulated-data fit when cells are run top to bottom)
model1.explained_variance_ratio_[1]*100


1.9352370873093605