## Produce a PCA of the real data

In [1]:
import numpy as np
import allel
import seaborn as sns
import pandas as pd
import sys
sys.path.append('../../../')
from mxbgenomes.utils import load_populations_info

In [2]:
# Restrict the population table to the subpopulations used in the PCA and
# collect the names of the samples that belong to them.
pca_subpopulations = ['MXL', 'MXB', 'CHB', 'YRI', 'IBS', 'PEL', 'CLM', 'PUR']
popinfo = load_populations_info('../../../')
in_pca = popinfo['Subpopulation'].isin(pca_subpopulations)
popinfo = popinfo[in_pca]
samples = popinfo['Samplename'].tolist()

In [3]:
# Read the VCF file for chr22 (Hardy-Weinberg-filtered SNPs, per the path below)
vcf = allel.read_vcf('../../../results/data/210713-HardyW-filters/1TGP_and_50MXB-chr22-snps-vep-mask-HW-GRCh38.vcf.gz')


In [4]:
# Subset the samples in the VCF, keeping them in VCF column order.
# A set gives constant-time membership tests instead of scanning the whole
# `samples` list once per VCF sample, and the mask is computed only once.
samples_wanted = set(samples)
samples_vcf_indicator = [x in samples_wanted for x in vcf['samples']]
samples_vcf = [x for x, keep in zip(vcf['samples'], samples_vcf_indicator) if keep]
# `samples` is in popinfo order, not VCF order; it is removed here, presumably
# so that later cells can only label genotype columns with `samples_vcf`.
del samples

In [5]:
# Genotype calls restricted to the selected samples (boolean mask on the
# sample axis), wrapped as a scikit-allel GenotypeArray.
g = allel.GenotypeArray(vcf['calldata/GT'][:, samples_vcf_indicator, :])

In [6]:
# Count alleles at each variant: one row per variant, one column per allele
# index (0, 1, 2 in the table displayed below).
ac = g.count_alleles()
ac

Unnamed: 0,0,1,2,Unnamed: 4
0,1427,3,0,
1,1429,1,0,
2,1428,2,0,
...,...,...,...,...
351863,1429,1,0,
351864,1428,2,0,
351865,1429,1,0,


In [7]:
# Drop multiallelic SNPs and singletons:
#  - the highest allele index observed must be 1 (biallelic), and
#  - the rarer of allele 0 / allele 1 must be observed more than once.
is_biallelic = ac.max_allele() == 1
minor_count_above_one = ac[:, :2].min(axis=1) > 1
flt = is_biallelic & minor_count_above_one
gf = g.compress(flt, axis=0)
gf

Unnamed: 0,0,1,2,3,4,...,710,711,712,713,714,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,1/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
230582,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
230583,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
230584,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


In [8]:
# Transform the genotype data into a 2-dimensional matrix in which each cell
# holds the number of non-reference alleles in that genotype call
# (displayed below as an int8 array). This is the matrix that is LD-pruned
# and then used as the input to PCA.

gn = gf.to_n_alt()
gn

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [9]:
# Removing correlated features (LD pruning)
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Repeatedly drop variants flagged as linked by ``allel.locate_unlinked``.

    Parameters
    ----------
    gn : array-like
        Matrix with one row per variant; must expose ``shape`` and
        ``compress(mask, axis=0)``.
    size, step, threshold
        Forwarded unchanged to ``allel.locate_unlinked``.
    n_iter : int
        Number of pruning passes to run; each pass works on the output
        of the previous one.

    Returns
    -------
    The matrix restricted to the rows retained after the last pass
    (``gn`` itself when ``n_iter`` is 0).

    A one-line summary of retained/removed variant counts is printed
    for every pass.
    """
    pruned = gn
    for iteration in range(1, n_iter + 1):
        keep_mask = allel.locate_unlinked(pruned, size=size, step=step, threshold=threshold)
        n_kept = np.count_nonzero(keep_mask)
        n_dropped = pruned.shape[0] - n_kept
        print('iteration', iteration, 'retaining', n_kept, 'removing', n_dropped, 'variants')
        # Keep only the rows (variants) marked as unlinked for the next pass.
        pruned = pruned.compress(keep_mask, axis=0)
    return pruned

# One pruning pass over the filtered matrix; size=500, step=200 and
# threshold=.1 are passed through to allel.locate_unlinked by ld_prune.
gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=1)


iteration 1 retaining 52947 removing 177638 variants


In [10]:
# PCA on the LD-pruned matrix: 10 components with the 'patterson' scaler.
# coords1 holds the sample coordinates, model1 the fitted model.
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')

In [11]:
# Explained-variance ratio of the first component (real data), as a percentage
model1.explained_variance_ratio_[0]*100

1.508464477956295

In [12]:
# Explained-variance ratio of the second component (real data), as a percentage
model1.explained_variance_ratio_[1]*100

0.870104692876339

In [13]:
# Explained-variance ratio of the third component (real data), as a percentage
model1.explained_variance_ratio_[2]*100

0.654530618339777

In [14]:
# Name one column per principal component (PC_1, PC_2, ...). The count is
# taken from the coordinate matrix itself so it cannot drift out of sync
# with the n_components value passed to allel.pca.
colnames = ['PC_' + str(x) for x in range(1, coords1.shape[1] + 1)]
coord_pca = pd.DataFrame(coords1, columns=colnames)
# samples_vcf is in VCF column order, the same order used to build the
# genotype matrix that was fed to the PCA.
coord_pca['Samplename'] = samples_vcf

In [15]:
# Attach the population metadata to the PCA coordinates (merge on the
# columns the two tables share) and write the result to disk.
pca_with_popinfo = coord_pca.merge(popinfo)
pca_with_popinfo.to_csv('results/PCA-real-data.csv', index=False)

# PCA from simulated data

In [16]:
# Load the simulated chr22 genomes. NOTE: this rebinds `vcf`, replacing the
# real-data VCF loaded earlier in the notebook.
vcf = allel.read_vcf('data/simulated-genomes-chr22.vcf')

In [17]:
# Same procedure as for the real data, except that every sample in the
# simulated VCF is used (no sample subsetting). NOTE: this rebinds g, ac, flt,
# gf, gn, gnu, coords1, model1 and coord_pca from the real-data analysis.
g = allel.GenotypeArray(vcf['calldata/GT'])
ac = g.count_alleles()

# Keep biallelic SNPs where both allele 0 and allele 1 are seen more than once.
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)

# Non-reference allele counts per call -> LD pruning -> PCA.
gn = gf.to_n_alt()
gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=1)
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')

# One column per principal component; the count comes from the coordinate
# matrix so it always matches the n_components used above.
colnames = ['PC_' + str(x) for x in range(1, coords1.shape[1] + 1)]
coord_pca = pd.DataFrame(coords1, columns=colnames)

# All VCF samples were used, so the VCF sample order labels the rows directly.
coord_pca['Samplename'] = vcf['samples']


iteration 1 retaining 86981 removing 250515 variants


In [18]:
# Write the simulated-data PCA coordinates (PC columns plus Samplename) to disk
coord_pca.to_csv('results/PCA-simulated-data.csv', index=False)

In [19]:
# Explained-variance ratio of the first component (simulated data), as a percentage
model1.explained_variance_ratio_[0]*100

1.325659453868866

In [20]:


# Explained-variance ratio of the second component (simulated data), as a percentage
model1.explained_variance_ratio_[1]*100


0.7790993433445692

In [21]:
# Explained-variance ratio of the third component (simulated data), as a percentage
model1.explained_variance_ratio_[2]*100

0.6411112844944