In [None]:
# Environment setup: install third-party packages and fetch the course repository.
# qqman is imported below for the Q-Q plot; pyro-ppl is not imported in the cells
# visible here (presumably used later in the notebook — confirm).
# NOTE(review): versions are unpinned, and re-running `git clone` reports an error
# once the `digag2022` directory already exists.
!pip install qqman
!pip install pyro-ppl
!git clone https://github.com/referreira-wisc/digag2022.git

In [None]:
import os

# Move into the lab folder of the cloned repository so that the data files can be
# opened with bare relative names. The change is guarded so the cell is idempotent:
# a plain relative os.chdir() raises FileNotFoundError when the cell is re-run,
# because the working directory is then already inside the lab folder.
_lab_dir = os.path.join('digag2022', 'LabGenomics')
if os.path.isdir(_lab_dir):
    os.chdir(_lab_dir)
elif not os.path.normpath(os.getcwd()).endswith(os.path.normpath(_lab_dir)):
    # Neither reachable from here nor already the current directory: fail loudly
    # with the same exception type os.chdir() would have raised.
    raise FileNotFoundError(
        f"Lab directory '{_lab_dir}' not found under '{os.getcwd()}'; "
        "run the setup cell that clones the repository first."
    )

### Import required libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from qqman import qqman

## Data preparation

### Read data files

In [None]:
# Load the three space-delimited input tables from the current working directory,
# in the order genotype -> SNP map -> phenotype.
genotype, snp_map, phenotype = (
    pd.read_csv(path, sep=' ')
    for path in ('genotype.txt', 'snp_map.txt', 'phenotype.txt')
)

### Recode genotypes and convert to transposed matrix (SNPs on columns)

In [None]:
# Recode every genotype call as the number of B alleles it carries. Both
# heterozygote spellings ("AB" and "BA") map to 1. A single dict-based replace
# does this in one pass over the frame instead of four.
genotype_codes = {"AA": 0, "AB": 1, "BA": 1, "BB": 2}
genotype = genotype.replace(genotype_codes)

# Convert to an integer ndarray and transpose so that SNPs end up on the columns.
# Any cell that is not one of the four codes above (and not already an integer)
# makes the integer cast raise ValueError.
genotype = np.array(genotype, dtype=int).T

### Calculate minor allele frequencies

In [None]:
# Number of rows of the transposed genotype matrix (one row per genotyped sample).
n = len(genotype)

# Each entry counts 0, 1 or 2 copies of the B allele, so the column mean divided
# by two is the B-allele frequency of that SNP; p is the complementary frequency.
q = genotype.mean(axis=0) / 2
p = 1 - q

# The minor allele frequency is whichever of the two frequencies is smaller.
maf = np.where(p < q, p, q)

### Plot minor allele frequencies histogram

In [None]:
# Histogram of the minor allele frequency of every SNP, before any filtering.
fig, ax = plt.subplots()
ax.hist(maf, histtype='bar', facecolor='w', edgecolor='k')
ax.set_xlabel('Minor Allele Frequency')
ax.set_ylabel('Frequency')
ax.set_title('Before QC')
plt.show()

### Plot minor allele frequency histogram after removing SNPs with MAF < 1%

In [None]:
# Same histogram, restricted to SNPs whose minor allele frequency is at least 1%.
maf_kept = maf[maf >= 0.01]
fig, ax = plt.subplots()
ax.hist(maf_kept, histtype='bar', facecolor='w', edgecolor='k')
ax.set(
    xlabel='Minor Allele Frequency',
    ylabel='Frequency',
    title='After QC',
)
plt.show()

### Perform Chi-squared test of Hardy-Weinberg proportions

In [None]:
# Observed genotype counts per SNP (summed down each column):
# code 0 = AA, code 1 = AB/BA, code 2 = BB.
pp = np.sum(genotype == 0, axis=0)
pq = np.sum(genotype == 1, axis=0)
qq = np.sum(genotype == 2, axis=0)

# Expected counts under Hardy-Weinberg proportions (p^2, 2pq, q^2) times n.
pp_expected = p * p * n
pq_expected = 2 * p * q * n
qq_expected = q * q * n

# Pearson chi-squared statistic summed over the three genotype classes.
# A SNP with p == 0 or q == 0 has zero expected counts, so its terms are 0/0 and
# the statistic is NaN; only that "invalid value" warning is silenced, the NaN
# result itself is kept (NaN compares False against any threshold).
with np.errstate(invalid='ignore'):
    chi2_stat = (
        (pp - pp_expected) ** 2 / pp_expected
        + (pq - pq_expected) ** 2 / pq_expected
        + (qq - qq_expected) ** 2 / qq_expected
    )

# Upper-tail p-value with 1 degree of freedom. The survival function is used
# instead of 1 - cdf: the subtraction cancels to exactly 0 for very small
# p-values, which would turn into inf on a -log10 scale.
chi2_p = stats.chi2.sf(chi2_stat, 1)

### Plot Q-Q plot for Hardy-Weinberg proportions

In [None]:
# Draw a Q-Q plot of the per-SNP Hardy-Weinberg test p-values with qqman.
# show=True presumably renders the figure immediately — confirm against the qqman docs.
# NOTE(review): chi2_p can contain NaN for SNPs whose expected genotype counts are
# zero (0/0 in the chi-squared terms) — confirm how qqman handles such entries.
qqman.qqplot(chi2_p, show=True)

## Data editing

### Eliminate markers with MAF below 1% or with a p-value below 1e-10 on the Chi-squared test of Hardy-Weinberg proportions

In [None]:
# A SNP is retained only when it passes both quality-control filters:
# minor allele frequency of at least 1% and Hardy-Weinberg p-value of at least 1e-10.
maf_pass = maf >= 0.01
hwe_pass = chi2_p >= 1e-10
snps_ok = maf_pass & hwe_pass

# Keep every row, but only the columns (SNPs) flagged as passing.
markers = genotype[:, snps_ok]