### Handling SNP Dataset

- Load SNP dataset (Parquet format)
- Examine each example
- Test PyTorch Dataset class
- etc.

In [1]:
import pyarrow
import fastparquet
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline     
sns.set(color_codes=True)

## _`snp` Dataset_

In [3]:
# load dataset
snp_data = pd.read_parquet("data/HO_data_filtered/HumanOrigins2067_filtered.parquet").to_numpy()

In [6]:
# n_sample x n_markers
snp_data.shape

(160858, 2067)

In [4]:
# unique values of xxx
print(np.unique(snp_data))

[0. 1. 2. 9.]


In [27]:
# numpy to dataframe
snp_data_df = pd.DataFrame(snp_data)

In [29]:
snp_data_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2057,2058,2059,2060,2061,2062,2063,2064,2065,2066
count,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,...,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0,160858.0
mean,0.456744,0.464658,0.439245,0.459324,0.447668,0.4428,0.463869,0.439095,0.449483,0.406253,...,0.363165,0.378396,0.352771,0.353921,0.462911,0.351733,0.355842,0.349812,0.354008,0.352273
std,0.857356,0.880367,0.73824,0.869027,0.802155,0.764393,0.866785,0.75582,0.820976,0.775762,...,0.8105,0.87497,0.757996,0.748606,1.216274,0.749528,0.773293,0.737285,0.761679,0.75106
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


In [30]:
snp_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160858 entries, 0 to 160857
Columns: 2067 entries, 0 to 2066
dtypes: float64(2067)
memory usage: 2.5 GB


In [28]:
snp_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2057,2058,2059,2060,2061,2062,2063,2064,2065,2066
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,2.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,1.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
snp_data_df.shape[0]

160858

- *This dataset contains genotype data for `160,858` individuals across `2,067` SNP markers. The values `(0, 1, 2)` represent different genotype states for each SNP, indicating the genetic variation between individuals.*

- *`Rows`: Each row corresponds to a person in the dataset - 160,858 individuals (rows)*

- *`Columns`: Each column corresponds to a specific SNP marker- 2,067 SNP markers (columns)*

- *`Values`: The values represent the genotypes for each individual and SNP. Genotypes are typically represented as:*

    - `0: Homozygous` reference (e.g., the individual has two copies of the reference allele).
    - `1: Heterozygous` (e.g., the individual has one copy of the reference allele and one copy of the alternate allele).
    - `2: Homozygous` alternate (e.g., the individual has two copies of the alternate allele).
    - `9: null` represent null/missing data (alternate `-1`)


In [41]:
# Get unique values in each column
unique_values_per_column = snp_data_df.apply(lambda col: col.unique())
unique_values_per_column

SNP_0                                         [1.0, 0.0, 2.0, 9.0]
SNP_1                                         [1.0, 0.0, 2.0, 9.0]
SNP_2                                         [1.0, 0.0, 2.0, 9.0]
SNP_3                                         [1.0, 0.0, 2.0, 9.0]
SNP_4                                         [1.0, 0.0, 2.0, 9.0]
                                       ...                        
SNP_2063                                      [0.0, 1.0, 2.0, 9.0]
SNP_2064                                      [1.0, 0.0, 2.0, 9.0]
SNP_2065                                      [0.0, 1.0, 2.0, 9.0]
SNP_2066                                      [0.0, 1.0, 2.0, 9.0]
Individual_ID    [AD_066, AD_064, AD_505, AD_523, AD_500, AD_51...
Length: 2068, dtype: object

In [29]:
# Check for missing values
missing_values = snp_data_df.isnull().sum()  # Sum of missing values per column
print(missing_values)

0       0
1       0
2       0
3       0
4       0
       ..
2062    0
2063    0
2064    0
2065    0
2066    0
Length: 2067, dtype: int64


In [2]:
def plot_genotype_hist(genotypes, filename):
    '''
    Plots a histogram of all genotype values in the flattened genotype matrix.

    :param genotypes: array of genotypes
    :param filename: filename (including path) to save plot to
    '''
    plt.figure()
    unique, counts = np.unique(genotypes, return_counts=True)
    d = zip(unique, counts)
    plt.hist(np.ndarray.flatten(genotypes), bins=50)
    if len(unique) < 5:
        plt.title(", ".join(["{:.2f} : {}".format(u, c) for (u, c) in d]), fontdict={'fontsize': 9})

    plt.savefig("{0}.pdf".format(filename))
    plt.close()

## *Minor Allele Frequency*
MAF tells you how common the minor allele (e.g., 1 or 2) is in the population.

In [44]:
def maf(snp_column):
    counts = snp_column.value_counts(normalize=True)
    return min(counts)  # Minor allele frequency is the least frequent value

maf_values = snp_data_df.drop('Individual_ID', axis=1).apply(maf)
print(maf_values)

SNP_0       0.004725
SNP_1       0.005191
SNP_2       0.001946
SNP_3       0.004992
SNP_4       0.003444
              ...   
SNP_2062    0.002686
SNP_2063    0.003164
SNP_2064    0.002462
SNP_2065    0.002928
SNP_2066    0.002754
Length: 2067, dtype: float64


In [32]:
snp_data_df.columns

Index(['SNP_0', 'SNP_1', 'SNP_2', 'SNP_3', 'SNP_4', 'SNP_5', 'SNP_6', 'SNP_7',
       'SNP_8', 'SNP_9',
       ...
       'SNP_2058', 'SNP_2059', 'SNP_2060', 'SNP_2061', 'SNP_2062', 'SNP_2063',
       'SNP_2064', 'SNP_2065', 'SNP_2066', 'Individual_ID'],
      dtype='object', length=2068)

In [33]:
snp_data_df["Individual_ID"] = fam_df["Individual_ID"]  # Add Individual ID

# .Fam + snp
merged_df = fam_df.merge(snp_data_df, on="Individual_ID")  # Merge phenotype & genotypes

In [34]:
merged_df

Unnamed: 0,Family_ID,Individual_ID,Paternal_ID,Maternal_ID,Sex,Phenotype,SNP_0,SNP_1,SNP_2,SNP_3,...,SNP_2057,SNP_2058,SNP_2059,SNP_2060,SNP_2061,SNP_2062,SNP_2063,SNP_2064,SNP_2065,SNP_2066
0,AA,AD_066,0,0,0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,AA,AD_064,0,0,0,1,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,AA,AD_505,0,0,0,1,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,AA,AD_523,0,0,0,1,2.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,AA,AD_500,0,0,0,1,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,Zapotec,Zapo0082,0,0,0,1,0.0,0.0,0.0,1.0,...,2.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
2063,Zapotec,Zapo0083,0,0,0,1,0.0,0.0,0.0,1.0,...,2.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0
2064,Zapotec,Zapo0091,0,0,0,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2065,Zapotec,Zapo0095,0,0,0,1,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0


In [35]:
# Melt the SNP dataframe to long format for merging with BIM
snp_long = merged_df.melt(id_vars=['Individual_ID', 'Family_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype'],
                          var_name="SNP_Column",
                          value_name="Genotype")

In [36]:
snp_long

Unnamed: 0,Individual_ID,Family_ID,Paternal_ID,Maternal_ID,Sex,Phenotype,SNP_Column,Genotype
0,AD_066,AA,0,0,0,1,SNP_0,1.0
1,AD_064,AA,0,0,0,1,SNP_0,0.0
2,AD_505,AA,0,0,0,1,SNP_0,0.0
3,AD_523,AA,0,0,0,1,SNP_0,2.0
4,AD_500,AA,0,0,0,1,SNP_0,1.0
...,...,...,...,...,...,...,...,...
4272484,Zapo0082,Zapotec,0,0,0,1,SNP_2066,1.0
4272485,Zapo0083,Zapotec,0,0,0,1,SNP_2066,1.0
4272486,Zapo0091,Zapotec,0,0,0,1,SNP_2066,0.0
4272487,Zapo0095,Zapotec,0,0,0,1,SNP_2066,0.0
