In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def get_rsID(INFO):
    """Return the value of the ``avsnp150`` annotation in a VCF INFO string.

    Parameters
    ----------
    INFO : str
        Semicolon-separated ``key=value`` annotations (the INFO column).

    Returns
    -------
    str
        The value of the first field whose key starts with ``avsnp150``.
        ``'.'`` is returned when the field is absent or has no value, which is
        the same placeholder the caller already treats as "no rsID".
    """
    for field in INFO.split(';'):
        key, _, value = field.partition('=')
        if key.startswith('avsnp150'):
            # An empty value is reported with the usual missing placeholder.
            return value or '.'
    # No avsnp150 annotation at all: report it as missing instead of raising.
    return '.'

def get_maxMAF(INFO):
    """Return the largest allele frequency annotated in a VCF INFO string.

    Only fields whose name starts with ``ExAC_nontcga_ALL`` or
    ``esp6500siv2_all`` are considered.

    Parameters
    ----------
    INFO : str
        Semicolon-separated ``key=value`` annotations (the INFO column).

    Returns
    -------
    float
        The maximum of the frequencies found. A missing value (``.`` or an
        empty string) counts as 0.0, and 0.0 is also returned when neither
        field is present, so unannotated variants are handled the same way as
        variants annotated with ``.``.
    """
    freq_columns = ('ExAC_nontcga_ALL', 'esp6500siv2_all')
    freqs = []
    for info in INFO.split(';'):
        if info.startswith(freq_columns):
            value = info.partition('=')[2]
            # '.' (or nothing) marks an unknown frequency; treat it as 0.
            freqs.append(0.0 if value in ('', '.') else float(value))
    # default=0.0 avoids a ValueError when no frequency field is present.
    return max(freqs, default=0.0)

def is_exonic(INFO):
    """Tell whether any gene-based annotation of a variant is exactly ``exonic``.

    The INFO string is split on ``;`` and every field whose name starts with
    ``Func.ensGene``, ``Func.refGene`` or ``Func.knownGene`` is inspected.
    The comparison is an exact match on the text after the first ``=``, so a
    value such as ``ncRNA_exonic`` does not count.

    Parameters
    ----------
    INFO : str
        Semicolon-separated ``key=value`` annotations (the INFO column).

    Returns
    -------
    bool
        True if at least one of the inspected fields has the value ``exonic``.
    """
    gene_func_prefixes = ('Func.ensGene', 'Func.refGene', 'Func.knownGene')
    annotations = []
    for field in INFO.split(';'):
        if field.startswith(gene_func_prefixes):
            annotations.append(field.split('=')[1])
    return 'exonic' in annotations
    

## VCF file cleanup

In this notebook, we will prepare a VCF ([variant call format](https://en.wikipedia.org/wiki/Variant_Call_Format)) file so that ML models can be applied to it. These files contain information about the changes in the genome (e.g. A>C, T>G, etc.). The whole human genome has about 3 billion positions, but we usually have between 4 and 7 million variants, as more than 99% of our genomes are the same.

### VCF file parts
The **VCF** file has a long header (all lines starting with ##) with information about the chromosomes present in the file, the programs/parameters used to generate the file and the meaning of some fields we see in the file.

After the long header, we have the column labels, given on a single line starting with a single #. This line has the following columns:
1. CHROM: variant chromosome 
2. POS: variant position in the chromosome
3. ID: variant ID (usually rsID, from [dbSNP](https://www.ncbi.nlm.nih.gov/snp/))
4. REF: reference allele (A, T, C, G, ...)
5. ALT: alternate allele (A, T, C, G, ...)
6. QUAL: variant quality
7. FILTER: variant quality filter (usually generated by the [GATK VQSR](https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR-) tool)
8. INFO: variant annotation (information like the gene name, variant location in the gene, functional impact, frequency in the population, etc.)
9. FORMAT: order of additional fields about the genotypes
10. ... From column 10 on, we have the information about the samples. The main information we want is how many copies each sample has of the reference allele (always represented by 0) and of the alternate alleles (represented by 1, 2, 3, ...). Let's say a specific variant is a change from A to G. Then A is the reference allele and G is the alternate allele. If a patient has two copies of the reference allele, we will see the genotype as "0/0". If the patient has one copy of the reference and one copy of the alternate allele, we will see the genotype as "0/1", and if the patient has two copies of the alternate allele, we will see "1/1". In some cases, more than one alternate allele will be found (let's say most individuals have a T, but some can have a C and some can have a G). In these cases, we will see other combinations like "0/2" (the patient has one copy of the reference and one copy of the second alternate allele listed in the ALT column).

To simplify the analysis, we coded the variants as binary data: patients with at least one copy of an alternate allele get 1 for that variant, and patients with only the reference allele get 0. Missing genotypes ("./.") are kept as missing values (NaN).

To filter the variants for this competition, we selected variants that passed the GATK VQSR quality control (FILTER is PASS), removed variants that are outside genes (intergenic regions of the genome) and, in the code below, kept only variants annotated as exonic, and selected only variants with a frequency of 10% or less.

In [None]:
data_dir = '/kaggle/input/end-als/end-als/genomics-data'

filename = data_dir + '/AnswerALS_subset_annovar.hg38_anno_and_geno.no_intergenic.vcf'

chunksize = 2 * 10 ** 4
nrows = 10945503      # hard-coded row count; only used for the progress message
header_lines = 3518   # lines skipped so that the "#CHROM ..." line becomes the header
max_maf = 0.1         # keep variants with a frequency of 10% or less

# Ceiling division, so that a final partial chunk is counted as well.
total_chunks = -(-nrows // chunksize)


def _encode_genotype(call):
    """Code one sample field as 0 (0/0), NaN (./.) or 1 (anything else)."""
    # The genotype is the first ':'-separated sub-field. Phased calls use '|'
    # instead of '/', so normalise the separator before comparing.
    genotype = call.split(':')[0].replace('|', '/')
    if genotype == '0/0':
        return 0
    if genotype == './.':
        return np.nan
    return 1


def _variant_id(chrom, pos, info):
    """Return the rsID of a variant, or 'chrom:pos' when the rsID is '.'."""
    rs_id = get_rsID(info)
    return rs_id if rs_id != '.' else f"{chrom}:{pos}"


chunks = pd.read_csv(filename, sep='\t', skiprows=header_lines, chunksize=chunksize)

# Filtered genotype frames are collected here and concatenated once at the
# end; concatenating inside the loop copies the growing frame on every pass.
geno_chunks = []

for i, chunk in enumerate(chunks):
    if i % 100 == 0:
        print(f"{i+1} out of {total_chunks} chunks.")
    # Selecting PASSing variants (according to GATK VQSR)
    data = chunk[chunk.FILTER == 'PASS']
    # Selecting exonic variants only.
    # astype(bool) keeps the mask boolean even when no row is left.
    data = data[data.INFO.apply(is_exonic).astype(bool)]
    # Selecting variants with frequency 10% or less
    data = data[(data.INFO.apply(get_maxMAF) <= max_maf).astype(bool)]
    # SNP ID for every remaining variant (rsID, or chromosome:position)
    variant_ids = [
        _variant_id(chrom, pos, info)
        for chrom, pos, info in zip(data['#CHROM'], data['POS'], data['INFO'])
    ]
    # Genotype columns only (column 10 onwards), coded as 0 / 1 / NaN.
    # A new frame is built, so nothing is written into a slice of `data`.
    geno = data.iloc[:, 9:].apply(lambda col: col.map(_encode_genotype))
    geno.index = variant_ids

    geno_chunks.append(geno)

df = pd.concat(geno_chunks) if geno_chunks else pd.DataFrame()

In [None]:
# Directory holding the clinical metadata files.
meta_dir = '/kaggle/input/end-als/end-als/clinical-data/filtered-metadata/metadata/'
released_files_csv = meta_dir + 'aals_released_files.csv'

# The next cell uses the Participant_ID and CGND_ID columns of this table.
metadata = pd.read_csv(released_files_csv)

# Show the first rows to check the columns that were read.
metadata.head()

In [None]:
# Keep each (Participant_ID, CGND_ID) pair once and drop pairs with a missing ID.
metadata = metadata[['Participant_ID', 'CGND_ID']].drop_duplicates()
metadata = metadata.dropna(subset=['Participant_ID', 'CGND_ID'])

# Build the whole column mapping first and rename once: renaming inside the
# loop would copy the full genotype frame for every matching metadata row.
sample_columns = set(df.columns)
column_to_participant = {}
for cgnd_id, participant_id in zip(metadata['CGND_ID'], metadata['Participant_ID']):
    # Sample columns are named after the CGND_ID, with or without a '-b38' suffix.
    for column in (cgnd_id, cgnd_id + '-b38'):
        if column in sample_columns:
            # setdefault: the first metadata row that matches a column wins.
            column_to_participant.setdefault(column, participant_id)

df = df.rename(columns=column_to_participant)

In [None]:
# Preview the first rows of the genotype table (rows are variants, columns are samples).
df.head()

In [None]:
#df.to_csv('/kaggle/working/geno.csv')  # the input directory is read-only; outputs go to /kaggle/working/