In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
# force_remount=True is passed so the call re-mounts instead of reusing an existing mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install toolz scikit-allel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-allel
  Downloading scikit_allel-1.3.5-cp38-cp38-manylinux2010_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-allel
Successfully installed scikit-allel-1.3.5


## Setup

In [None]:
import os
# os.environ["MODIN_CPUS"] = "8"
# from distributed import Client
# client = Client()
import math
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from scipy.spatial.distance import squareform
import allel
from matplotlib import pyplot as plt
%matplotlib inline   
import tensorflow_datasets as tfds
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, ElasticNetCV

# print("Tensorflow version " + tf.__version__)
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
# tf.config.experimental_connect_to_cluster(resolver)
# # This is the TPU initialization code that has to be at the beginning.
# tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))
# strategy = tf.distribute.TPUStrategy(resolver)

## Prepare the data

In [None]:
root_dir = '[path]/'
vcf_file = 'ALL.chr22.mergedSV.v8.20130502.svs.genotypes.vcf'

# Collect the VCF preamble without assuming how many lines it has: every
# meta-information line starts with '##', and the first line that does not is
# expected to be the '#CHROM ...' column header.
n_header_lines = []
n_data_header = ""
with open(root_dir+vcf_file, 'r') as f_in:
    for line in f_in:
        if line.startswith('##'):
            n_header_lines.append(line)
        else:
            n_data_header = line
            break

# Fail loudly instead of loading a table with wrong or empty column names.
if not n_data_header.startswith('#CHROM'):
    raise ValueError("no '#CHROM' column header found in " + root_dir + vcf_file)

# Load the records. Lines beginning with '#' (meta lines and the column header)
# are skipped via comment='#', so the column names are supplied explicitly.
genotypes = pd.read_csv(root_dir+vcf_file, comment='#', sep='\t', names=n_data_header.strip().split('\t'))

# Columns 0-8 are the fixed VCF fields (#CHROM .. FORMAT); the rest are samples.
headers = genotypes.columns[9:]
genotypes

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,HG00096,...,NA21128,NA21129,NA21130,NA21133,NA21135,NA21137,NA21141,NA21142,NA21143,NA21144
0,22,16050654,DUP_gs_CNV_22_16050654_16063474,A,"<CN0>,<CN2>,<CN3>,<CN4>",.,PASS,"AC=9,87,599,20;AF=0.00179712,0.0173722,0.11960...",GT,3|0,...,0|0,0|0,0|0,0|0,0|0,3|0,3|0,0|0,0|0,0|0
1,22,16533236,SI_BD_17525,C,<CN0>,100,PASS,AC=125;AF=0.0249601;AFR_AF=0.09;AMR_AF=0.0086;...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
2,22,16577743,YL_CN_CEU_5170,T,<CN0>,100,PASS,AC=29;AF=0.00579073;AFR_AF=0.0098;AMR_AF=0.001...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
3,22,16589908,SI_BD_17528,T,<CN0>,100,PASS,AC=186;AF=0.0371406;AFR_AF=0.1021;AMR_AF=0.014...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
4,22,16633635,YL_CN_STU_4360,G,<CN0>,100,PASS,AC=2;AF=0.00039936;AFR_AF=0;AMR_AF=0;AN=5008;C...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,22,51054942,UW_VH_22595,T,<CN0>,100,PASS,AC=1;AF=0.00019968;AFR_AF=0;AMR_AF=0;AN=5008;C...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
844,22,51066406,DUP_gs_CNV_22_51066406_51078951,A,"<CN0>,<CN2>",.,PASS,"AC=6,47;AF=0.00119808,0.00938498;AFR_AF=0.0023...",GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
845,22,51068654,ALU_umary_ALU_12538,G,<INS:ME:ALU>,.,.,AC=3;AF=0.00059904;AFR_AF=0.0023;AMR_AF=0;AN=5...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
846,22,51163690,BI_GS_DEL1_B2_P2897_127,C,<CN0>,100,PASS,AC=1;AF=0.00019968;AFR_AF=0;AMR_AF=0;AN=5008;C...,GT,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [None]:
# Variant coordinates from the POS column, converted to a list of Python ints.
geno_headers = genotypes["POS"].values
positions = list(map(int, geno_headers))
positions[:2]

[16050654, 16533236]

In [None]:
def find_snv_chunk(s):
  """Return the first ';'-separated field of `s` that starts with "SVTYPE".

  The whole field is returned (key and value, e.g. "SVTYPE=DEL"). An empty
  string is returned when no field matches.
  """
  matches = (field for field in s.split(";") if field.startswith("SVTYPE"))
  return next(matches, "")

In [None]:
# Tally how many records carry each SVTYPE value.
info = genotypes[["INFO"]]  # INFO kept as a one-column DataFrame
info_del = info.applymap(lambda field: find_snv_chunk(field).split("=")[-1])
np.unique(info_del["INFO"].values, return_counts=True)

(array(['ALU', 'CNV', 'DEL', 'DEL_ALU', 'DUP', 'INS', 'INV', 'LINE1',
        'SVA'], dtype=object),
 array([ 96,  55, 573,   9,  79,   4,   5,   7,  20]))

In [None]:
# Histogram of min(f, 1 - f) over DEL records, where f is the value of the
# third ';'-separated INFO field.
# NOTE(review): in the displayed records the third field is AFR_AF (AF is the
# second one) - confirm which frequency was intended here.
info = genotypes[["INFO"]]
info_del = info.applymap(lambda field: find_snv_chunk(field).split("=")[-1])
info = info[info_del["INFO"] == "DEL"].applymap(
    lambda field: float(field.split(";")[2].split("=")[-1])
)
plt.hist([min(f, 1 - f) for f in info.values.flatten().tolist()])

In [None]:
# Select deletion records by their SVTYPE field. The field is looked up by key
# through find_snv_chunk (as in the SVTYPE tally cell) instead of taking the
# last INFO entry, because the order of INFO fields is not guaranteed and
# SVTYPE need not come last.
info = genotypes.iloc[:, 7:8]
info = info.applymap(lambda x: find_snv_chunk(x).split("=")[-1])
X = genotypes[info.INFO == "DEL"]
# Drop the nine fixed VCF columns and transpose: rows become samples, columns variants.
X = X.iloc[:, 9:].T
print(X.shape, type(X))
X.head()

# X.values.shape

In [None]:
# Phase-aware integer code for each genotype string (kept under its original name).
g = X.replace({
    '0|0': 0,
    '0|1': 1,
    '1|0': 2,
    '1|1': 3
})
# allel.rogers_huff_r expects genotypes coded as the number of alternate
# alleles per call (0 = hom ref, 1 = het, 2 = hom alt). Both heterozygous
# phases therefore map to 1 and homozygous alt to 2; feeding the 0..3 phase
# codes above would distort the correlation.
alt_allele_counts = X.replace({
    '0|0': 0,
    '0|1': 1,
    '1|0': 1,
    '1|1': 2
})
# Transposed so variants are rows, as rogers_huff_r requires
# (assumes X is samples x variants here - TODO confirm execution order).
r = allel.rogers_huff_r(alt_allele_counts.T)
# r is condensed (one value per variant pair); squareform expands it to a square matrix.
LD = squareform(r ** 2)
plt.figure(figsize=(8,8))
plt.imshow(LD)

In [None]:
# Distribution of the row-wise maximum of the LD matrix, in five equal-width bins on [0, 1].
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
plt.hist(LD.max(axis=1), bins=bins)

In [None]:
# Start the output file with the original VCF meta lines, then append the X
# table below them (tab-separated, index written under the 'Sample_id' label).
with open(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", 'w') as f_out:
    for header_line in n_header_lines:
        f_out.write(header_line)
X.to_csv(
    root_dir + "DELL.chr22.genotypes.for.modeling.vcf",
    sep="\t",
    mode='a',
    index=True,
    index_label='Sample_id',
)

In [None]:
print(genotypes.shape)
# Select deletion records by their SVTYPE field. The field is looked up by key
# through find_snv_chunk instead of taking the last INFO entry, because the
# order of INFO fields is not guaranteed and SVTYPE need not come last.
info = genotypes.iloc[:, 7:8]
info = info.applymap(lambda x: find_snv_chunk(x).split("=")[-1])

# X keeps every column here (no transpose): rows are variants.
X = genotypes[info.INFO == "DEL"]
print(X.shape)
# X = X.iloc[:, 9:].T
# print(X.shape, type(X), np.unique(X.values))
X

In [None]:
# with open(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", 'w') as f_out:
# Start the output file with the original VCF meta lines, then append X below
# them as tab-separated text without the row index.
with open(root_dir + "DELL.chr22.genotypes.full.vcf", 'w') as f_out:
    for header_line in n_header_lines:
        f_out.write(header_line)
X.to_csv(
    root_dir + "DELL.chr22.genotypes.full.vcf",
    sep="\t",
    mode='a',
    index=False,
)
# X.to_csv(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", sep="\t", mode='a', index=True, index_label='Sample_id')

In [None]:
# Read back the modeling table: the file starts with '##' meta lines, and the
# first line that is not one of them is the column header ('Sample_id', ...).
# Scanning for it avoids assuming a fixed number of meta lines.
new_data_header = ""
with open(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", 'r') as f_in:
    for line in f_in:
        if not line.startswith('##'):
            new_data_header = line
            break

# comment='#' makes the parser ignore the meta lines, so the first parsed row
# is the column header. header=0 marks that row as the header that `names`
# replaces; header=1 would instead consume the first sample row as the header
# and silently drop that sample from the table.
temp = pd.read_csv(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", comment='#', sep='\t', names=new_data_header.strip().split('\t'), header=0, index_col='Sample_id', dtype={'Sample_id':str})
temp

In [None]:
# Copy of the genotype table without the "#CHROM" column.
reference = genotypes.drop(columns=["#CHROM"])

In [None]:
# Display the table built in the previous cell.
reference

In [None]:
# Rebuild `genotypes` from `reference`: keep rows whose POS is in `positions`,
# keep the first row for each POS, drop the eight leading columns and
# transpose the remainder.
genotypes = (
    reference[reference["POS"].isin(positions)]
    .drop_duplicates(subset=['POS'])
    .iloc[:, 8:]
    .T
)
print(genotypes.shape)
genotypes.head()

In [None]:
# Tab-separated pedigree table, indexed by its 'Individual ID' column.
ped_file = '/content/drive/MyDrive/Colab Notebooks/integrated_call_samples.20130502.ALL.ped'
pedigree = pd.read_csv(
    ped_file,
    sep='\t',
    index_col='Individual ID',
)

In [None]:
# Preview the first rows of the pedigree table.
pedigree.head()

In [None]:
# 'Population' value from the pedigree table for every row label of `genotypes`.
Y_train = pedigree.loc[genotypes.index, 'Population']
# Y_train = pedigree.loc[genotypes.index][pedigree.loc[genotypes.index]['Population'] == "YRI"]['Population']
Y_train.shape

In [None]:
# When the notebook is run top to bottom, X is still the per-variant table at
# this point (rows are variants, fixed VCF columns present), so matching its
# index against sample IDs would select nothing. Reshape it to samples x
# variants first; the check skips the reshape if X was already transposed.
if 'FORMAT' in X.columns:
    X = X.iloc[:, 9:].T
# Keep only the rows whose label also appears in Y_train.
X = X[X.index.isin(Y_train.index)]
X.shape

In [None]:
# Encode genotype strings as integers: '0|0' -> 0, '0|1' -> 1, '1|0' -> 2, '1|1' -> 3.
_X = X.replace({
    "%d|%d" % (first, second): 2 * first + second
    for first in (0, 1)
    for second in (0, 1)
})

In [None]:
def freq_mapper(g):
  """Collapse a genotype code: 0 stays 0, 1 or 2 becomes 1, anything else becomes 2."""
  if g == 0:
    return 0
  if g == 1 or g == 2:
    return 1
  return 2

def maf_mapper(g):
  """Return the smaller of `g` and `1 - g`."""
  complement = 1 - g
  return complement if complement < g else g

In [None]:
# Apply freq_mapper to every cell, sum each column, and divide by twice the
# number of rows of X.
_X = _X.applymap(freq_mapper).sum(axis=0) / (2 * X.shape[0])
_X

In [None]:
# Apply maf_mapper (min(f, 1 - f)) to each value of _X.
mafs = _X.map(maf_mapper)
mafs

In [None]:
# Assign each value in `mafs` a bin index over edges 0, 0.1, ..., 0.5
# (right=True: a value equal to an edge falls in the bin that ends at that edge).
bin_labels = np.digitize(mafs, bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5], right=True)
# Distinct bin indices that occur, and how many values fall in each.
bin_general_labels, bin_counts = np.unique(bin_labels, return_counts=True)
bin_general_labels, bin_counts

In [None]:
# Histogram of the values in `mafs` using matplotlib's default binning.
plt.hist(mafs)
# np.unique(Y_train)

In [None]:
# Shape of X after selecting rows by the labels in Y_train's index.
X.loc[Y_train.index].shape