# From 50k UKB WES gVCF extract variants corresponding to CH genes

### Objective

Design the script to extract the CH gene variants from the 50K gVCF. In this notebook, 1 gVCF file is analyzed.


### INPUT DATA


- **gVCF sample file**: /workspace/datasets/ukbiobank_ch/vcf_50k/1000078_23161_0_0.gvcf.gz


- **CH gene coordinates**: /home/sdemajo/CH_UKBiobank_2021_03/results/CH_genes_canonical_transcripts_coordinates.tsv


### PROCESS

1. Read gVCF file and transform to data frame eliminating comment lines.


2. Arrange data frame and transform to BedTool object using pybedtools.


3. Get CH coordinate data and transform to BedTool object **selecting gene coordinates** (not transcript coordinates). Save resulting bed file for further use (see output).


4. INTERSECT data with **pybedtools** to obtain all rows in gvcf file that overlap with ch gene coordinates.


5. Transform to data frame and save ("output_test.maf.gz")


### OUTPUT DATA

- Bed file with **gene coordinates corresponding to CH genes**: "CH_genes_canonical_transcripts_coordinates.bed".


- Filtered variants corresponding to CH gene coordinates. This is going to be the output of the script.


### SCRIPT: extract_ch_genes.py



In [1]:
import gzip
from io import StringIO
import pandas as pd
import pybedtools

In [2]:
### Read gVCF file and transform to data frame eliminating comment lines

# Set file location
filename = "/workspace/datasets/ukbiobank_ch/vcf_50k/1000078_23161_0_0.gvcf.gz"

# Transform to pandas dataframe eliminating comment lines
lines = ''.join([line for line in gzip.open(filename, 'rt') if not line.startswith("##")])
gvcf = pd.read_csv(StringIO(lines), sep= '\t')

gvcf

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,UKB_4078594_0230791420
0,1,68991,.,A,<NON_REF>,.,.,END=69128,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
1,1,69129,.,T,<NON_REF>,.,.,END=69144,GT:DP:GQ:MIN_DP:PL,"0/0:2:6:2:0,6,49"
2,1,69145,.,T,<NON_REF>,.,.,END=69220,GT:DP:GQ:MIN_DP:PL,"0/0:1:0:0:0,0,0"
3,1,69221,.,C,<NON_REF>,.,.,END=69269,GT:DP:GQ:MIN_DP:PL,"0/0:3:6:2:0,6,78"
4,1,69270,rs201219564,A,"G,<NON_REF>",159.84,.,"DB;DP=6;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,...",GT:AD:DP:GQ:PL:SB,"1/1:0,6,0:6:18:188,18,0,188,18,188:0,0,6,0"
...,...,...,...,...,...,...,...,...,...,...
2288071,Y,57190840,.,N,<NON_REF>,.,.,END=57191185,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288072,Y,57191698,.,N,<NON_REF>,.,.,END=57192099,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288073,Y,57192503,.,N,<NON_REF>,.,.,END=57192808,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288074,Y,57193943,.,N,<NON_REF>,.,.,END=57194227,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"


In [3]:
# Get file name only
# file = filename.split("/")[-1].split(".")[0]

In [4]:
# Add new position
gvcf.insert(loc = 2, column = 'POS2', value = gvcf["POS"])
gvcf

Unnamed: 0,#CHROM,POS,POS2,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,UKB_4078594_0230791420
0,1,68991,68991,.,A,<NON_REF>,.,.,END=69128,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
1,1,69129,69129,.,T,<NON_REF>,.,.,END=69144,GT:DP:GQ:MIN_DP:PL,"0/0:2:6:2:0,6,49"
2,1,69145,69145,.,T,<NON_REF>,.,.,END=69220,GT:DP:GQ:MIN_DP:PL,"0/0:1:0:0:0,0,0"
3,1,69221,69221,.,C,<NON_REF>,.,.,END=69269,GT:DP:GQ:MIN_DP:PL,"0/0:3:6:2:0,6,78"
4,1,69270,69270,rs201219564,A,"G,<NON_REF>",159.84,.,"DB;DP=6;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,...",GT:AD:DP:GQ:PL:SB,"1/1:0,6,0:6:18:188,18,0,188,18,188:0,0,6,0"
...,...,...,...,...,...,...,...,...,...,...,...
2288071,Y,57190840,57190840,.,N,<NON_REF>,.,.,END=57191185,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288072,Y,57191698,57191698,.,N,<NON_REF>,.,.,END=57192099,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288073,Y,57192503,57192503,.,N,<NON_REF>,.,.,END=57192808,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"
2288074,Y,57193943,57193943,.,N,<NON_REF>,.,.,END=57194227,GT:DP:GQ:MIN_DP:PL,"0/0:0:0:0:0,0,0"


In [5]:
### Transform to BedTool object
# Using pybedtools

gvcf_bed = pybedtools.BedTool.from_dataframe(gvcf)
gvcf_bed.head()

1	68991	68991	.	A	<NON_REF>	.	.	END=69128	GT:DP:GQ:MIN_DP:PL	0/0:0:0:0:0,0,0
 1	69129	69129	.	T	<NON_REF>	.	.	END=69144	GT:DP:GQ:MIN_DP:PL	0/0:2:6:2:0,6,49
 1	69145	69145	.	T	<NON_REF>	.	.	END=69220	GT:DP:GQ:MIN_DP:PL	0/0:1:0:0:0,0,0
 1	69221	69221	.	C	<NON_REF>	.	.	END=69269	GT:DP:GQ:MIN_DP:PL	0/0:3:6:2:0,6,78
 1	69270	69270	rs201219564	A	G,<NON_REF>	159.84	.	DB;DP=6;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,0.00;RAW_MQandDP=5700,6	GT:AD:DP:GQ:PL:SB	1/1:0,6,0:6:18:188,18,0,188,18,188:0,0,6,0
 1	69271	69271	.	C	<NON_REF>	.	.	END=69300	GT:DP:GQ:MIN_DP:PL	0/0:6:12:6:0,12,180
 1	69301	69301	.	G	<NON_REF>	.	.	END=69324	GT:DP:GQ:MIN_DP:PL	0/0:9:21:7:0,21,255
 1	69325	69325	.	T	<NON_REF>	.	.	END=69325	GT:DP:GQ:MIN_DP:PL	0/0:10:19:10:0,19,336
 1	69326	69326	.	T	<NON_REF>	.	.	END=69403	GT:DP:GQ:MIN_DP:PL	0/0:11:22:8:0,22,309
 1	69404	69404	.	T	<NON_REF>	.	.	END=69486	GT:DP:GQ:MIN_DP:PL	0/0:38:60:20:0,60,746
 

In [6]:
# check length
print(len(gvcf_bed))

2288076


In [7]:
### Obtain CH coordinate data

ch_ctrans_coord = pd.read_csv("/home/sdemajo/CH_UKBiobank_2021_03/results/CH_genes_canonical_transcripts_coordinates.tsv",
                              sep="\t")

ch_ctrans_coord

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Protein stable ID,Protein stable ID version,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript length (including UTRs and CDS),CDS Length,Strand,Gene name
0,ENSG00000099949,ENSG00000099949.21,ENST00000646124,ENST00000646124.2,ENSP00000496779,ENSP00000496779.1,22,20982269,20999032,20982297,20999032,20982297,4282,2523,1,LZTR1
1,ENSG00000159216,ENSG00000159216.19,ENST00000300305,ENST00000300305.7,ENSP00000300305,ENSP00000300305.3,21,34787801,36004667,34787801,35049344,35049344,6222,1443,-1,RUNX1
2,ENSG00000096968,ENSG00000096968.14,ENST00000381652,ENST00000381652.4,ENSP00000371067,ENSP00000371067.4,9,4984390,5129948,4985272,5129948,4985272,7023,3399,1,JAK2
3,ENSG00000169249,ENSG00000169249.13,ENST00000307771,ENST00000307771.8,ENSP00000303015,ENSP00000303015.7,X,15790472,15823260,15790484,15823260,15790484,1479,1449,1,ZRSR2
4,ENSG00000153707,ENSG00000153707.17,ENST00000381196,ENST00000381196.8,ENSP00000370593,ENSP00000370593.3,9,8314246,10612723,8314246,10033790,10033790,9911,5739,-1,PTPRD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,ENSG00000073282,ENSG00000073282.14,ENST00000264731,ENST00000264731.8,ENSP00000264731,ENSP00000264731.3,3,189631389,189897276,189631389,189897276,189631389,4944,2043,1,TP63
61,ENSG00000161547,ENSG00000161547.17,ENST00000392485,ENST00000392485.2,ENSP00000376276,ENSP00000376276.2,17,76734115,76737333,76734115,76737333,76737333,2885,666,-1,SRSF2
62,ENSG00000138413,ENSG00000138413.14,ENST00000415913,ENST00000415913.5,ENSP00000390265,ENSP00000390265.1,2,208236229,208266074,208236265,208254322,208254322,2441,1245,-1,IDH1
63,ENSG00000143322,ENSG00000143322.21,ENST00000502732,ENST00000502732.6,ENSP00000427562,ENSP00000427562.1,1,179099330,179229684,179099330,179229677,179229677,12217,3549,-1,ABL2


In [8]:
## Transform to BedTool object
## Take gene coordinates! (not transcript)

ch_ctrans_coord_bed = pybedtools.BedTool.from_dataframe(ch_ctrans_coord.iloc[:, [6,7,8,15]])
ch_ctrans_coord_bed.head()

# Save
ch_ctrans_coord_bed.saveas("CH_genes_canonical_transcripts_coordinates.bed")

22	20982269	20999032	LZTR1
 21	34787801	36004667	RUNX1
 9	4984390	5129948	JAK2
 X	15790472	15823260	ZRSR2
 9	8314246	10612723	PTPRD
 22	28687743	28742422	CHEK2
 20	32358330	32439319	ASXL1
 20	32762385	32809356	DNMT3B
 21	43092956	43107570	U2AF1
 X	44873188	45112779	KDM6A
 

<BedTool(CH_genes_canonical_transcripts_coordinates.bed)>

In [9]:
# Open directly bed file
ch_ctrans_coord_bed = pybedtools.BedTool("CH_genes_canonical_transcripts_coordinates.bed")
ch_ctrans_coord_bed.head()

22	20982269	20999032	LZTR1
 21	34787801	36004667	RUNX1
 9	4984390	5129948	JAK2
 X	15790472	15823260	ZRSR2
 9	8314246	10612723	PTPRD
 22	28687743	28742422	CHEK2
 20	32358330	32439319	ASXL1
 20	32762385	32809356	DNMT3B
 21	43092956	43107570	U2AF1
 X	44873188	45112779	KDM6A
 

In [10]:
### INTERSECT DATA (PYBEDTOOLS)

# Intersect using "u=True" to obtain all rows in gvcf_bed that overlap with ch_ctrans_coord_bed
vcf_chgenes = gvcf_bed.intersect(ch_ctrans_coord_bed, u=True)

# Show results
vcf_chgenes.head()


  self.stderr = io.open(errread, 'rb', bufsize)


1	1787231	1787231	.	T	<NON_REF>	.	.	END=1787255	GT:DP:GQ:MIN_DP:PL	0/0:4:9:3:0,9,99
 1	1787256	1787256	.	T	<NON_REF>	.	.	END=1787259	GT:DP:GQ:MIN_DP:PL	0/0:8:21:8:0,21,315
 1	1787260	1787260	.	A	<NON_REF>	.	.	END=1787265	GT:DP:GQ:MIN_DP:PL	0/0:8:18:7:0,18,270
 1	1787266	1787266	.	C	<NON_REF>	.	.	END=1787314	GT:DP:GQ:MIN_DP:PL	0/0:12:21:7:0,21,270
 1	1787315	1787315	.	G	<NON_REF>	.	.	END=1787321	GT:DP:GQ:MIN_DP:PL	0/0:22:60:21:0,60,900
 1	1787322	1787322	.	C	<NON_REF>	.	.	END=1787322	GT:DP:GQ:MIN_DP:PL	0/0:22:57:22:0,57,855
 1	1787323	1787323	.	T	<NON_REF>	.	.	END=1787333	GT:DP:GQ:MIN_DP:PL	0/0:26:63:24:0,63,945
 1	1787334	1787334	.	G	<NON_REF>	.	.	END=1787334	GT:DP:GQ:MIN_DP:PL	0/0:27:44:27:0,44,1000
 1	1787335	1787335	.	T	<NON_REF>	.	.	END=1787442	GT:DP:GQ:MIN_DP:PL	0/0:62:63:27:0,63,945
 1	1787443	1787443	.	A	<NON_REF>	.	.	END=1787517	GT:DP:GQ:MIN_DP:PL	0/0:12:21:8:0,21,315
 

In [11]:
print(len(vcf_chgenes))

15249


In [12]:
gvcf.columns

Index(['#CHROM', 'POS', 'POS2', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
       'FORMAT', 'UKB_4078594_0230791420'],
      dtype='object')

In [13]:
### Transform to pandas data frame

# Transform to data frame
vcf_chgenes_df = vcf_chgenes.to_dataframe()
vcf_chgenes_df.columns = gvcf.columns
vcf_chgenes_df.iloc[239:250,]

Unnamed: 0,#CHROM,POS,POS2,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,UKB_4078594_0230791420
239,1,114714016,114714016,.,A,<NON_REF>,.,.,END=114714047,GT:DP:GQ:MIN_DP:PL,"0/0:2:6:2:0,6,52"
240,1,114714048,114714048,rs969273,G,"A,<NON_REF>",49.56,.,"DB;DP=2;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,...",GT:AD:DP:GQ:PL:SB,"1/1:0,2,0:2:6:76,6,0,76,6,76:0,0,0,2"
241,1,114714049,114714049,.,C,<NON_REF>,.,.,END=114714061,GT:DP:GQ:MIN_DP:PL,"0/0:2:6:2:0,6,70"
242,1,114714062,114714062,.,A,<NON_REF>,.,.,END=114714072,GT:DP:GQ:MIN_DP:PL,"0/0:2:3:2:0,3,45"
243,1,114714073,114714073,.,A,<NON_REF>,.,.,END=114714078,GT:DP:GQ:MIN_DP:PL,"0/0:2:6:2:0,6,78"
244,1,114715950,114715950,.,T,<NON_REF>,.,.,END=114715972,GT:DP:GQ:MIN_DP:PL,"0/0:6:12:5:0,12,180"
245,1,114715973,114715973,.,T,<NON_REF>,.,.,END=114715973,GT:DP:GQ:MIN_DP:PL,"0/0:7:21:7:0,21,261"
246,1,114715974,114715974,.,T,<NON_REF>,.,.,END=114715974,GT:DP:GQ:MIN_DP:PL,"0/0:6:18:6:0,18,229"
247,1,114715975,114715975,.,A,<NON_REF>,.,.,END=114715984,GT:DP:GQ:MIN_DP:PL,"0/0:7:21:7:0,21,253"
248,1,114715985,114715985,.,A,<NON_REF>,.,.,END=114715987,GT:DP:GQ:MIN_DP:PL,"0/0:7:18:7:0,18,270"


In [14]:
# Save file
vcf_chgenes_df.to_csv("output_test.maf.gz",
                      sep="\t", index = False, compression='gzip')

In [15]:
# Reopen file
pd.read_csv("output_test.maf.gz", sep= '\t')

Unnamed: 0,#CHROM,POS,POS2,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,UKB_4078594_0230791420
0,1,1787231,1787231,.,T,<NON_REF>,.,.,END=1787255,GT:DP:GQ:MIN_DP:PL,"0/0:4:9:3:0,9,99"
1,1,1787256,1787256,.,T,<NON_REF>,.,.,END=1787259,GT:DP:GQ:MIN_DP:PL,"0/0:8:21:8:0,21,315"
2,1,1787260,1787260,.,A,<NON_REF>,.,.,END=1787265,GT:DP:GQ:MIN_DP:PL,"0/0:8:18:7:0,18,270"
3,1,1787266,1787266,.,C,<NON_REF>,.,.,END=1787314,GT:DP:GQ:MIN_DP:PL,"0/0:12:21:7:0,21,270"
4,1,1787315,1787315,.,G,<NON_REF>,.,.,END=1787321,GT:DP:GQ:MIN_DP:PL,"0/0:22:60:21:0,60,900"
...,...,...,...,...,...,...,...,...,...,...,...
15244,X,124422487,124422487,.,T,<NON_REF>,.,.,END=124422492,GT:DP:GQ:MIN_DP:PL,"0/0:23:54:22:0,54,810"
15245,X,124422493,124422493,.,C,<NON_REF>,.,.,END=124422580,GT:DP:GQ:MIN_DP:PL,"0/0:34:60:24:0,60,878"
15246,X,124422581,124422581,.,C,<NON_REF>,.,.,END=124422581,GT:DP:GQ:MIN_DP:PL,"0/0:28:48:28:0,48,946"
15247,X,124422582,124422582,.,A,<NON_REF>,.,.,END=124422584,GT:DP:GQ:MIN_DP:PL,"0/0:25:60:25:0,60,900"


## Command python

python extract_ch_genes.py -i 1000078_23161_0_0.gvcf.gz -i_coor CH_genes_canonical_transcripts_coordinates.bed -o results

# TESTS

----
---

In [16]:
# Select only rows with rs

vcf_chgenes_rs = vcf_chgenes_df[vcf_chgenes_df["ID"].str.startswith("rs")]
vcf_chgenes_rs

Unnamed: 0,#CHROM,POS,POS2,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,UKB_4078594_0230791420
134,1,43346404,43346404,rs1760670,G,"A,<NON_REF>",278.77,.,BaseQRankSum=-4.294;DB;DP=28;ExcessHet=3.0103;...,GT:AD:DP:GQ:PL:SB,"0/1:17,11,0:28:99:307,0,621,358,654,1012:16,1,..."
160,1,43349193,43349193,rs839995,T,"C,<NON_REF>",112.77,.,BaseQRankSum=2.450;DB;DP=8;ExcessHet=3.0103;ML...,GT:AD:DP:GQ:PL:SB,"0/1:4,4,0:8:96:141,0,96,153,108,261:2,2,1,3"
240,1,114714048,114714048,rs969273,G,"A,<NON_REF>",49.56,.,"DB;DP=2;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,...",GT:AD:DP:GQ:PL:SB,"1/1:0,2,0:2:6:76,6,0,76,6,76:0,0,0,2"
312,1,179117284,179117284,rs779694937,TA,"T,<NON_REF>",0.16,.,BaseQRankSum=1.231;DB;DP=11;ExcessHet=3.0103;M...,GT:AD:DP:GQ:PL:SB,"0/1:9,2,0:11:23:23,0,219,50,225,276:9,0,2,0"
330,1,179118799,179118799,rs2171959,T,"C,<NON_REF>",569.77,.,"DB;DP=18;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00...",GT:AD:DP:GQ:PL:SB,"1/1:0,18,0:18:54:598,54,0,598,54,598:0,0,1,17"
...,...,...,...,...,...,...,...,...,...,...,...
14693,X,124061185,124061185,rs2297651,A,"C,<NON_REF>",200.84,.,"DB;DP=6;ExcessHet=3.0103;MLEAC=2,0;MLEAF=1.00,...",GT:AD:DP:GQ:PL:SB,"1/1:0,6,0:6:18:229,18,0,229,18,229:0,0,6,0"
14712,X,124061743,124061743,rs755358841,C,"CTT,<NON_REF>",63.86,.,BaseQRankSum=0.210;DB;DP=10;ExcessHet=3.0103;M...,GT:AD:DP:GQ:PL:SB,"0/1:1,5,0:6:2:99,0,2,103,17,119:1,0,5,0"
14777,X,124066143,124066143,rs752878823,AT,"A,<NON_REF>",96.73,.,BaseQRankSum=1.668;DB;DP=19;ExcessHet=3.0103;M...,GT:AD:DP:GQ:PGT:PID:PL:SB,"0/1:4,6,0:10:66:0|1:124066143_AT_A:134,0,66,14..."
14817,X,124071128,124071128,rs759815507,CT,"C,CTT,<NON_REF>",0.06,.,BaseQRankSum=1.335;DB;DP=13;ExcessHet=3.0103;M...,GT:AD:DP:GQ:PL:SB,"0/1:8,2,2,0:12:3:17,0,190,3,131,191,48,189,191..."


In [17]:
# Check info

#vcf_chgenes_rs[["GT","AD","DP","GQ","PL","SB"]] = vcf_chgenes_rs["UKB_4078594_0230791420"].str.split(":",expand=True)

splitdf = vcf_chgenes_rs["UKB_4078594_0230791420"].str.split(":",expand=True)

splitdf.columns = ["GT","AD","DP","GQ","PL","SB", "x","xx"]

splitdf

Unnamed: 0,GT,AD,DP,GQ,PL,SB,x,xx
134,0/1,17110,28,99,30706213586541012,161101,,
160,0/1,440,8,96,141096153108261,2213,,
240,1/1,020,2,6,766076676,0002,,
312,0/1,920,11,23,23021950225276,9020,,
330,1/1,0180,18,54,59854059854598,00117,,
...,...,...,...,...,...,...,...,...
14693,1/1,060,6,18,22918022918229,0060,,
14712,0/1,150,6,2,990210317119,1050,,
14777,0/1,460,10,66,0|1,124066143_AT_A,13406614584229,4060
14817,0/1,8220,12,3,170190313119148189191241,7140,,


In [18]:
splitdf2 = vcf_chgenes_rs["INFO"].str.split(";",expand=True)

splitdf2

Unnamed: 0,0,1,2,3,4,5,6,7,8
134,BaseQRankSum=-4.294,DB,DP=28,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=100800,28",ReadPosRankSum=-0.401
160,BaseQRankSum=2.450,DB,DP=8,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=28800,8",ReadPosRankSum=0.545
240,DB,DP=2,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=7200,2",,,
312,BaseQRankSum=1.231,DB,DP=11,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=39600,11",ReadPosRankSum=0.967
330,DB,DP=18,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=64800,18",,,
...,...,...,...,...,...,...,...,...,...
14693,DB,DP=6,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=21600,6",,,
14712,BaseQRankSum=0.210,DB,DP=10,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=36000,10",ReadPosRankSum=1.383
14777,BaseQRankSum=1.668,DB,DP=19,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=68400,19",ReadPosRankSum=-0.480
14817,BaseQRankSum=1.335,DB,DP=13,ExcessHet=3.0103,"MLEAC=1,0,0","MLEAF=0.500,0.00,0.00",MQRankSum=0.000,"RAW_MQandDP=46800,13",ReadPosRankSum=-0.165


In [19]:
# Merge data (test)
newdf = vcf_chgenes_rs.iloc[:, [0,1,3,4,5]]


newdf2 = pd.concat([newdf, splitdf, splitdf2], axis=1)
newdf2.head(10)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,GT,AD,DP,GQ,PL,...,xx,0,1,2,3,4,5,6,7,8
134,1,43346404,rs1760670,G,"A,<NON_REF>",0/1,17110,28,99,30706213586541012,...,,BaseQRankSum=-4.294,DB,DP=28,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=100800,28",ReadPosRankSum=-0.401
160,1,43349193,rs839995,T,"C,<NON_REF>",0/1,440,8,96,141096153108261,...,,BaseQRankSum=2.450,DB,DP=8,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=28800,8",ReadPosRankSum=0.545
240,1,114714048,rs969273,G,"A,<NON_REF>",1/1,20,2,6,766076676,...,,DB,DP=2,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=7200,2",,,
312,1,179117284,rs779694937,TA,"T,<NON_REF>",0/1,920,11,23,23021950225276,...,,BaseQRankSum=1.231,DB,DP=11,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=39600,11",ReadPosRankSum=0.967
330,1,179118799,rs2171959,T,"C,<NON_REF>",1/1,180,18,54,59854059854598,...,,DB,DP=18,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=64800,18",,,
353,1,179120340,rs9726961,C,"G,<NON_REF>",1/1,60,6,18,25118025118251,...,,DB,DP=6,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=21600,6",,,
356,1,179120356,rs9728480,A,"C,<NON_REF>",1/1,70,7,21,29721029721297,...,,DB,DP=7,ExcessHet=3.0103,"MLEAC=2,0","MLEAF=1.00,0.00","RAW_MQandDP=25200,7",,,
380,1,179121915,rs754763760,ATTTTTTT,"A,<NON_REF>",0/1,130,4,6,1090611217128,...,,BaseQRankSum=-0.674,DB,DP=16,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=-0.674,"RAW_MQandDP=54116,16",
440,1,204525308,rs3789051,C,"T,<NON_REF>",0/1,26240,50,99,58606496647201384,...,,BaseQRankSum=-0.350,DB,DP=50,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=180000,50",ReadPosRankSum=0.185
455,1,204525628,rs368369181,G,"GT,<NON_REF>",0/1,430,7,53,530836592157,...,,BaseQRankSum=-1.068,DB,DP=11,ExcessHet=3.0103,"MLEAC=1,0","MLEAF=0.500,0.00",MQRankSum=0.000,"RAW_MQandDP=39600,11",ReadPosRankSum=1.368


In [20]:
# ------------------------------------------------------------------------------------------

#bed = pybedtools.BedTool('/workspace/datasets/ukbiobank_ch/vcf_50k/1000078_23161_0_0.gvcf.gz')
#bed.head()
