In [6]:
import pandas as pd
import os
import numpy as np

from lib.preprocessing_cadd_annotation import drop_impute

In [2]:
# print(dir(lib.preprocessing_cadd_annotation))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'drop_impute', 'os', 'pd']


In [3]:
def load_myo5b_excel(file_path):
    '''
    Load the first worksheet of an Excel workbook.

    Parameters
    ----------
    file_path : path to the workbook, passed straight to ``pd.ExcelFile``.

    Returns
    -------
    Whatever ``ExcelFile.parse`` returns for the first sheet name
    (a DataFrame for a regular worksheet).
    '''
    # ExcelFile keeps the workbook open; the context manager guarantees the
    # handle is released even if parsing raises.
    with pd.ExcelFile(file_path) as excel_file:
        return excel_file.parse(excel_file.sheet_names[0])

def load_cadd_annotation(file_path):
    '''
    Load the original CADD annotation table from a tab-separated file.

    Parameters
    ----------
    file_path : path (or buffer) handed to ``pd.read_csv``.

    Returns
    -------
    DataFrame with the columns listed below read as ``object`` and every
    other column left to pandas' type inference.
    '''
    # These columns are forced to ``object`` instead of being type-inferred.
    # The builtin ``object`` replaces ``np.object``, which was only an alias
    # for it and has been removed from NumPy (AttributeError since 1.24).
    object_columns = ('#Chrom',
                      'isDerived',
                      'motifEName',
                      'motifEHIPos',
                      'PolyPhenCat',
                      'SIFTcat')
    dtype = dict.fromkeys(object_columns, object)
    cadd_anno = pd.read_csv(file_path, sep='\t', dtype=dtype)
    print('Cadd annotation loaded.')
    return cadd_anno

def map_myh7_chrposDic(row):
    '''
    Look up ``row`` in the module-level ``chrpos_info_dic`` mapping.

    Returns the stored value when the key is present and the string 'NA'
    otherwise. ``chrpos_info_dic`` must already exist as a global when this
    function is called.
    '''
    return chrpos_info_dic.get(row, 'NA')
    
def display_column_names(column_names, size=4):
    '''
    Print column names in consecutive groups, one group per line.

    Useful when a table has too many columns to read comfortably at once.

    Parameters
    ----------
    column_names : sliceable object whose slices expose ``.values``
        (e.g. ``DataFrame.columns``).
    size : number of names printed per line (default 4).

    Raises
    ------
    ValueError if ``size`` is smaller than 1.
    '''
    if size < 1:
        raise ValueError('size must be a positive integer')
    # Start at 0 so the first column is included, and step/slice by ``size``
    # so the ``size`` argument is actually honoured.
    for start in range(0, len(column_names), size):
        print(column_names[start:start + size].values)

def _chr_pos_keys(table, chrom_col, pos_col):
    '''Build one "<chrom>_<pos>" string per row of ``table``.'''
    # Iterating the two columns directly (instead of DataFrame.apply with
    # axis=1) keeps each value in its own column dtype, so an integer position
    # is never rendered as "100.0" just because another column holds floats.
    # It also works for empty tables and avoids building a Series per row.
    return ['%s_%s' % pair for pair in zip(table[chrom_col], table[pos_col])]


def add_new_column_to_cadd_gavin_tables(cadd_anno, gavin_anno):
    '''
    Add a 'chr_pos' key column ("<chromosome>_<position>") to both tables.

    Both DataFrames are modified in place; nothing is returned.

    Parameters
    ----------
    cadd_anno : DataFrame with '#Chrom' and 'Pos' columns.
    gavin_anno : DataFrame with 'chr' and 'pos' columns.

    Very specific to the column names used in this notebook.
    '''
    cadd_anno['chr_pos'] = _chr_pos_keys(cadd_anno, '#Chrom', 'Pos')
    gavin_anno['chr_pos'] = _chr_pos_keys(gavin_anno, 'chr', 'pos')
    print('Added a chr_pos columns to both cadd_anno and gavin_anno table')

In [7]:
# Driver cell: annotate the CADD table with the 'group' label from the
# calibration-variant table, discard CADD rows without a label, and hand the
# remainder to drop_impute.
if __name__=='__main__':
    # Input and output locations, relative to the current working directory.
    cadd_path = os.path.join('data','all_variants','cadd_annotation.tsv')
    info_path = os.path.join('data','all_variants','calibrationvariants_r0.4.tsv')
    save_path = os.path.join('data','all_variants','cadd_with_info_no_nan_all_imputed.tsv')
    
    cadd_orginal = load_cadd_annotation(cadd_path)
    info_all_var = pd.read_csv(info_path,sep='\t')
    # Add a shared 'chr_pos' key column to both tables (modifies them in place)
    # so the CADD rows can be matched against the info table.
    add_new_column_to_cadd_gavin_tables(cadd_orginal,info_all_var)

    # Mapping chr_pos -> group taken from the info table.
    # This must stay a module-level name: map_myh7_chrposDic reads it as a global.
    chrpos_info_dic = info_all_var.set_index('chr_pos')['group'].to_dict()
    # INFO holds the group for each CADD row, or the string 'NA' when the
    # row's chr_pos is not a key of the mapping.
    cadd_orginal['INFO'] = cadd_orginal['chr_pos'].apply(map_myh7_chrposDic)

    # Drop variants that were annotated by CADD but have no entry in the info table.
    all_variants = cadd_orginal.drop(cadd_orginal.index[cadd_orginal.INFO=='NA'],axis=0)
    print('Discarded variants_gene that could not be found in info.')
    print('Finally we are left with gene variants info with shape:',all_variants.shape)
   
    # drop_impute is imported from lib.preprocessing_cadd_annotation and is not
    # visible here; judging by the log output below it drops unused features,
    # imputes missing values and writes to SAVE_PATH -- TODO confirm in that module.
    drop_impute(all_variants,GeneName=True,POS=True,SAVE_PATH=save_path)

Cadd annotation loaded.
Added a chr_pos columns to both cadd_anno and gavin_anno table
Discarded variants_gene that could not be found in info.
Finally we are left with gene variants info with shape: (350503, 118)
The original data table has shape: (350503, 118)
Deleted features that were not used by cadd paper.
Deleted features without a single row value.
Impute with recommended values in cadd paper.
4212 Anc
69637 isTv
42 priPhCons
11208 bStatistic
70 mutIndex
320266 mirSVR-Score
320266 mirSVR-E
320266 mirSVR-Aln
350487 targetScan
11390 fitCons
758 EncH3K27Ac
1013 EncH3K4Me1
585 EncH3K4Me3
2830 EncNucleo
214 Segway
349579 motifECount
349579 motifEName
349579 motifEHIPos
349579 motifEScoreChng
68483 cDNApos
68483 relcDNApos
72796 CDSpos
72796 relCDSpos
72796 protPos
72796 relProtPos
121137 Domain
259931 Dst2Splice
259931 Dst2SplType
114605 oAA
114605 nAA
162523 Grantham
173752 PolyPhenCat
169743 SIFTcat
Deleted columns that I do not know how to impute:
Added a column with GeneName.
Ad