In [1]:
import pandas as pd
import os
import numpy as np

from lib.read_data import dataset,Datasets

In [15]:
def display_column_names(column_names,size = 4):
    '''
    when there are too many columns
    display every four column names in a line
    '''
    for i in range(1,len(column_names),size):
        print(column_names[i:i+4].values)

def add_new_column_to_cadd_gavin_tables(cadd_anno,gavin_anno):
    '''
    very specific for the case here only
    probably needs better structuring
    '''
    cadd_anno['chr_pos'] = cadd_anno.apply(lambda row: '%s_%s'%
                                         (row['#Chrom'],row['Pos']),axis=1)
    gavin_anno['chr_pos'] = gavin_anno.apply(lambda row: '%s_%s'%
                                         (row['chr'],row['pos']),axis=1)
    print('Added new columns to both tables')    

def compare_two_anno_tables(cadd_anno, gavin_anno):
    '''
    examine whether all enriched varaints are del type
    input: Pandas.DataFrame
    '''
    different_variants = {}
    diff_set = set(cadd_anno['chr_pos']) - set(gavin_anno['chr_pos'])
#     return diff_set
    for variant in diff_set:
        chr_id,pos_id = variant.split('_')
        pos_id = np.int64(pos_id)
        variant_info = cadd_anno.loc[(cadd_anno['#Chrom']==chr_id)&(cadd_anno['Pos']==pos_id),:]
        different_variants[variant] = variant_info
    return different_variants        
        
      
def read_data_set(data_table,test_size=0.25,BENCHMARK=False):
    '''
    convert a pandas dataframe data table into Datasets(dataset,dataset)
    '''
    train, test = train_test_split(data_table,test_size=0.25)
    train_x = train[[col for col in train.columns
                     if col not in ['group_PATHOGENIC','group_BENIGN']]]
    features = train_x.columns
    train_x = np.array(train_x)
    test_x = np.array(test[[col for col in train.columns
                            if col not in ['group']]])
    train_y = np.array(train['group'],dtype=np.int8)
    test_y = np.array(test['group'],dtype=np.int8)


    if BENCHMARK:
        return Datasets(train=dataset(train_x,train_y,features),
                        test=dataset(test_x,test_y,features)),\
                        train['gavin_res'],\
                        test['gavin_res']
    return Datasets(train=dataset(train_x,train_y,features),
                    test=dataset(test_x,test_y,features))

In [3]:
# load original cadd annotation
cadd_file = os.path.join('data','cadd_annotation.tsv')
dtype = {'#Chrom':np.object,
         'isDerived':np.object,
         'motifEName':np.object,
         'motifEHIPos':np.object,
         'PolyPhenCat':np.object,
         'SIFTcat':np.object}
cadd_anno = pd.read_csv(cadd_file,sep='\t',dtype=dtype)

# load gavin annotation
gavin_file = os.path.join('data','calibrationvariants_r0.4.tsv')
gavin_anno = pd.read_csv(gavin_file,sep='\t')

In [4]:
cadd_anno.head()

Unnamed: 0,#Chrom,Pos,Ref,Anc,Alt,Type,Length,isTv,isDerived,AnnoType,...,Intron,oAA,nAA,Grantham,PolyPhenCat,PolyPhenVal,SIFTcat,SIFTval,RawScore,PHRED
0,1,1167659,A,A,G,SNV,0,False,True,CodingTranscript,...,,M,V,21.0,,,deleterious,0.0,1.106787,11.25
1,1,1167674,C,C,T,SNV,0,False,True,CodingTranscript,...,,R,W,101.0,,,deleterious,0.03,4.482313,24.2
2,1,1167680,TGGC,TGGC,T,DEL,3,,True,CodingTranscript,...,,R,-,,,,,,1.924056,15.74
3,1,1167680,TGGC,TGGC,T,DEL,3,,True,Intergenic,...,,,,,,,,,1.924056,15.74
4,1,1167680,TGGC,TGGC,T,DEL,3,,True,RegulatoryFeature,...,,,,,,,,,1.924056,15.74


In [5]:
display_column_names(cadd_anno.columns)

['Pos' 'Ref' 'Anc' 'Alt']
['Type' 'Length' 'isTv' 'isDerived']
['AnnoType' 'Consequence' 'ConsScore' 'ConsDetail']
['GC' 'CpG' 'mapAbility20bp' 'mapAbility35bp']
['scoreSegDup' 'priPhCons' 'mamPhCons' 'verPhCons']
['priPhyloP' 'mamPhyloP' 'verPhyloP' 'GerpN']
['GerpS' 'GerpRS' 'GerpRSpval' 'bStatistic']
['mutIndex' 'dnaHelT' 'dnaMGW' 'dnaProT']
['dnaRoll' 'mirSVR-Score' 'mirSVR-E' 'mirSVR-Aln']
['targetScan' 'fitCons' 'cHmmTssA' 'cHmmTssAFlnk']
['cHmmTxFlnk' 'cHmmTx' 'cHmmTxWk' 'cHmmEnhG']
['cHmmEnh' 'cHmmZnfRpts' 'cHmmHet' 'cHmmTssBiv']
['cHmmBivFlnk' 'cHmmEnhBiv' 'cHmmReprPC' 'cHmmReprPCWk']
['cHmmQuies' 'EncExp' 'EncH3K27Ac' 'EncH3K4Me1']
['EncH3K4Me3' 'EncNucleo' 'EncOCC' 'EncOCCombPVal']
['EncOCDNasePVal' 'EncOCFairePVal' 'EncOCpolIIPVal' 'EncOCctcfPVal']
['EncOCmycPVal' 'EncOCDNaseSig' 'EncOCFaireSig' 'EncOCpolIISig']
['EncOCctcfSig' 'EncOCmycSig' 'Segway' 'tOverlapMotifs']
['motifDist' 'motifECount' 'motifEName' 'motifEHIPos']
['motifEScoreChng' 'TFBS' 'TFBSPeaks' 'TFBSPeaksMax'

In [6]:
gavin_anno.head()

Unnamed: 0,gene,chr,pos,ref,alt,group,effect,impact,cadd
0,NUP107,12,69107589,G,A,PATHOGENIC,splice_donor_variant,HIGH,24.7
1,NUP107,12,69090619,G,T,PATHOGENIC,missense_variant,MODERATE,28.1
2,NUP107,12,69084526,G,A,PATHOGENIC,missense_variant,MODERATE,19.16
3,NUP107,12,69129114,A,C,PATHOGENIC,missense_variant,MODERATE,31.0
4,NUP107,12,69109513,CAGAAG,C,PATHOGENIC,frameshift_variant,HIGH,35.0


In [18]:
myh7_cadd = cadd_anno.loc[cadd_anno['GeneName']=='MYH7',:].copy()
myh7_gavin = gavin_anno.loc[gavin_anno['gene']=='MYH7',:].copy()
print(myh7_cadd.shape,myh7_gavin.shape)

(930, 116) (846, 9)


In [25]:
# annotate cadd_anno with group column in gavin_anno
add_new_column_to_cadd_gavin_tables(myh7_cadd,myh7_gavin)
# a dictionary with gavin group info
chrpos_info_dic = myh7_gavin.set_index('chr_pos')['group'].to_dict()
# map the myh7_cadd with the chrpos_info_dic
def map_myh7_chrposDic(row):
    if row in chrpos_info_dic:
        return chrpos_info_dic[row]
    else:
        return 'NA'
myh7_cadd['INFO'] = myh7_cadd['chr_pos'].apply(map_myh7_chrposDic)

Added new columns to both tables


In [39]:
myh7_cadd.shape

(930, 118)

In [41]:
all_variants = myh7_cadd.drop(myh7_cadd.index[myh7_cadd.INFO=='NA'],axis=0)

In [43]:
all_variants.to_csv('data/mhy7.tsv',sep='\t',index=False)

# just for fun, try use the gavin annotation file for pathogenicity prediction

In [6]:
categorical_feature_names = \
gavin_anno.select_dtypes(include=[np.object]).columns
for col in categorical_feature_names:
    gavin_anno[col] = gavin_anno[col].astype('category')
dummy_features = pd.get_dummies(gavin_anno,
                                columns=categorical_feature_names)

In [8]:
dummy_features.shape

(346278, 14199)

In [9]:
cadd_scale = dummy_features['cadd'].max() - dummy_features['cadd'].min()
dummy_features['cadd_scaled'] = (dummy_features['cadd']-
                                 dummy_features['cadd'].min())/cadd_scale

# need a better way for scoring the data otherwise too slow, probably not in txt format

In [None]:
# save the processed table
dummy_features.to_csv(os.path.join('data','processed_gavin_annotation.tsv'),sep='\t',index=False)

# if I don't use ['chr','gene','alt','ref']

In [17]:
categorical_feature_names = ['group','impact','effect']
for col in categorical_feature_names:
    gavin_anno[col] = gavin_anno[col].astype('category')
dummy_light = pd.get_dummies(gavin_anno,
                                columns=categorical_feature_names)

In [18]:
dummy_light.head()

Unnamed: 0,gene,chr,pos,ref,alt,cadd,group_PATHOGENIC,group_POPULATION,impact_HIGH,impact_LOW,...,effect_protein_protein_contact,effect_rare_amino_acid_variant,effect_splice_acceptor_variant,effect_splice_donor_variant,effect_splice_region_variant,effect_start_lost,effect_stop_gained,effect_stop_lost,effect_stop_retained_variant,effect_synonymous_variant
0,NUP107,12,69107589,G,A,24.7,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,NUP107,12,69090619,G,T,28.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NUP107,12,69084526,G,A,19.16,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NUP107,12,69129114,A,C,31.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NUP107,12,69109513,CAGAAG,C,35.0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
