# Prediction bias in annotation groups

In [1]:
import pandas as pd
import numpy as np

import os,subprocess
workdir='/nagyvinyok/adat84/sotejedlik/ribli/methylation_code/modelling'
# Create the working directory from Python instead of shelling out to `mkdir`:
# exist_ok=True keeps re-running this cell silent when the directory already
# exists, and missing parent directories are created as well.
os.makedirs(workdir,exist_ok=True)
# All relative paths used later in the notebook are resolved against workdir.
os.chdir(workdir)

In [2]:
# Column names for the header-less, tab-separated annotation table.
annot_columns=['id','Regulatory_Feature_Group','Relation_to_UCSC_CpG_Island',
    'Strand','Infinium_Design_Type','Random_Loci','Methyl27_Loci']

annot=pd.read_csv('../explore_data/relevant_annotations.csv',sep='\t',header=None)
annot.columns=annot_columns

# Missing annotations are replaced by 0, so "no annotation" shows up as its own
# group (0) in the tables below. Note that this puts the number 0 into columns
# that otherwise hold strings or booleans.
annot=annot.fillna(0)
annot.head()

Unnamed: 0,id,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg00035864,0,0,F,II,0,0
1,cg00050873,0,N_Shore,R,I,0,0
2,cg00061679,0,0,R,II,0,0
3,cg00063477,0,S_Shelf,F,II,0,0
4,cg00121626,0,N_Shore,R,II,0,0


In [3]:
preds=pd.read_csv('cnn_test_preds.csv',sep='\t')

# Boolean masks for the two "correct" outcomes.
hit_positive=(preds['label']==1) & (preds['prediction']==1)
hit_negative=(preds['label']==0) & (preds['prediction']==0)

# Per-row 0/1 indicators; their group means are used later for sens/spec.
preds['tp']=hit_positive.astype('int32')
preds['tn']=hit_negative.astype('int32')

# Helper columns: 'count' is aggregated into the group size, and 'nothing' is a
# constant key that puts every row into a single group.
preds['count']=1
preds['nothing']=''
preds.head()

Unnamed: 0,id,label,prediction,error,tp,tn,count,nothing
0,cg19752143,1,1,0,1,0,1,
1,cg05219517,0,0,0,0,1,1,
2,cg05218696,1,1,0,1,0,1,
3,cg09329621,1,1,0,1,0,1,
4,cg17608706,1,1,0,1,0,1,


In [4]:
# Join predictions with their annotations on the probe id (pandas' default
# inner join: only ids present in both tables are kept).
data=pd.merge(preds,annot,on='id')
data.head()

Unnamed: 0,id,label,prediction,error,tp,tn,count,nothing,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg19752143,1,1,0,1,0,1,,0,0,F,II,0,0
1,cg05219517,0,0,0,0,1,1,,Promoter_Associated,Island,R,I,0,0
2,cg05218696,1,1,0,1,0,1,,0,0,F,II,0,0
3,cg09329621,1,1,0,1,0,1,,0,0,R,II,0,0
4,cg17608706,1,1,0,1,0,1,,Gene_Associated,N_Shelf,F,II,0,0


----
## Check bias

In [5]:
def evaluate(data,var):
    """Summarise prediction quality for every value of one grouping column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by `var` plus the columns 'count',
        'label', 'prediction', 'error', 'tn' and 'tp'.
    var : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns
        [var, 'count', 'label', 'prediction', 'error', 'sens', 'spec']:
        'count' is the number of rows in the group, 'label', 'prediction' and
        'error' are group means, and 'sens' / 'spec' are derived as below.
        A group whose mean label is 0 (or 1) gets a non-finite 'sens'
        (or 'spec') because of the division by zero.
    """
    mean_cols=['label','prediction','error','tn','tp']
    group=data[[var,'count']+mean_cols].groupby([var])

    # String aliases are used instead of the len / np.mean callables: passing
    # those callables to agg() is deprecated in recent pandas releases and
    # triggers a FutureWarning there. 'size' counts rows exactly like len did.
    how={'count':'size'}
    how.update({col:'mean' for col in mean_cols})
    agg=group.agg(how).reset_index()

    # With 0/1 indicator columns, mean(tn)/(1-mean(label)) equals TN/(#negatives)
    # and mean(tp)/mean(label) equals TP/(#positives).
    agg['spec']=agg['tn']/(1-agg['label'])
    agg['sens']=agg['tp']/agg['label']
    return agg[[var,'count','label','prediction','error','sens','spec']]

In [6]:
# 'nothing' is a constant empty-string column, so every row falls into a single
# group: this yields the overall metrics for all merged rows.
evaluate(data,'nothing')

Unnamed: 0,nothing,count,label,prediction,error,sens,spec
0,,20000,0.5,0.57625,0.15675,0.9195,0.767


### There is bias in "relation to CpG islands"

- Probes outside islands (group 0), in islands and in island shelves have similar error rates (roughly 0.07–0.15)


- The shores are classified much worse (error ≈ 0.23), which raises the overall error rate
    - This might be understandable: they lie at the edge of a cluster with correlated methylation, so their state can be hard to guess

In [7]:
# Metrics per CpG-island relation; the 0 group holds the probes whose value was
# missing in the annotation table (filled in by fillna(0)).
evaluate(data,'Relation_to_UCSC_CpG_Island')

Unnamed: 0,Relation_to_UCSC_CpG_Island,count,label,prediction,error,sens,spec
0,0,6349,0.869113,0.93054,0.136084,0.95705,0.245487
1,Island,7856,0.136711,0.218814,0.151604,0.74581,0.864642
2,N_Shelf,911,0.94292,0.935236,0.086718,0.949942,0.307692
3,N_Shore,2331,0.461175,0.583012,0.230802,0.88186,0.672771
4,S_Shelf,755,0.948344,0.948344,0.074172,0.960894,0.282051
5,S_Shore,1798,0.42158,0.540044,0.226363,0.872032,0.701923


---

### No very strong bias in Regulatory feature group

- Only a few groups contain a large number of probes: 0 (no annotation), Promoter_Associated and the Unclassified groups.


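- The model did not learn the extreme label bias of promoters: only about 2% of Promoter_Associated probes have label 1, yet the model predicts 1 for about 22% of them, i.e. it does not simply say 0 for all promoters
    - Of course, it is not able to recover this kind of information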

In [8]:
# Metrics per regulatory feature group; the 0 group holds the probes whose value
# was missing in the annotation table (filled in by fillna(0)).
evaluate(data,'Regulatory_Feature_Group')

Unnamed: 0,Regulatory_Feature_Group,count,label,prediction,error,sens,spec
0,0,11448,0.794287,0.787299,0.112509,0.924777,0.743524
1,Gene_Associated,40,0.725,0.875,0.25,0.931034,0.272727
2,Gene_Associated_Cell_type_specific,58,0.931034,0.913793,0.086207,0.944444,0.5
3,NonGene_Associated,71,0.056338,0.408451,0.380282,0.75,0.61194
4,NonGene_Associated_Cell_type_specific,5,0.6,1.0,0.4,1.0,0.0
5,Promoter_Associated,5534,0.021142,0.218287,0.208348,0.735043,0.792874
6,Promoter_Associated_Cell_type_specific,243,0.18107,0.374486,0.242798,0.863636,0.733668
7,Unclassified,1396,0.122493,0.36533,0.278653,0.853801,0.702857
8,Unclassified_Cell_type_specific,1205,0.40249,0.482158,0.167635,0.890722,0.793056


---

### No interesting bias below

In [9]:
# Metrics per value of the Strand column (F / R in the annotation table).
evaluate(data,'Strand')

Unnamed: 0,Strand,count,label,prediction,error,sens,spec
0,F,10147,0.496797,0.583128,0.16596,0.919857,0.749315
1,R,9853,0.503298,0.569167,0.147265,0.919137,0.785452


In [10]:
# Metrics per value of the Infinium_Design_Type column (I / II in the annotation table).
evaluate(data,'Infinium_Design_Type')

Unnamed: 0,Infinium_Design_Type,count,label,prediction,error,sens,spec
0,I,6753,0.255294,0.354657,0.163631,0.87413,0.823424
1,II,13247,0.624745,0.689213,0.153242,0.928951,0.709918


In [11]:
# Metrics split by the Random_Loci flag; the 0 group is the fillna(0) placeholder
# for rows where the flag was missing, True marks the flagged probes.
evaluate(data,'Random_Loci')

Unnamed: 0,Random_Loci,count,label,prediction,error,sens,spec
0,0,19836,0.497278,0.574007,0.157491,0.918796,0.767048
1,True,164,0.829268,0.847561,0.067073,0.970588,0.75


In [12]:
# Metrics split by the Methyl27_Loci flag; the 0 group is the fillna(0) placeholder
# for rows where the flag was missing, True marks the flagged probes.
evaluate(data,'Methyl27_Loci')

Unnamed: 0,Methyl27_Loci,count,label,prediction,error,sens,spec
0,0,18789,0.518388,0.591995,0.156528,0.920021,0.761079
1,True,1211,0.214699,0.331957,0.160198,0.9,0.823344
