# Prediction bias in annotation groups

In [1]:
import pandas as pd
import numpy as np

import os,subprocess
# Working directory for this analysis; every relative path used below is
# resolved against it.
workdir='/nagyvinyok/adat84/sotejedlik/ribli/methylation_code/modelling'
# Create the directory (and any missing parents) from Python instead of
# spawning an external `mkdir` process whose exit status was ignored.
if not os.path.isdir(workdir):
    os.makedirs(workdir)
os.chdir(workdir)

In [2]:
# Load the annotation table; the file is tab-separated and is read without a
# header row, so the column names are assigned explicitly afterwards.
annot=pd.read_csv(
    '../explore_data/relevant_annotations.csv',
    sep='\t',
    header=None,
)
annot.columns=[
    'id',
    'Regulatory_Feature_Group',
    'Relation_to_UCSC_CpG_Island',
    'Strand',
    'Infinium_Design_Type',
    'Random_Loci',
    'Methyl27_Loci',
]
# Missing annotation values are replaced by 0 so that they form their own
# group when the table is grouped later on.
annot=annot.fillna(0)
annot.head()

Unnamed: 0,id,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg00035864,0,0,F,II,0,0
1,cg00050873,0,N_Shore,R,I,0,0
2,cg00061679,0,0,R,II,0,0
3,cg00063477,0,S_Shelf,F,II,0,0
4,cg00121626,0,N_Shore,R,II,0,0


In [3]:
# Load the test-set predictions (tab-separated, with a header row).
preds=pd.read_csv('rf_test_preds.csv',sep='\t')
# Per-row indicator columns: 1 where label and prediction are both 1 (tp),
# respectively both 0 (tn); stored as int32.
preds['tp']=(preds['label'].eq(1) & preds['prediction'].eq(1)).astype(np.int32)
preds['tn']=(preds['label'].eq(0) & preds['prediction'].eq(0)).astype(np.int32)
# Constant helper columns: 'count' is aggregated into group sizes, and
# 'nothing' is a single-valued key used to evaluate the whole set as one group.
preds['count']=1
preds['nothing']=''
preds.head()

Unnamed: 0,id,label,prediction,error,tp,tn,count,nothing
0,cg19752143,1,1,0,1,0,1,
1,cg05219517,0,0,0,0,1,1,
2,cg05218696,1,1,0,1,0,1,
3,cg09329621,1,1,0,1,0,1,
4,cg17608706,1,0,1,0,0,1,


In [4]:
# Attach the annotation columns to every prediction row by matching on 'id'
# (default inner join: ids present in only one table are dropped).
data=pd.merge(preds,annot,on='id')
data.head()

Unnamed: 0,id,label,prediction,error,tp,tn,count,nothing,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg19752143,1,1,0,1,0,1,,0,0,F,II,0,0
1,cg05219517,0,0,0,0,1,1,,Promoter_Associated,Island,R,I,0,0
2,cg05218696,1,1,0,1,0,1,,0,0,F,II,0,0
3,cg09329621,1,1,0,1,0,1,,0,0,R,II,0,0
4,cg17608706,1,0,1,0,0,1,,Gene_Associated,N_Shelf,F,II,0,0


----
## Check bias

In [5]:
def evaluate(data,var):
    """Summarise prediction quality separately for each value of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by `var` plus the columns 'count',
        'label', 'prediction', 'error', 'tn' and 'tp'.
    var : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns
        [var, 'count', 'label', 'prediction', 'error', 'sens', 'spec'], where
        'count' is the number of rows in the group, 'label', 'prediction' and
        'error' are group means, 'sens' is mean(tp)/mean(label) and 'spec' is
        mean(tn)/(1-mean(label)).

    Notes
    -----
    With 0/1 labels and the 0/1 'tp'/'tn' indicator columns built earlier in
    this notebook, 'sens' equals TP/P and 'spec' equals TN/N. A group without
    positives (or without negatives) divides by zero, so the corresponding
    ratio is not a finite number.
    """
    mean_cols=['label','prediction','error','tn','tp']
    group=data[[var,'count']+mean_cols].groupby(var)

    # Use string aggregator names: passing callables such as np.mean or len
    # raises a FutureWarning in recent pandas, while the names keep the
    # current meaning (row count per group, arithmetic mean per group).
    how={'count':'size'}
    how.update((col,'mean') for col in mean_cols)
    agg=group.agg(how).reset_index()

    agg['spec']=agg['tn']/(1-agg['label'])
    agg['sens']=agg['tp']/agg['label']
    return agg[[var,'count','label','prediction','error','sens','spec']]

In [6]:
evaluate(data,'nothing')

Unnamed: 0,nothing,count,label,prediction,error,sens,spec
0,,20000,0.5,0.5092,0.2652,0.744,0.7256


### There is bias in "relation to CpG islands"

- Loci outside islands (0), islands and island shelves have similar error rates (≈ 0.21)


- The shores (N_Shore, S_Shore) are classified much worse (error ≈ 0.44–0.47), which raises the overall error rate
    - This may be understandable: they lie at the edge of a cluster with correlated methylation, so their state can be hard to guess

In [7]:
evaluate(data,'Relation_to_UCSC_CpG_Island')

Unnamed: 0,Relation_to_UCSC_CpG_Island,count,label,prediction,error,sens,spec
0,0,6349,0.869113,0.824539,0.2172,0.849402,0.340554
1,Island,7856,0.136711,0.135311,0.215759,0.205773,0.875848
2,N_Shelf,911,0.94292,0.80022,0.210757,0.812573,0.403846
3,N_Shore,2331,0.461175,0.614329,0.442299,0.686512,0.447452
4,S_Shelf,755,0.948344,0.821192,0.206623,0.824022,0.230769
5,S_Shore,1798,0.42158,0.614572,0.473304,0.667546,0.424038


---

### No very strong bias in Regulatory feature group

- Only a few groups contain a large number of loci: 0 (no annotation), Promoter_Associated and Unclassified.


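- The model did not learn the extreme bias of promoters: only about 2% of promoter-associated loci have label 1, yet the model predicts 1 for about 28% of them instead of predicting 0 for all promoters
    - Of course, it is not able to recover this kind of information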

In [8]:
evaluate(data,'Regulatory_Feature_Group')

Unnamed: 0,Regulatory_Feature_Group,count,label,prediction,error,sens,spec
0,0,11448,0.794287,0.660203,0.253057,0.756296,0.710828
1,Gene_Associated,40,0.725,0.7,0.325,0.758621,0.454545
2,Gene_Associated_Cell_type_specific,58,0.931034,0.37931,0.62069,0.37037,0.5
3,NonGene_Associated,71,0.056338,0.43662,0.380282,1.0,0.597015
4,NonGene_Associated_Cell_type_specific,5,0.6,0.6,0.0,1.0,1.0
5,Promoter_Associated,5534,0.021142,0.277918,0.26798,0.735043,0.731955
6,Promoter_Associated_Cell_type_specific,243,0.18107,0.382716,0.316872,0.681818,0.683417
7,Unclassified,1396,0.122493,0.353868,0.305874,0.695906,0.693878
8,Unclassified_Cell_type_specific,1205,0.40249,0.346058,0.285477,0.575258,0.808333


---

### No interesting bias below

In [9]:
evaluate(data,'Strand')

Unnamed: 0,Strand,count,label,prediction,error,sens,spec
0,F,10147,0.496797,0.509707,0.262442,0.748859,0.7264
1,R,9853,0.503298,0.508678,0.26804,0.73906,0.724765


In [10]:
evaluate(data,'Infinium_Design_Type')

Unnamed: 0,Infinium_Design_Type,count,label,prediction,error,sens,spec
0,I,6753,0.255294,0.269954,0.267585,0.50464,0.810499
1,II,13247,0.624745,0.631162,0.263984,0.793862,0.63971


In [11]:
evaluate(data,'Random_Loci')

Unnamed: 0,Random_Loci,count,label,prediction,error,sens,spec
0,0,19836,0.497278,0.507159,0.265679,0.742802,0.725933
1,True,164,0.829268,0.756098,0.207317,0.830882,0.607143


In [12]:
evaluate(data,'Methyl27_Loci')

Unnamed: 0,Methyl27_Loci,count,label,prediction,error,sens,spec
0,0,18789,0.518388,0.517324,0.265049,0.743326,0.725937
1,True,1211,0.214699,0.383154,0.267547,0.769231,0.722397
