# Prediction bias in annotation groups

In [4]:
import pandas as pd
import numpy as np

import os,subprocess
# Working directory of the modelling step; the relative paths used by the
# loading cells of this notebook are resolved against it.
workdir='/nagyvinyok/adat84/sotejedlik/ribli/methylation_code/modelling'
# Create the directory in-process rather than shelling out to `mkdir`:
# nothing is printed when it already exists (e.g. on Restart & Run All),
# no external binary is needed, and a real failure raises instead of being
# silently ignored through an unchecked return code.
if not os.path.isdir(workdir):
    os.makedirs(workdir)
os.chdir(workdir)

In [20]:
# Names for the columns of the header-less, tab-separated annotation table.
annot_columns=[
    'id',
    'Regulatory_Feature_Group',
    'Relation_to_UCSC_CpG_Island',
    'Strand',
    'Infinium_Design_Type',
    'Random_Loci',
    'Methyl27_Loci',
]
annot=pd.read_csv('../explore_data/relevant_annotations.csv',sep='\t',header=None)
annot.columns=annot_columns
# Missing annotation values become 0 (they appear as the `0` group in the
# group-by tables of this notebook).
annot=annot.fillna(0)
annot.head()

Unnamed: 0,id,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg00035864,0,0,F,II,0,0
1,cg00050873,0,N_Shore,R,I,0,0
2,cg00061679,0,0,R,II,0,0
3,cg00063477,0,S_Shelf,F,II,0,0
4,cg00121626,0,N_Shore,R,II,0,0


In [10]:
# Per-id test predictions: true label, predicted label and an error flag
# (tab-separated file read from the working directory).
preds=pd.read_csv('rf_test_preds.csv',delimiter='\t')
preds.head(5)

Unnamed: 0,id,label,prediction,error
0,cg19752143,1,1,0
1,cg05219517,0,0,0
2,cg05218696,1,1,0
3,cg09329621,1,1,0
4,cg17608706,1,0,1


In [21]:
# Attach the annotation columns to every prediction. The default inner join
# on `id` keeps only the ids that occur in both tables.
data=pd.merge(preds,annot,on='id')
data.head()

Unnamed: 0,id,label,prediction,error,Regulatory_Feature_Group,Relation_to_UCSC_CpG_Island,Strand,Infinium_Design_Type,Random_Loci,Methyl27_Loci
0,cg19752143,1,1,0,0,0,F,II,0,0
1,cg05219517,0,0,0,Promoter_Associated,Island,R,I,0,0
2,cg05218696,1,1,0,0,0,F,II,0,0
3,cg09329621,1,1,0,0,0,R,II,0,0
4,cg17608706,1,0,1,Gene_Associated,N_Shelf,F,II,0,0


---

## Predictions


In [40]:
# Average the numeric columns explicitly. The merged frame also contains
# string annotation columns; older pandas silently dropped them from
# `data.mean()`, while pandas >= 2.0 raises a TypeError instead.
data[['label','prediction','error']].mean()

label         0.5000
prediction    0.5092
error         0.2652
dtype: float64

----
## Check bias

### There is bias in "relation to CpG islands"

- Non-island sites (0), islands and island shelves have similar error rates, although they have very different bias!


- The shores are classified very poorly, which raises the overall error rate from 21% to 26%
    - This might be understandable: they lie at the edge of a cluster with correlated methylation, so their state can be hard to guess

In [31]:
# Number of rows in each CpG-island relation category (selecting `id`
# before counting so only that column is aggregated).
data.groupby(['Relation_to_UCSC_CpG_Island'])['id'].count()

Relation_to_UCSC_CpG_Island
0          6349
Island     7856
N_Shelf     911
N_Shore    2331
S_Shelf     755
S_Shore    1798
Name: id, dtype: int64

In [36]:
# Mean label and mean error per CpG-island relation category. The columns
# are selected before aggregating: calling `.mean()` on the whole grouped
# frame would also try to average the string columns, which raises a
# TypeError in pandas >= 2.0.
data.groupby(['Relation_to_UCSC_CpG_Island'])[['label','error']].mean().reset_index()

Unnamed: 0,Relation_to_UCSC_CpG_Island,label,error
0,0,0.869113,0.2172
1,Island,0.136711,0.215759
2,N_Shelf,0.94292,0.210757
3,N_Shore,0.461175,0.442299
4,S_Shelf,0.948344,0.206623
5,S_Shore,0.42158,0.473304


---

### No real bias in Regulatory feature group

- Only a few groups have a large number of sites (0, Promoter_Associated, Unclassified, Unclassified_Cell_type_specific); they don't show very strong bias

- Actually, among these large groups, the 0 group (no information) has been classified the best!

In [29]:
# Number of rows in each regulatory feature group (selecting `id` before
# counting so only that column is aggregated).
data.groupby(['Regulatory_Feature_Group'])['id'].count()

Regulatory_Feature_Group
0                                         11448
Gene_Associated                              40
Gene_Associated_Cell_type_specific           58
NonGene_Associated                           71
NonGene_Associated_Cell_type_specific         5
Promoter_Associated                        5534
Promoter_Associated_Cell_type_specific      243
Unclassified                               1396
Unclassified_Cell_type_specific            1205
Name: id, dtype: int64

In [37]:
# Mean label and mean error per regulatory feature group. The columns are
# selected before aggregating: calling `.mean()` on the whole grouped frame
# would also try to average the string columns, which raises a TypeError in
# pandas >= 2.0.
data.groupby(['Regulatory_Feature_Group'])[['label','error']].mean().reset_index()

Unnamed: 0,Regulatory_Feature_Group,label,error
0,0,0.794287,0.253057
1,Gene_Associated,0.725,0.325
2,Gene_Associated_Cell_type_specific,0.931034,0.62069
3,NonGene_Associated,0.056338,0.380282
4,NonGene_Associated_Cell_type_specific,0.6,0.0
5,Promoter_Associated,0.021142,0.26798
6,Promoter_Associated_Cell_type_specific,0.18107,0.316872
7,Unclassified,0.122493,0.305874
8,Unclassified_Cell_type_specific,0.40249,0.285477


---

### No bias below

In [24]:
# Mean error per strand. `error` is selected before aggregating so the
# string columns are never averaged (that raises a TypeError in pandas >= 2.0).
data.groupby(['Strand'])['error'].mean()

Strand
F    0.262442
R    0.268040
Name: error, dtype: float64

In [25]:
# Mean error per Infinium design type. `error` is selected before
# aggregating so the string columns are never averaged (that raises a
# TypeError in pandas >= 2.0).
data.groupby(['Infinium_Design_Type'])['error'].mean()

Infinium_Design_Type
I     0.267585
II    0.263984
Name: error, dtype: float64

In [26]:
# Mean error per Random_Loci value. `error` is selected before aggregating
# so the string columns are never averaged (that raises a TypeError in
# pandas >= 2.0).
data.groupby(['Random_Loci'])['error'].mean()

Random_Loci
0       0.265679
True    0.207317
Name: error, dtype: float64

In [27]:
# Mean error per Methyl27_Loci value. `error` is selected before aggregating
# so the string columns are never averaged (that raises a TypeError in
# pandas >= 2.0).
data.groupby(['Methyl27_Loci'])['error'].mean()

Methyl27_Loci
0       0.265049
True    0.267547
Name: error, dtype: float64