In [1]:
''' Import necessary libraries'''
import os
import time
#Please fill in the input folder path and the output folder path
input_path = r'C:\Users\phuph\Desktop\breast-cancer-wisconsin.data.csv'
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import matplotlib.pylab as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import StratifiedKFold
import random



In [2]:
''' Read in the breast cancer dataset'''
df = pd.read_csv(input_path,header = None )
df.shape


(699, 11)

In [3]:
''' Only use a portion of the instances for estimating the noise rate'''
X_train = df.iloc[:,1:4].values
Y_train = df[10].values

In [4]:
''' Induce the CCN noise by flip rate'''
rho_po_list = [0.1,0.2,0.3,0.4,0.4]
rho_ne_list = [0.3,0.1,0.4,0.1,0.4]
Y_new_list = []
for i in range(5):
    rho_po = rho_po_list[i]
    rho_ne = rho_ne_list[i]
    Y_temp = np.copy(Y_train)
    for j in range(len(Y_temp)):
        if Y_train[j] ==2:
            temp = random.random()
            if temp < rho_po:
                Y_temp[j] = 4
        else:
            t = random.random()
            if t < rho_ne:
                Y_temp[j] = 2
    Y_new_list.append([Y_temp,rho_po,rho_ne])

In [5]:
''' Import R and KLIEP method'''
import rpy2
import rpy2.robjects as robjects
r = robjects.r
#from rpy2.robjects.packages import importr
#utils = importr("densratio")
import rpy2.robjects.numpy2ri as numpy2ri
rpy2.robjects.numpy2ri.activate()

robjects.r('''
           f <- function(t,t1) {

                    library(densratio)
                    dens <- densratio(x = t, y = t1, method = "KLIEP")
                    result <-dens$compute_density_ratio(t1)
            }
            ''')
kliep_check = robjects.globalenv['f']

In [6]:
''' KLIEP function , return the noise rate'''
def KLIEP_e(X,Y):
    e_list=[]
    X_rho_po= X[Y ==2]
    X_rho_ne= X[Y ==4]
    result_po= kliep_check(X_rho_po,X)
    result_ne= kliep_check(X_rho_ne,X)
    Po = np.asarray(result_po)
    Ne = np.asarray(result_ne)
    n = len(X)
    py_po = sum(1 for i in Y if i==2)/n
    py_ne = sum(1 for i in Y if i==4)/n
    Po_n = min(Po*py_po)
    Ne_n = min(Ne*py_ne)
    e_list.append([Ne_n,Po_n])
    return e_list
    

In [11]:
''' Estimate noise rate using KLIEP method'''
skf = StratifiedKFold(n_splits=5)
for i in Y_new_list:
    acc = []
    m =[]
    sd =[]
    Y = i[0]
    X = X_train
    for train_index,test_index in skf.split(X,Y):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        a = KLIEP_e(x_train,y_train)
        acc.append(a)
    m = np.mean(acc,axis=0)
    sd = np.std(acc,axis=0)
    print(i[1],i[2],m,sd)

0.1 0.3 [[ 0.09418433  0.18962609]] [[ 0.01598867  0.10528061]]
0.2 0.1 [[ 0.17022241  0.06905786]] [[ 0.01978758  0.03397812]]
0.3 0.4 [[ 0.28357128  0.38537659]] [[ 0.01419535  0.04565845]]
0.4 0.1 [[ 0.47546262  0.03230297]] [[ 0.00432226  0.03469755]]
0.4 0.4 [[ 0.40746121  0.34287514]] [[ 0.0381137   0.04147036]]


In [19]:
'''Estimat noise rate using cross validation'''

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l2")
skf1 = StratifiedKFold(n_splits=5)

for i in Y_new_list:
    Y = i[0]
    X = X_train
    rho_ne_tuned = 0
    rho_po_tuned = 0
    
    li=[]
    rho_po_list = [0.25,0.15,0.49,0.2,0.3]
    rho_ne_list = [0.35,0.1,0.3,0.25,0.15]

    for train_index,test_index in skf1.split(X,Y):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        accurate = 0
        accurate_new=0
        for z in rho_po_list:
            for j in rho_ne_list:
                alpha = float(1 - z + j) / 2
                sample_weight = (1-alpha)*np.ones(np.shape(y_train)) 
                sample_weight[y_train==0] = alpha
                clf.fit(x_train,y_train,sample_weight=sample_weight)
                pred = clf.predict(x_test)
                a = accuracy_score(y_test,pred)
                accurate_new =a
                if accurate_new > accurate:
                    accurate = accurate_new
                    rho_po_tuned= z
                    rho_ne_tuned= j
        li.append([rho_po_tuned,rho_ne_tuned])
        
    print(i[1],i[2],np.mean(li,axis=0,dtype=np.float64),np.std(li,axis=0,dtype=np.float64))
                

0.1 0.3 [ 0.25  0.3 ] [ 0.   0.1]
0.2 0.1 [ 0.23  0.35] [ 0.04  0.  ]
0.3 0.4 [ 0.278  0.3  ] [ 0.11285389  0.1       ]
0.4 0.1 [ 0.25  0.35] [ 0.  0.]
0.4 0.4 [ 0.25  0.35] [ 0.  0.]
