In [None]:
%matplotlib inline
# The line above is a Jupyter notebook magic command: it renders matplotlib figures inline.

import matplotlib
import matplotlib.pyplot as plt
from IPython import display

# 病理切片癌症檢測

In [None]:
import os
# Detect whether the code is running under IPython / Jupyter.
def is_in_ipython():
    """Return True when the process looks like an IPython or Jupyter session.

    The launcher name is taken from the basename of the ``_`` environment
    variable and checked for the substrings ``jupyter-notebook``, ``ipython``
    and ``jupyter``. Independently of that, the presence of the
    ``JPY_PARENT_PID`` environment variable also counts as a match.
    """
    launcher = os.path.basename(os.getenv('_', ''))
    known_launchers = ('jupyter-notebook', 'ipython', 'jupyter')

    if any(marker in launcher for marker in known_launchers):
        return True
    # Fall back to the environment variable checked for ipython-notebook.
    return 'JPY_PARENT_PID' in os.environ


# Detect whether the code is running on Google Colab.
def is_in_colab():
    """Return True when running in IPython/Jupyter and ``google.colab`` is importable.

    Returns False immediately when ``is_in_ipython()`` is False. Otherwise the
    ``google.colab`` package is imported purely as a probe; only an import
    failure is treated as "not on Colab", so unrelated errors (for example a
    KeyboardInterrupt) are no longer silently swallowed.
    """
    if not is_in_ipython():
        return False
    try:
        from google import colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    return True

# Detect whether the code is running in a Kaggle kernel.
def is_in_kaggle_kernal():
    """Return True when the ``PYTHONPATH`` environment variable contains 'kaggle'.

    ``PYTHONPATH`` is read with a default of ``''`` so that environments where
    it is not set (which would otherwise raise ``KeyError``) simply report
    False instead of crashing the caller.
    """
    return 'kaggle' in os.environ.get('PYTHONPATH', '')

# On Colab, mount Google Drive at /content/gdrive; the Colab TRIDENT_HOME path
# set later in this notebook points below that mount point.
if is_in_colab():
    from google.colab import drive
    drive.mount('/content/gdrive')

In [None]:
# Select PyTorch as the trident backend
# (presumably has to be set before `import trident` runs -- TODO confirm).
os.environ['TRIDENT_BACKEND'] = 'pytorch'

# Pick the trident home directory per platform; left unset on other platforms.
if is_in_kaggle_kernal():
    os.environ['TRIDENT_HOME'] = './trident'
elif is_in_colab():
    os.environ['TRIDENT_HOME'] = '/content/gdrive/My Drive/trident'

# Remove any installed tridentx first, then install the bundled wheel,
# to make sure the intended (latest) version is the one in use.
!pip uninstall tridentx -y
!pip install ../input/trident/tridentx-0.7.3.22-py3-none-any.whl --upgrade

#調用trident api
import trident as T
from trident import *
from trident.backend.opencv_backend import *
from trident.models import resnet,efficientnet

In [None]:
import glob
# Use glob to collect every .tif image in the train folder, then report how many were found.
imgs=glob.glob('../input/histopathologic-cancer-detection/train/*.tif')
print(len(imgs))


In [None]:
# Inspect one image: image2array turns an image (or an image path) into a numpy array,
# and array2image converts the array back into an image (a Pillow bitmap).
print(image2array(imgs[0]).shape)
array2image(image2array(imgs[0]))

接下來要讀取帶有標籤的train_labels.csv。編碼為何用'utf-8-sig'而不是'utf-8'呢？因為Windows上的許多工具在存成UTF-8時會在檔案開頭加上BOM，而Linux上通常不會；'utf-8-sig'在讀取時會自動略過BOM，檔案沒有BOM時也能正常讀取，所以建議用'utf-8-sig'，兩種檔案都相容，比較不會誤觸地雷。讀入後逐列切割出圖檔編號與標籤，目前標籤值只有0與1，這是一個二元分類的題目。

In [None]:
# Build (or reload) the train / held-out split of the labelled images.
if os.path.exists('./sample.pkl'):
    # A cached split exists: reuse it so every run works on the same partition.
    # NOTE(review): unpickle is called here before the `from trident.data.utils import *`
    # in the other branch runs, so it must already be in scope from the earlier
    # trident imports -- confirm. Only load a cache file written by this notebook:
    # unpickling untrusted data can execute arbitrary code.
    image_path,labels,test_image_path,test_labels=unpickle('./sample.pkl')
else:
    # 'utf-8-sig' skips a leading BOM when there is one; the with-block makes sure
    # the file handle is closed once the rows have been read.
    with open('../input/histopathologic-cancer-detection/train_labels.csv','r',encoding='utf-8-sig') as f:
        data=f.readlines()
    print(data[:3])
    image_path=[]
    labels=[]
    test_image_path=[]
    test_labels=[]
    data=data[1:]  # drop the header row
    random.shuffle(data)  # shuffle the rows
    for row in data:
        row=row.strip()  # remove the trailing newline
        if not row:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on cols[1].
            continue
        cols=row.split(',')  # columns: image id, label
        sample_path='../input/histopathologic-cancer-detection/train/{0}.tif'.format(cols[0])
        sample_label=int(cols[1])
        # Each row goes to the held-out set when the random draw is <= 0.3, otherwise to training.
        if random.random()<=0.3:
            test_image_path.append(sample_path)
            test_labels.append(sample_label)
        else:
            image_path.append(sample_path)
            labels.append(sample_label)
    print(len(image_path))
    print(len(labels))
    print(len(test_image_path))
    print(len(test_labels))


    from trident.data.utils import *
    # Cache the split so later runs take the branch above.
    pickle_it('./sample.pkl',[image_path,labels,test_image_path,test_labels])

接下來就是組裝出要使用來建模的dataset，基本上DataProvider是提供數據的接口，裡面有兩組Iterator，分別是TrainData與TestData，而Iterator控制內部的各個資料集(Dataset)如何構成批次的數據。

In [None]:
# Datasets: images and labels for training ...
ds1=ImageDataset(image_path,symbol='image')
ds2=LabelDataset(labels,symbol='label')

# ... and the same pair for the held-out (test) split.
ds1_t=ImageDataset(test_image_path,symbol='image')
ds2_t=LabelDataset(test_labels,symbol='label')

# Wrap each image/label pair in an Iterator and combine both into the data provider.
data_provider=DataProvider(traindata=Iterator(data=ds1,label=ds2),testdata=Iterator(data=ds1_t,label=ds2_t))

# Configure the DataProvider's image preprocessing pipeline.
data_provider.image_transform_funcs=[Normalize(127.5,127.5)]

# Setup is complete: call next() to check that data is emitted properly
# and that the signature of the output data was generated correctly.
img_data,label_data=data_provider.next()
print(data_provider.signature)
print(img_data.shape)
print(label_data.shape)

接下來示範一下兩個我個人很喜歡的小功能。

In [None]:
# Switch the provider to batches of 16, then preview the images it yields
# (presumably with image_transform_funcs applied -- TODO confirm).
data_provider.batch_size=16
data_provider.preview_images()

In [None]:
# Register Traditional Chinese ('zh-TW') display names for the two labels:
# 0 -> 陰性 (negative), 1 -> 陽性 (positive).
data_provider.class_names['zh-TW']={0:'陰性',1:'陽性'}
# Show statistics of the labels (see trident's label_statistics for the exact output).
data_provider.label_statistics()


然後我們可以透過preview_images()函數來檢閱一下圖片經過數據增強步驟之後的樣子

我們可以在data_provider.image_transform_funcs=[Normalize(127.5,127.5)]直接加入各種視覺處理的transform，來達到數據增強的目的，例如我加入常見的明暗變化、飽和度變化以及色相變化，同時加入隨機擦除以及加入胡椒鹽噪音

In [None]:
# Replace the preprocessing pipeline with one that adds data augmentation
# before resizing and normalizing.
data_provider.image_transform_funcs=[
    RandomAdjustGamma(scale=(0.6,1.4)),# adjust brightness
    RandomAdjustHue(scale=(-0.2,0.2)),# adjust hue
    RandomAdjustSaturation(scale=(0.6,1.4)),# adjust saturation
    SaltPepperNoise(0.1),# add salt-and-pepper noise
    RandomErasing(), # add random erasing
    Resize((112,112),True), # resize
    Normalize(127.5,127.5)] # normalize
# Pull one batch to check the resulting shapes, then preview the augmented images.
img_data,label_data=data_provider.next()
print(img_data.shape)
print(label_data.shape)
data_provider.preview_images()

In [None]:
from trident.models import efficientnet
# Baseline network: EfficientNetB0 built with pretrained=True, freeze_features=True,
# a (3,112,112) input and 2 output classes.
net1=efficientnet.EfficientNetB0(pretrained=True,include_top=True,freeze_features=True,input_shape=(3,112,112),classes=2)
net1.summary()
# Add noise on the output layer so the model does not settle on ambiguous decisions.
net1.model[-1].add_noise=True
net1.model[-1].noise_intensity=0.12

In [None]:
def _build_noisy_efficientnet_b0():
    """Create an EfficientNetB0 (pretrained, features frozen, 2 classes) with output noise enabled.

    Uses the same constructor arguments and the same noise settings on the
    last layer (add_noise=True, noise_intensity=0.12) for every challenger.
    """
    network=efficientnet.EfficientNetB0(pretrained=True,include_top=True,freeze_features=True,input_shape=(3,112,112),classes=2)
    head=network.model[-1]
    head.add_noise=True
    head.noise_intensity=0.12
    return network


# Challenger 1: built first and summarised once its noise settings are in place.
net2=_build_noisy_efficientnet_b0()
net2.summary()

# Challenger 2: same architecture and noise settings, no summary printed.
net3=_build_noisy_efficientnet_b0()




In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
def auc(output,target):
    """Metric: ROC AUC of the predicted score against ``target``.

    Score selection:
      * output with more than one dimension and a last dimension > 1:
        column 1 of ``exp(output)``;
      * output with more than one dimension and a last dimension == 1:
        column 0 of ``exp(output)``;
      * anything else: ``output`` itself.

    NOTE(review): the use of exp() suggests ``output`` holds log-probabilities
    -- confirm against the model head.
    NOTE(review): this definition shadows the ``auc`` imported from
    sklearn.metrics in the same cell; the name is kept because the model
    configuration refers to it.
    """
    score_column=None
    if ndim(output)>1:
        last_dim=int_shape(output)[-1]
        if last_dim>1:
            score_column=1
        elif last_dim==1:
            score_column=0

    if score_column is None:
        scores=to_numpy(output)
    else:
        scores=to_numpy(exp(output))[:,score_column]

    truth=to_numpy(target)
    return roc_auc_score(truth, scores)



def draw_roc(training_context):
    """Training callback: plot the ROC curve of the current training batch.

    Only draws when ``training_context['steps']`` is 10 or when ``steps + 1``
    is a multiple of 100. Targets and outputs are looked up in
    ``training_context['train_data']`` through the 'target' / 'output' keys of
    ``training_context['data_feed']``. Scores are selected the same way as in
    the ``auc`` metric (column 1 or 0 of ``exp(output)`` for 2-D outputs,
    otherwise the raw output).

    A batch whose targets contain fewer than two distinct classes is skipped:
    the ROC curve is undefined there, and a plotting callback should not be
    able to interrupt training.
    """
    steps=training_context['steps']
    if not (steps==10 or (steps+1)%100==0):
        return

    traindata=training_context['train_data']
    data_feed=training_context['data_feed']
    target_np=to_numpy(traindata[data_feed['target']])
    output=traindata[data_feed['output']]
    if ndim(output)>1 and int_shape(output)[-1]>1 :
        output_np=to_numpy(exp(output))[:,1]
    elif ndim(output)>1 and int_shape(output)[-1]==1 :
        output_np=to_numpy(exp(output))[:,0]
    else:
        output_np=to_numpy(output)

    # assumes to_numpy returns a numpy array (ravel/tolist available) -- TODO confirm
    if len(set(target_np.ravel().tolist()))<2:
        return

    fpr, tpr,_=roc_curve(target_np, output_np)
    # Compute the area once, before any figure is opened.
    area=roc_auc_score(target_np, output_np)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='area = {:.3f}'.format(area))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Baseline: Adam optimizer (lr=1e-3) with CrossEntropyLoss.
# Metrics are accuracy, recall and the notebook's own auc(); an 'l2' regularizer (1e-5) is attached.
(
    net1
    .with_optimizer(Adam,lr=1e-3)
    .with_loss(CrossEntropyLoss)
    .with_metric(accuracy,ignore_index=0)
    .with_metric(recall,ignore_index=0)
    .with_metric(auc)
    .with_regularizer('l2',1e-5)
    .with_model_save_path('./Models/eff0_1.pth')
    # draw_roc is triggered on every batch end and decides itself when to plot.
    .trigger_when('on_batch_end',frequency=1,action=draw_roc)
    # Unfreeze scheduling for module 'block7a' at 200 batches.
    .unfreeze_model_scheduling(200,unit='batch',module_name='block7a')
)


# Challenger 1: DiffGrad optimizer with gradient accumulation;
# automatic mixed precision training is enabled as well.
(
    net2
    .with_optimizer(DiffGrad,lr=1e-3,gradient_centralization='all')
    .with_loss(CrossEntropyLoss)
    .with_metric(accuracy,ignore_index=0)
    .with_metric(recall,ignore_index=0)
    .with_metric(auc)
    .with_regularizer('l2',1e-5)
    .with_model_save_path('./Models/eff0_2.pth')
    .trigger_when('on_batch_end',frequency=1,action=draw_roc)
    # Gradient accumulation, configured with 5.
    .with_accumulate_grads(5)
    .unfreeze_model_scheduling(200,unit='batch',module_name='block7a')
    .with_automatic_mixed_precision_training()
)



# Challenger 2: DiffGrad optimizer with gradient accumulation, the advanced
# CrossEntropyLoss options (auto_balance, label_smooth) and an additional F1ScoreLoss.
(
    net3
    .with_optimizer(DiffGrad,lr=1e-3,gradient_centralization='all')
    .with_loss(CrossEntropyLoss(auto_balance=True,label_smooth=True))
    .with_loss(F1ScoreLoss(auto_balance=True))
    .with_metric(accuracy,ignore_index=0)
    .with_metric(recall,ignore_index=0)
    .with_metric(auc)
    .with_regularizer('l2',1e-5)
    .with_model_save_path('./Models/eff0_3.pth')
    .trigger_when('on_batch_end',frequency=1,action=draw_roc)
    .unfreeze_model_scheduling(200,unit='batch',module_name='block7a')
    .with_accumulate_grads(5)
    .with_automatic_mixed_precision_training()
)




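# Optional: resume from previously saved checkpoints (uncomment to enable).
#if os.path.exists('./Models/eff0_1.pth'):
#    net1.load_model('./Models/eff0_1.pth')
#if os.path.exists('./Models/eff0_2.pth'):
#    net2.load_model('./Models/eff0_2.pth')
#if os.path.exists('./Models/eff0_3.pth'):
#    net3.load_model('./Models/eff0_3.pth')
# NOTE: net4 is not defined in the visible part of this notebook; define it before enabling the lines below.
#if os.path.exists('./Models/eff0_4.pth'):
#    net4.load_model('./Models/eff0_4.pth')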


In [None]:

# Training plan: the three models are registered side by side on the same data provider.
plan=(
    TrainingPlan()
    .add_training_item(net1,name='net1')
    .add_training_item(net2,name='net2')
    .add_training_item(net3,name='net3')
    .with_data_loader(data_provider)
    .with_batch_size(128)
    .repeat_epochs(5)
    # Scheduling for evaluation, gradient/progress printing, curve display and model saving.
    .out_sample_evaluation_scheduling(100)
    .print_gradients_scheduling(100,unit='batch')
    .print_progress_scheduling(10,unit='batch')
    .display_loss_metric_curve_scheduling(200)
    .save_model_scheduling(50,unit='batch')
)


In [None]:
# Start running the training plan built in the previous cell.
plan.start_now()