In [None]:
%matplotlib inline
# The line above is a Jupyter notebook magic command (renders matplotlib figures inline).

import matplotlib
import matplotlib.pyplot as plt
from IPython import display

In [None]:
import os
# Detect whether the code is running under IPython / Jupyter.
def is_in_ipython():
    """Is the code running in the ipython environment (jupyter including).

    Returns:
        bool: True when the basename of the ``_`` environment variable
        contains ``jupyter-notebook``, ``ipython`` or ``jupyter``, or when
        ``JPY_PARENT_PID`` is present in the environment; False otherwise.
    """
    launcher = os.path.basename(os.getenv('_', ''))
    launcher_markers = ('jupyter-notebook', 'ipython', 'jupyter')

    # First look at the launching program's name ...
    if any(marker in launcher for marker in launcher_markers):
        return True
    # ... then fall back to the environment variable checked for ipython-notebook.
    return 'JPY_PARENT_PID' in os.environ


# Detect whether the notebook is running on Google Colab.
def is_in_colab():
    """Return True when the code runs inside Google Colab.

    The check first requires an IPython/Jupyter environment (via
    ``is_in_ipython``) and then probes whether ``google.colab`` can be
    imported.

    Returns:
        bool: True if ``google.colab`` is importable from an IPython
        environment, False otherwise.
    """
    if not is_in_ipython():
        return False
    try:
        # Imported only to probe availability; the module itself is not used here.
        from google import colab  # noqa: F401
    except ImportError:
        # Only a failed import means "not Colab"; KeyboardInterrupt/SystemExit
        # are no longer swallowed as they were with a bare ``except``.
        return False
    return True

# Detect whether the notebook is running inside a Kaggle kernel.
def is_in_kaggle_kernal():
    """Return True when ``PYTHONPATH`` mentions ``kaggle``.

    Returns:
        bool: True if the ``PYTHONPATH`` environment variable contains the
        substring ``kaggle``; False otherwise, including when ``PYTHONPATH``
        is not set at all (the previous direct lookup raised KeyError then).
    """
    return 'kaggle' in os.environ.get('PYTHONPATH', '')

# When running on Colab, mount Google Drive at /content/gdrive.
if is_in_colab():
    from google.colab import drive
    drive.mount('/content/gdrive')

In [None]:
# Select the trident backend through an environment variable; this is set before trident is imported further down.
os.environ['TRIDENT_BACKEND'] = 'pytorch'

# Pick TRIDENT_HOME per platform; it is left unset when neither check matches.
if is_in_kaggle_kernal():
    os.environ['TRIDENT_HOME'] = './trident'
    
elif is_in_colab():
    os.environ['TRIDENT_HOME'] = '/content/gdrive/My Drive/trident'

# Reinstall to make sure the latest version is installed
!pip uninstall tridentx -y
!pip install tridentx --upgrade

import copy
import numpy as np
#調用trident api
import trident as T
from trident import *
from trident.models import resnet,efficientnet

In [None]:
import glob
# Imported explicitly: the original cell used `random` without importing it,
# presumably relying on `from trident import *` to provide the name.
import random

# Use glob to collect every available image in the train folder.
imgs=glob.glob('../input/histopathologic-cancer-detection/train/*.tif')
print(len(imgs))

# Read the label file inside a context manager so the handle is always closed.
with open('../input/histopathologic-cancer-detection/train_labels.csv','r',encoding='utf-8-sig') as f:
    rows=f.readlines()
print(rows[:3])
image_path=[]
labels=[]
test_image_path=[]
test_labels=[]

rows=rows[1:]  # drop the header row
random.shuffle(rows)  # shuffle randomly
for row in rows:
    cols=row.strip().split(',')  # strip the trailing newline, then split on commas
    if len(cols)<2:
        # Skip blank or malformed rows instead of failing with IndexError on cols[1].
        continue
    path='../input/histopathologic-cancer-detection/train/{0}.tif'.format(cols[0])
    # Roughly 30% of the rows are held out as the test split.
    if random.random()<=0.3:
        test_image_path.append(path)
        test_labels.append(int(cols[1]))
    else:
        image_path.append(path)
        labels.append(int(cols[1]))
print(len(image_path))
print(len(labels))
print(len(test_image_path))
print(len(test_labels))


In [None]:
# Datasets: image paths and labels for the training split, then for the held-out split.
ds1=ImageDataset(image_path,symbol='image')
ds2=LabelDataset(labels,symbol='label')

ds1_t=ImageDataset(test_image_path,symbol='image')
ds2_t=LabelDataset(test_labels,symbol='label')

# Combine the datasets with Iterator to build the data provider
data_provider=DataProvider(traindata=Iterator(data=ds1,label=ds2),testdata=Iterator(data=ds1_t,label=ds2_t))

# Set the DataProvider preprocessing pipeline
data_provider.image_transform_funcs=[Normalize(127.5,127.5)]

# Setup is complete; call next() to confirm that data is emitted correctly and that the output signature is generated properly
img_data,label_data=data_provider.next()
print(data_provider.signature)
print(img_data.shape)
print(label_data.shape)

# Replace the pipeline with an augmented one for training.
data_provider.image_transform_funcs=[
    RandomAdjustGamma(scale=(0.6,1.4)),# adjust brightness
    RandomAdjustHue(scale=(-0.5,0.5)),# adjust hue
    RandomAdjustSaturation(scale=(0.6,1.4)),# adjust saturation
    SaltPepperNoise(0.05),# add salt-and-pepper noise
    RandomErasing(), # add random erasing
    Normalize(127.5,127.5)] # normalization

# The test set does not need data augmentation
data_provider.testdata.data.image_transform_funcs=[
    Normalize(127.5,127.5)] # normalization

# Draw one training batch and one test batch to check their shapes, then preview images.
img_data,label_data=data_provider.next()
print(img_data.shape)
print(label_data.shape)
test_img_data,test_label_data=data_provider.next_test()
print(test_img_data.shape)
print(test_label_data.shape)
data_provider.preview_images()

In [None]:
from trident.models import efficientnet,densenet,resnet
# net1: EfficientNetB0 built with pretrained=True, its own top (include_top=True) and freeze_features=True, for 2 classes on 3x96x96 input.
net1=efficientnet.EfficientNetB0(pretrained=True,include_top=True,freeze_features=True,input_shape=(3,96,96),classes=2)
# Enable noise on the last module of the model.
# NOTE(review): presumably the last module is the softmax output layer (net2/net3 pass the same options to SoftMax) — confirm.
net1.model[-1].add_noise=True
net1.model[-1].noise_intensity=0.12


如果你仔細閱讀競賽規則，裡面對圖像的標註有明確的說明：每張96x96的圖像所謂的陽性，是指在圖像中心的32x32區域中有癌症細胞，在此區域外即使有癌症細胞也算是陰性。這個規則可以說是頗奇葩，但我在討論區中看起來沒有太多人討論這個，只有少部分的人說他們改成切出中心32x32區域後跑出來的結果很爛，其實不用做就知道會很爛。  
- 現在預訓練模型基本上都是基於224x224，32x32這尺寸太小  
- 直接使用32x32，周圍區域會被0填滿的行為所影響，若改其他padding又會讓預訓練模型失效  
- 如果切32x32，很多看起來是白白的或純黑的一片，因為它可能位於一個大的腫瘤內部，反而喪失了識別能力，它需要外圍圖像的上下文信息。  

所以為了解決這問題，我自己設計了一個自定義層CustomCropFlatten。傳統上從卷積層切換至全連接層，多半是使用Flatten攤平，但是在這題中，這樣做就會把32x32跟外圍區域的信息混在一起。由於我希望鎖定32x32區域有無腫瘤，但我又要上下文信息，所以我首先檢查一下如果我用96x96的圖片丟到預訓練模型，到底圖片會縮多小。由於32是96的三分之一，所以我想要找到會把它縮成6x6的位置，這樣六個像素分別為  \[padding區\]-\[周圍區\]-\[觀測區\]-\[觀測區\]-\[周圍區\]-\[padding區\]。

首先我假設padding效果應該不至於占掉左右總和1/3，所以上述的padding區是我要丟棄的，而周圍區才是我關心的上下文信息。觀測區已經從32x32變成2x2，所以這個自定義層先把2x2區域挖出來攤平，接著把觀測區的上下左右四塊周圍區取出來，每塊各自取逐通道的極大值(取出特徵的極大值)，等於空間上這是四個點而非一大塊面積，這樣在重要性上會被抑制，但是特徵還是可以供後續分類器參考。

In [None]:
class CustomCropFlatten(Layer):
    """Flatten the central 2x2 cells of a feature map plus four border summaries.

    ``forward`` indexes its input with four axes. It keeps positions 2:4 on
    the last two axes (the "target" area) and, for index 1 and index 4 of each
    of those two axes (restricted to positions 1:5 on the other one), takes
    the maximum along the strip. The target values and the four strip maxima
    are concatenated per channel and reshaped to ``(batch, -1)``.

    The original in-code note gives the expected input as (None, 112, 6, 6).
    """
    def __init__(self,name='CustomCropFlatten'):
        super(CustomCropFlatten, self).__init__()
        self.name = name

    def forward(self, x, **kwargs):
        # Expected input per the original note: (None, 112, 6, 6); target area is 2x2.
        center = x[:, :, 2:4, 2:4]
        batch, channels, _, _ = int_shape(center)
        # Border strips, in the same order as before: index 1 and index 4 on
        # axis 2, then index 1 and index 4 on axis 3.
        strips = (
            x[:, :, 1, 1:5],
            x[:, :, 4, 1:5],
            x[:, :, 1:5, 1],
            x[:, :, 1:5, 4],
        )
        # One maximum per strip, stacked on a new trailing axis.
        border = stack([reduce_max(strip, axis=-1) for strip in strips], axis=-1)
        center = reshape(center, (batch, channels, -1))
        merged = concate([center, border], axis=-1)
        return reshape(merged, (batch, -1))

# net2: EfficientNetB0 without its top (include_top=False), pretrained with frozen features.
net2 = efficientnet.EfficientNetB0(
    pretrained=True,
    include_top=False,
    freeze_features=True,
    input_shape=(3, 96, 96),
    classes=2,
)
# Remove the last six modules of the model, one at a time.
for _removed in range(6):
    net2.model.remove_at(-1)
# New head: custom crop/flatten layer, a 2-unit dense layer, then a noisy softmax.
net2.model.add_module('custom', CustomCropFlatten())
net2.model.add_module('fc', Dense(2, activation=None))
net2.model.add_module('softmax', SoftMax(-1, add_noise=True, noise_intensity=0.12))
net2.summary()


In [None]:
# net3: ResNet50 without its top (include_top=False), pretrained with frozen features.
net3=resnet.ResNet50(pretrained=True,include_top=False,freeze_features=True,input_shape=(3,96,96),classes=2)
# Remove the last module of the model before attaching the custom head.
net3.model.remove_at(-1)
# Head: custom crop/flatten layer, a 2-unit dense layer, then a softmax with noise enabled.
# NOTE(review): CustomCropFlatten indexes positions up to 4 on the last two axes, so it assumes the
# feature map at this point is at least 5x5 (its own note says 6x6) — confirm with the summary below.
net3.model.add_module('custom',CustomCropFlatten())
net3.model.add_module('fc',Dense(2,activation=None))
net3.model.add_module('softmax',SoftMax(axis=-1,add_noise=True,noise_intensity=0.12))
net3.summary()

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
# make a prediction

def auc(output,target):
    """Metric: ROC AUC of the class-1 score against the target labels.

    ``exp`` is applied to ``output`` before column 1 is taken.
    NOTE(review): presumably because the model emits log-probabilities while
    training — confirm. This function also shadows the ``auc`` name imported
    from ``sklearn.metrics`` in the same cell.

    Args:
        output: model output; column 1 is used as the positive-class score.
        target: ground-truth labels.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    positive_score = to_numpy(exp(output))[:, 1]
    return roc_auc_score(to_numpy(target), positive_score)



In [None]:
def punishment(output,target):
    """Auxiliary loss penalising a small gap between positive and negative scores.

    Computes ``1 - clip(mean_pos - mean_neg, 0, 1)``, where ``mean_pos`` and
    ``mean_neg`` are the means of ``exp(output)[:, 1]`` over the samples whose
    target is 1 and 0 respectively.

    NOTE(review): if a batch contains no positive (or no negative) sample, a
    mean is taken over an empty selection; verify how the backend handles it.
    """
    # Replace abnormal values (as flagged by is_abnormal_number) with zeros.
    cleaned = where(is_abnormal_number(output), zeros_like(output), output.copy())
    positive_score = exp(cleaned)[:, 1]
    gap = positive_score[target == 1].mean() - positive_score[target == 0].mean()
    return 1 - clip(gap, 0.0, 1.0)
    

在這邊我們的cross entropy加上了`auto_balance`的選項，這是我的api一個比較獨特的功能，它會自動計算各標籤的樣本分布(就像之前介紹過的label statistics)，然後根據各類別樣本數量的差異，進行自動的平衡權重計算，自動讓所有類別的影響力均等。至於label smooth則是要解決答案非1即0的問題，隨機把1變成0.9~1，這樣有助於緩解相似圖像卻必須被softmax分很開的限制。  

此外我們還加入了F1ScoreLoss，從公式上看來它融合了精確率(precision)與召回率(recall)，其中的beta引數，當beta>1，則模型會更關注召回率，若是小於1則更關注精確率，所以我打算先不動這個數字讓它跑一下，到時看精確率與召回率數字再決定如何調整它。

    f1 score = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)


In [None]:
def configure_challenger(net,save_path,unfreeze_module):
    """Attach the shared training configuration to one challenger model.

    All three challengers use the same optimizer (AdaBelief), losses, metrics,
    regularizer and learning-rate callback; only the save path and the module
    named in ``unfreeze_model_scheduling`` differ. The earlier per-model
    comments mentioned DiffGrad and gradient accumulation, neither of which
    appears in the code.

    Args:
        net: the model to configure.
        save_path: path passed to ``with_model_save_path``.
        unfreeze_module: module name passed to ``unfreeze_model_scheduling``.

    Returns:
        Whatever the final chained call returns.
    """
    # Parentheses replace backslash continuations: a trailing "\" followed by a
    # blank line (as on the old net1 chain) is easy to break when editing.
    return (
        net.with_optimizer(AdaBelief,lr=2e-3,gradient_centralization='all')
        .with_loss(CrossEntropyLoss(auto_balance=True, label_smooth=True))
        .with_loss(F1ScoreLoss,loss_weight=0.5)
        .with_loss(punishment,loss_weight=0.01)
        .with_metric(accuracy,ignore_index=0)
        .with_metric(recall,ignore_index=0)
        .with_metric(auc)
        .with_regularizer('l2',1e-3)
        .with_model_save_path(save_path)
        .with_callbacks(CosineLR(max_lr=2e-3, min_lr=1e-7,period=5000))
        .unfreeze_model_scheduling(300,unit='batch',module_name=unfreeze_module)
    )


# challenger 1: EfficientNetB0 with its original top
configure_challenger(net1,'./Models/eff_1.pth','block7a')

# challenger 2: EfficientNetB0 with the CustomCropFlatten head
configure_challenger(net2,'./Models/eff_2.pth','block5c')

# challenger 3: ResNet50 with the CustomCropFlatten head
configure_challenger(net3,'./Models/resnet_1.pth','layer3.5')

In [None]:

# ROC-curve callback.
# NOTE(review): the original comments described this as out-of-sample validation on test data,
# but the batch read below comes from training_context['train_data'] — confirm which is intended.
def draw_roc(training_context):
    """Plot the ROC curve of the current model on the current batch.

    Runs only when ``training_context['steps']`` is 10 or when ``steps + 1``
    is a multiple of 100. The model is switched to inference mode just for the
    forward pass and is always switched back to training mode afterwards.
    Previously a batch containing nan/inf (or an exception during inference)
    left the model in eval mode.

    Args:
        training_context: dict-like object shared during training; the keys
            read here are 'steps', 'current_model' and 'train_data'
            (with 'image' and 'label' entries).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    steps=training_context['steps']
    if not (steps==10 or (steps+1)%100==0):
        return

    model=training_context['current_model']
    traindata=training_context['train_data']
    # Work on copies so the loss computation is not disturbed.
    input_data=traindata['image'].copy()
    target_data=traindata['label'].copy()

    # Skip the batch if the input contains nan or inf.
    if any_abnormal_number(input_data):
        return
    target_np=to_numpy(target_data)

    model.eval()  # switch to inference mode
    try:
        output=model(input_data)
    finally:
        model.train()  # always switch back to training mode

    # Replace abnormal output values with zeros.
    output=where(is_abnormal_number(output),zeros_like(output),output.copy())
    # Per the original note: in inference mode the softmax is a plain softmax (log_softmax
    # while training), so no further processing is needed, and noise/dropout are inactive.
    output_np=to_numpy(output)

    fpr, tpr,_=roc_curve(target_np, output_np[:,1])
    if any_abnormal_number(fpr) or any_abnormal_number(tpr):
        return
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='area = {:.3f}'.format(roc_auc_score(target_np, output_np[:,1])))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
        
# Register draw_roc on each of the three models for the 'on_batch_end' event with frequency=1.
for _model in (net1, net2, net3):
    _model.trigger_when('on_batch_end', frequency=1, action=draw_roc)

In [None]:
# Build one training plan that trains the three models on the shared data provider:
# batch size 256, one epoch, with the evaluation/printing/plotting/saving schedules given below.
plan=TrainingPlan()\
    .add_training_item(net1,name='net1')\
    .add_training_item(net2,name='net2')\
    .add_training_item(net3,name='net3')\
    .with_data_loader(data_provider)\
    .with_batch_size(256)\
    .repeat_epochs(1)\
    .out_sample_evaluation_scheduling(100)\
    .print_gradients_scheduling(100,unit='batch')\
    .print_progress_scheduling(10,unit='batch')\
    .display_loss_metric_curve_scheduling(200)\
    .save_model_scheduling(20,unit='batch')

In [None]:
# Start the training plan.
# NOTE(review): the keyword is spelled `collect_data_inteval` here — presumably this matches the
# parameter name in trident; confirm before "correcting" the spelling.
plan.start_now(collect_data_inteval=10)

然後我們示範一下如何產出要提交的內容，我們做兩個示範，第一種是我們根據效度指標看起來最高的模型為基礎，第二種是基於幾個模型的綜合評比

In [None]:
# Print the path of every file under the current directory.
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# The simplest way to retrieve your model
from IPython.display import FileLink
# NOTE(review): the save paths configured in this notebook are ./Models/eff_1.pth, ./Models/eff_2.pth
# and ./Models/resnet_1.pth; confirm that ./Models/net3.pth actually exists in the listing above.
FileLink('./Models/net3.pth')

In [None]:
# Peek at the sample submission; the context manager closes the file handle
# (it was previously left open).
with open('../input/histopathologic-cancer-detection/sample_submission.csv','r',encoding='utf-8-sig') as f:
    rows=f.readlines()
print(rows[:5])

In [None]:
from tqdm import tqdm
summit_imgs=glob.glob('../input/histopathologic-cancer-detection/test/*.tif')
best_model=net3
# Extremely important: switch to inference mode, otherwise all the work is wasted.
best_model.eval()
best_model.class_names=[]
best_model.preprocess_flow=[Normalize(127.5,127.5)]

results=OrderedDict()
submission_rows=['id,label\n']

for img_path in tqdm(summit_imgs):
    # Image id = file name without directory and extension.
    summit_key=os.path.splitext(os.path.basename(img_path))[0]
    infer_results=best_model.infer_single_image(img_path)
    results[summit_key]=infer_results[1].item()  # index 1 is the positive-class score
    submission_rows.append('{0},{1}\n'.format(summit_key,results[summit_key]))
    if len(submission_rows)<=10:
        print('submission_rows',submission_rows[-1],len(submission_rows))

print(len(results),len(submission_rows)-1)  # the two numbers should be equal

# Make sure the output folder exists; open(..., 'w') does not create directories.
os.makedirs('results',exist_ok=True)
with open('results/submission.csv','w',encoding='utf-8-sig') as f:
    f.writelines(submission_rows)

# Read the file back to check its first rows; the handle is closed afterwards.
with open('results/submission.csv','r',encoding='utf-8-sig') as fr:
    rows=fr.readlines()
print(rows[:3])

In [None]:


# Extremely important: switch every model to inference mode, otherwise all the work is wasted.
net1.eval()
net2.eval()
net3.eval()
net1.class_names=[]
net2.class_names=[]
net3.class_names=[]
net1.preprocess_flow=[Normalize(127.5,127.5)]
net2.preprocess_flow=[Normalize(127.5,127.5)]
net3.preprocess_flow=[Normalize(127.5,127.5)]


results=OrderedDict()
submission_rows=['id,label\n']

for img_path in tqdm(summit_imgs):
    # Image id = file name without directory and extension.
    summit_key=os.path.splitext(os.path.basename(img_path))[0]
    # Average the positive-class score (index 1) of the three models.
    infer_results=(
        net1.infer_single_image(img_path)[1]
        +net2.infer_single_image(img_path)[1]
        +net3.infer_single_image(img_path)[1]
    )/3.0
    results[summit_key]=infer_results.item()  # stored as key -> value
    submission_rows.append('{0},{1}\n'.format(summit_key,results[summit_key]))
    if len(submission_rows)<=10:
        print('submission_rows',submission_rows[-1],len(submission_rows))


print(len(results),len(submission_rows)-1)  # the two numbers should be equal

# Make sure the output folder exists; open(..., 'w') does not create directories.
os.makedirs('results',exist_ok=True)
with open('results/submission1.csv','w',encoding='utf-8-sig') as f:
    f.writelines(submission_rows)

# Read the file back to check its first rows; the handle is closed afterwards.
with open('results/submission1.csv','r',encoding='utf-8-sig') as fr:
    rows=fr.readlines()
print(rows[:3])

In [None]:
api_token= {"username":"your_username","key":"your_token"} #請換成你自己的kaggle認證#請換成你自己的kaggle認證
import json
import zipfile
import os
 
if not os.path.exists("/root/.kaggle"):
    os.makedirs("/root/.kaggle")
 
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json
 
if not os.path.exists("/kaggle"):
    os.makedirs("/kaggle")
!kaggle competitions submit -c histopathologic-cancer-detection -f 'results/submission1.csv' -m 'Resnet50+/EfficientNetB0 ensembles custom layer for 32x32 handling'