In [None]:
# Competition: https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

import os
import sys
import gc
import math
import pickle
import random
import time
import psutil
import pytz
from datetime import datetime
from collections import defaultdict
from contextlib import contextmanager

import warnings
warnings.filterwarnings('ignore')  # warnings.filterwarnings(action='once')

from tqdm import tqdm, tqdm_notebook

import numpy as np
import pandas as pd
_ = np.seterr(divide='ignore', invalid='ignore')

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns

# Display an image directly in a cell (supports jpg, png, jpeg, etc.), e.g. Image('./2.JPG')
from IPython.display import Image  

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow import keras

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def show_process_mem_usage(info_str=''):
    """Print the memory currently used by this process.

    The resident set size of the running process is printed in the largest
    binary unit (GB, MB, KB, or plain bytes) that it reaches, followed by the
    system-wide virtual memory usage percentage and a timestamp in the
    Asia/Shanghai time zone.

    Args:
        info_str: Free-form prefix placed at the start of the printed line.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    system_percent = psutil.virtual_memory().percent

    stamp = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d %H:%M:%S")

    # Try the units from largest to smallest; fall through to raw bytes.
    for unit_name, unit_size in (('GB', 2.**30), ('MB', 2.**20), ('KB', 2.**10)):
        if rss_bytes >= unit_size:
            amount = f'{rss_bytes/unit_size:.3f} {unit_name}'
            break
    else:
        amount = f'{rss_bytes} B'

    print(f'{info_str} current process memory usage: {amount}, percentage: {system_percent:.2f}% 【{stamp}】')

def logging(*info, file_name='./running_log.txt'):
    """Append one line to a text log file.

    Every positional argument is converted with ``str`` and the pieces are
    joined with single spaces; a newline is appended and the result is
    written at the end of ``file_name`` (the file is created if missing).

    Args:
        *info: Values to log on a single line.
        file_name: Path of the log file to append to.
    """
    log_line = ' '.join(str(item) for item in info)
    # Explicit UTF-8: the default text encoding depends on the platform locale,
    # and messages in this notebook contain non-ASCII characters.
    with open(file_name, 'a', encoding='utf-8') as log_file:
        log_file.write(log_line + '\n')

@contextmanager
def trace(trace_msg):
    """Context manager that reports memory change and elapsed time of its body.

    On normal exit of the ``with`` body it prints one line to stdout holding
    the process memory after the body (in GB), the signed change since entry,
    the elapsed wall-clock seconds, ``trace_msg`` and an Asia/Shanghai
    timestamp. There is no ``try``/``finally`` around the ``yield``, so nothing
    is printed when the body raises.

    Args:
        trace_msg: Label for the report line; converted with ``str``.
    """
    started_at = time.time()
    proc = psutil.Process(os.getpid())
    gib = 2. ** 30
    mem_before = proc.memory_info()[0] / gib
    yield
    mem_after = proc.memory_info()[0] / gib

    # Sign and magnitude are formatted separately.
    change = mem_after - mem_before
    if change >= 0:
        sign = '+'
    else:
        sign = '-'
    magnitude = math.fabs(change)
    label = str(trace_msg)

    stamp = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{mem_after:.3f}GB({sign}{magnitude:.3f}GB):{time.time() - started_at:.3f}sec] {label} 【{stamp}】", file=sys.stdout)
    
def seed_all(random_seed=42):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random``, NumPy's legacy global RNG, TensorFlow and
    PyTorch (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    # NOTE: hash randomization of the already-running interpreter is fixed at
    # startup; setting the variable here only affects child processes.
    os.environ['PYTHONHASHSEED'] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)
    torch.manual_seed(random_seed)
    # Seed every visible GPU rather than only the current device.
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different algorithms between runs.
    torch.backends.cudnn.benchmark = False

def keepbusy(num=10000):
    """Print a heartbeat line once a minute to keep the session active.

    Each iteration prints the iteration index and the seconds elapsed since
    the call started (via ``ftpt``), then sleeps for 60 seconds.

    Args:
        num: Number of heartbeat iterations to run.
    """
    began = time.time()
    for tick in range(num):
        elapsed = time.time() - began
        ftpt(f'i: {tick}, taken time: {elapsed:.7f}')
        time.sleep(60)

def ftpt(msg = 'having run this cell'):  # foot_print
    """Print ``msg`` prefixed with the current Asia/Shanghai date and time.

    Args:
        msg: Text to print after the timestamp.
    """
    shanghai_now = datetime.now(pytz.timezone('Asia/Shanghai'))
    stamp = shanghai_now.strftime("%Y-%m-%d %H:%M:%S")
    print(f'{stamp}: {msg}')
    
# Candidate prime seeds:  [7, 53, 97, 317, 577, 997, 7753, 9973, 53113, 99991, 153133, 377171, 515371, 737353, 999983, 5157133, 7757537, 9999991, 99999989, 999999937]
# Seed used for every RNG in this notebook (one of the primes listed above).
RANDOM_SEED = 53113
seed_all(RANDOM_SEED)

# Root directory of the competition data.
dataset_path = '../input/ranzcr-clip-catheter-line-classification'
# Not the last expression of the cell, so the listing is not displayed;
# the call still raises if the dataset directory is missing.
os.listdir(dataset_path)
# Reference point for the total-runtime report printed by the final cell.
global_start_t = time.time()

# Timestamped marker showing that this cell has finished running.
ftpt()

In [None]:
# Directory holding the training images (trailing slash kept because later
# cells build paths with plain string concatenation).
TRAIN_PATH = '../input/ranzcr-clip-catheter-line-classification/train/'
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the preview below) reproducible between runs.
file_names = sorted(os.listdir(TRAIN_PATH))
# A bare expression is only displayed when it is the last one in the cell,
# so the count is printed explicitly.
print(len(file_names))
file_names[:5]

In [None]:
# import cv2

# total_t_1, WBB_t = 0, 0
# for i, file_name in enumerate(file_names):
#     file_path = TRAIN_PATH + file_name

#     start_t_1 = time.time()
#     image = cv2.imread(file_path)
#     total_t_1 += time.time() - start_t_1
    
#     image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
#     start_t_3 = time.time()
#     mask = image > 0
#     image = image[np.ix_(mask.any(1), mask.any(0))]
#     WBB_t += time.time() - start_t_3
    
#     if i%200==0:
#         print('file_name: ', file_name)
#         print(f'total_t_1: {total_t_1:.5f}, WBB_t: {WBB_t:.5f}')
    
# print(f'Final total_t_1: {total_t_1:.5f}, WBB_t: {WBB_t:.5f}')

############################################################################################

# file_name:  1.2.826.0.1.3680043.8.498.77518942369171843789006015594910539466.jpg
# total_t_1: 92.87874, WBB_t: 62.81078
# file_name:  1.2.826.0.1.3680043.8.498.73171729950725506271487678682474036147.jpg
# total_t_1: 103.78828, WBB_t: 69.73897

In [None]:
import shutil

import cv2

# Directory that receives the cropped training images.
OUTPUT_DIR = './train_data_wbb/'

# Start from an empty output directory on every run. shutil.rmtree works on
# any platform, uses OUTPUT_DIR instead of a duplicated literal path, and
# raises if the removal fails instead of silently leaving stale files behind.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

os.makedirs(OUTPUT_DIR, exist_ok=True)

ftpt()

In [None]:
# For every training image, drop the rows and columns that contain no non-zero
# pixel and save the result to OUTPUT_DIR when that changed the image shape.
# NOTE(review): np.ix_ with the row/column masks removes *every* all-zero row
# or column, not only those on the border - confirm this is intended.
cnt = 0      # images that were cropped and written successfully
skipped = 0  # files that could not be read, cropped, or written
start_t = time.time()
for i, file_name in enumerate(file_names):
    file_path = TRAIN_PATH + file_name

    # cv2.imread does not raise on an unreadable file; it returns None.
    image1 = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if image1 is None:
        skipped += 1
        print(f'skipped (cannot read image): {file_path}')
    else:
        mask = image1 > 0
        image2 = image1[np.ix_(mask.any(1), mask.any(0))]

        if image2.size == 0:
            # Completely black image: nothing is left to write after cropping.
            skipped += 1
            print(f'skipped (image has no non-zero pixel): {file_path}')
        elif image1.shape != image2.shape:
            target_file_path = os.path.join(OUTPUT_DIR, file_name)
            # cv2.imwrite reports failure through its return value.
            if cv2.imwrite(target_file_path, image2):
                cnt += 1
            else:
                skipped += 1
                print(f'skipped (cannot write image): {target_file_path}')

    if i%500==0:
        print(f'current i: {i}, cnt: {cnt}')

ftpt(f'cnt: {cnt} total cost time: {time.time()-start_t:.3f}')
if skipped:
    ftpt(f'skipped files: {skipped}')

In [None]:
# Build an all-zero baseline submission from the sample submission file.
test = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/sample_submission.csv')

# Columns 1..11 are taken as the prediction targets; column 0 is the image id.
target_cols = test.columns[1:12].tolist()
test[target_cols] = 0

submission_columns = ['StudyInstanceUID'] + target_cols
test.loc[:, submission_columns].to_csv('./submission.csv', index=False)
print('test.head() is ', test.head())

# Total wall-clock time since the setup cell recorded global_start_t.
print(f'finished total cost time: {time.time()-global_start_t:.5f}')