In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the training annotations table.
df = pd.read_csv("../input/ranzcr-clip-catheter-line-classification/train.csv")

# Target columns, in the order used for the label matrix and the model outputs.
LABELS = [
    # ETT
    'ETT - Abnormal',
    'ETT - Borderline',
    'ETT - Normal',
    # NGT
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
    # CVC
    'CVC - Abnormal',
    'CVC - Borderline',
    'CVC - Normal',
    # Swan Ganz
    'Swan Ganz Catheter Present',
]

In [None]:
# `df` was already loaded from train.csv in the previous cell, so preview it
# here instead of reading the same file a second time.
df.head()

In [None]:
DEBUG = True
if DEBUG:
    # Work on a 1% subsample for fast iteration. The fixed random_state makes
    # the subsample identical on every Restart & Run All.
    # NOTE: this rebinds `df`, so re-running the cell shrinks the frame again.
    df = df.sample(frac=0.01, random_state=42).reset_index(drop=True)
    print(df.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The fixed random_state keeps the
# split reproducible across runs.
train, valid = train_test_split(df, test_size=0.1, random_state=42)
train.head()

In [None]:
# Show the (rows, columns) shape of the training and validation splits.
print(train.shape,valid.shape)

In [None]:
# Value in the first column of the first training row; the next cell uses it
# as the image file stem (presumably the StudyInstanceUID - confirm column order).
path= train.iloc[0 ,0]
path

In [None]:
# Build the full image path directly from the DataFrame rather than from the
# previous value of `path`, so re-running this cell does not prepend the
# directory and append the extension a second time.
path = "../input/ranzcr-clip-catheter-line-classification/train" + "/" + train.iloc[0, 0] + ".jpg"
path

In [None]:
import cv2 
# Read the image from disk as a NumPy array.
# NOTE: cv2.imread returns None (it does not raise) when the file cannot be
# read, in which case `.shape` below fails with AttributeError.
image= cv2.imread(path)
image.shape

In [None]:
import matplotlib.pyplot as plt
# Display the array exactly as loaded by cv2.imread (no colour conversion yet).
plt.imshow(image)
plt.show()

In [None]:
# Convert from OpenCV's BGR channel order to the RGB order matplotlib expects.
# NOTE: this rebinds `image`, so re-running the cell applies the swap again and
# flips the channels back.
image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
plt.imshow(image)
plt.show()

In [None]:
# Repeat the load for the second training row and inspect its shape.
path = train.iloc[1, 0]
path = "../input/ranzcr-clip-catheter-line-classification/train" + "/" + path + ".jpg"
image2 = cv2.imread(path)
image2.shape

In [None]:
# Display the second image as loaded (no BGR -> RGB conversion is applied here).
plt.imshow(image2)
plt.show()

In [None]:
from albumentations import Resize
# Apply the transform and show its raw return value; the following cell reads
# the resized array from its "image" key.
dummy = Resize(width = 300, height = 300)(image = image)
dummy

In [None]:
# Resize to 300x300 and keep only the transformed array.
image = Resize(width = 300, height = 300)(image = image)["image"]
image.shape

In [None]:
from albumentations.pytorch import ToTensorV2
# Convert the array to a torch tensor.
# NOTE: this rebinds `image` from an array to a tensor, so the cell is meant to
# be run once per load - TODO confirm behaviour when re-run on a tensor.
image = ToTensorV2()(image = image)["image"]
image.shape

In [None]:


from torch.utils.data import Dataset

class TrainDataset(Dataset):
    """Map-style dataset returning ``(image, labels)`` for one row of ``df``.

    Args:
        df: DataFrame with a ``StudyInstanceUID`` column and one column per
            entry of ``LABELS``.
        image_dir: Directory containing ``<StudyInstanceUID>.jpg`` files.
        image_size: Height and width, in pixels, that every image is resized to.

    Raises:
        FileNotFoundError: from ``__getitem__`` when the image file is missing
            or cannot be decoded.
    """

    def __init__(self, df,
                 image_dir="../input/ranzcr-clip-catheter-line-classification/train",
                 image_size=300):
        self.df = df
        self.studyuid = df["StudyInstanceUID"].values
        self.labels = df[LABELS].values
        self.image_dir = image_dir
        # Build the transforms once instead of on every __getitem__ call.
        self.resize = Resize(height=image_size, width=image_size)
        self.to_tensor = ToTensorV2()

    def __len__(self):
        """Number of rows (samples) in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load, colour-convert, resize and tensorise the image at ``idx``."""
        path = self.image_dir + "/" + self.studyuid[idx] + ".jpg"
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None; fail with a clear
        # message here rather than with an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.resize(image=image)["image"]
        image = self.to_tensor(image=image)["image"]
        labels = self.labels[idx]
        return image, labels



The dataset is created in the class.

`__init__`: Runs once when the dataset is created. The argument is a DataFrame such as train. `self` is the required first parameter of every method, so always include it.

`__len__`: Required to define the data size. It is basically the number of rows of the DataFrame passed at initialization.

`__getitem__`: Required when retrieving data. The index is the argument.

When retrieving data, the index becomes an argument, so for example, when 0 is entered, the first path of studyuid will be the target.

After that, the same processing as before is applied, and the image data is returned as image together with the corresponding label (correct answer) as labels.

In [None]:
# Wrap the training split and fetch the first (image, labels) pair as a smoke test.
train_dataset = TrainDataset(train)
train_dataset[0]


When you create a dataset, you pass in a pandas dataframe. This is the init argument.

Let's actually pass 0 and see the first data.

Image data is output first, and label data is output next.

In [None]:
image, label = train_dataset[0]
# Reorder the tensor's axes (1, 2, 0) before handing it to imshow.
plt.imshow(image.permute(1, 2, 0))
plt.show()
print(label)


In this way, we have created a system that retrieves images and labels using only indexes.

4. DataLoader


Put the created dataset in the data loader.

In [None]:
from torch.utils.data import DataLoader

Import the data loader

In [None]:
# Batches of 8 samples, drawn in a new random order on every pass.
train_loader = DataLoader(train_dataset, batch_size = 8, shuffle = True)


batch_size: How many images to take out at one time. A larger batch trains faster but uses more memory. A smaller batch uses less memory but takes longer to train, and each update is more strongly affected by the characteristics of individual images.

shuffle: Take out in random order.

There are other things such as drop_last, so please check them if you want to learn in earnest.

In [None]:
# batch[0] is the image batch; print its shape for every batch in one pass.
for batch in train_loader:
    print(batch[0].shape)


All data can be retrieved with the for statement.

Since batch_size is set to 8, 8 images are output at once (the last batch may contain fewer).

In [None]:
# The validation split reuses TrainDataset; shuffle is off so the order is fixed.
valid_dataset = TrainDataset(valid)
valid_loader = DataLoader(valid_dataset, batch_size = 16, shuffle = False)

5. Modeling

The model uses EfficientNet.

In [None]:
import sys
# Make the timm sources attached as an input dataset importable.
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
import timm
from pprint import pprint
# List the model names that timm reports as having pretrained weights.
pprint(timm.list_models(pretrained = True))

There are several ways to use EfficientNet.

This time I used a set of image classification models called timm. Since it is uploaded to the Dataset, let's put it in the input from "+ Add data".

There is also a way to install it with pip install, but in this competition you can not use it because you can not connect to the net at the time of submission.

In [None]:
import torch.nn as nn

class Effnet(nn.Module):
    """timm backbone whose classifier is replaced by a linear head.

    Args:
        model_name: Name passed to ``timm.create_model``. The model must
            expose its head as ``.classifier``, as the default one does here.
        pretrained: Passed to ``timm.create_model``.
        num_classes: Number of output logits; defaults to ``len(LABELS)``.
    """

    def __init__(self, model_name="tf_efficientnet_b0", pretrained=False,
                 num_classes=None):
        super().__init__()
        # Resolve the default at call time so it always tracks LABELS.
        if num_classes is None:
            num_classes = len(LABELS)
        self.effnet = timm.create_model(model_name=model_name, pretrained=pretrained)
        # Swap the original head for one sized to our label set.
        n_features = self.effnet.classifier.in_features
        self.effnet.classifier = nn.Linear(n_features, num_classes)

    def forward(self, x):
        """Return the raw logits for a batch ``x`` (no sigmoid is applied)."""
        return self.effnet(x)


Create a class using Module in torch.nn.

Since super and init are fixed phrases, let's write them without worrying about them.

Create EfficientNet with timm.create_model. Select the model name to specify from the list output earlier.

EfficientNet has B0 to B7, and this time it is B0.

If pretrained = True, it will be a trained model, but it cannot be used with net OFF because parameters need to be downloaded from the net.

I want to change the final output format, so replace the .classifier part with Linear (fully connected layer).

Since the input size at this time is required, let's get it as n_features. The output size is the number of LABELS you want to predict.

forward is the function that runs when the model is called on data, during both training and prediction. It returns the result of passing the input x through EfficientNet.

In [None]:
# Instantiate the network; the bare expression prints its layer structure.
model = Effnet()
model

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Specify whether to use CPU or GPU for calculation.

You can turn on the GPU from the "setting" on the far right. (Currently 43 hours a week free)

If it is ON, torch.cuda.is_available will be True, so DEVICE will be cuda (GPU type). If False, it remains the CPU.

In [None]:
# Move the model's parameters to the selected device and report which one it is.
model = model.to(DEVICE)
print(DEVICE)

6. Learning
The steps to learn are as follows.

・ Determine the loss function

・ Determine the optimizer

・ Train with train_loader

・ Check the performance with valid_loader

In [None]:
# The loss takes raw logits, matching the model, which applies no sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

It is a loss function. After scaling the output result to the range of 0 to 1 (sigmoid function), the error from the prediction is calculated.

In [None]:
# Adam with its default hyper-parameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

It is an optimization method. There are various things, but I chose Adam, which is a major one.

Let's pass the parameters of the model we made earlier

In [None]:
# One pass over the training data.
model.train()
for X, y in train_loader:
    # Clear gradients left over from the previous batch before the new backward pass.
    optimizer.zero_grad()
    # Cast inputs and targets to float and move them to the model's device.
    X = X.float().to(DEVICE)
    y = y.float().to(DEVICE)
    pred = model(X)
    loss = criterion(pred, y)
    # Back-propagate, then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()

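First, put the model in training mode with .train(). This switches layers such as dropout and batch normalization to their training behavior.

Reset the gradients accumulated from the previous batch with optimizer.zero_grad() before each backward pass.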

The for statement pulls data from train_loader. I took it out as X and y.

When learning pytorch, it is necessary to make it a float type, so let's convert it with .float.

In addition, it is necessary to set CPU or GPU with to (DEVICE) even for the data to be included in the model. This is also easy to forget.

If you put X in model, it will be output as a prediction label through EfficientNet, so let's pass it to the loss function.

In [None]:
# One pass over the validation data, reporting the mean loss per sample.
model.eval()
valid_loss = 0.0
n_seen = 0
with torch.no_grad():
    for X, y in valid_loader:
        X = X.float().to(DEVICE)
        y = y.float().to(DEVICE)
        pred = model(X)
        loss = criterion(pred, y)
        # criterion returns the batch mean, so weight it by the batch size.
        # Averaging the batch means directly would over-weight a smaller
        # final batch.
        valid_loss += loss.item() * X.size(0)
        n_seen += X.size(0)
valid_loss /= n_seen
print("Loss:", valid_loss)

Check the performance with the evaluation data.

First change to evaluation mode with .eval(). This switches layers such as dropout and batch normalization to their inference behavior.

At the time of evaluation we do not update the parameters of the model, so wrap the loop in torch.no_grad() to turn off gradient tracking, which also saves memory.

Let's take out Xy and make it a float type and predict it in the same way as when learning.

Next we calculate the loss function, but this time we don't need to backward the error to the model.

Let's average the error in all batches. This is the performance in the first learning.

In [None]:
N_EPOCHS = 10  # number of full passes over the training data


def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass of ``model`` over every batch in ``loader``."""
    model.train()
    for X, y in loader:
        X = X.float().to(device)
        y = y.float().to(device)
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        loss = criterion(model(X), y)
        loss.backward()
        optimizer.step()


def evaluate(model, loader, criterion, device):
    """Return the mean loss per sample of ``model`` over ``loader``.

    ``criterion`` is expected to return a batch mean. Each batch is weighted by
    its size so that a smaller final batch is not over-weighted.
    """
    model.eval()
    total_loss = 0.0
    n_seen = 0
    with torch.no_grad():
        for X, y in loader:
            X = X.float().to(device)
            y = y.float().to(device)
            batch_size = X.size(0)
            total_loss += criterion(model(X), y).item() * batch_size
            n_seen += batch_size
    return total_loss / n_seen


# Start again from a fresh model so this cell is self-contained.
model = Effnet().to(DEVICE)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

best_loss = np.inf
for epoch in range(N_EPOCHS):
    train_one_epoch(model, train_loader, criterion, optimizer, DEVICE)
    valid_loss = evaluate(model, valid_loader, criterion, DEVICE)
    print(f"EPOCH:{epoch}, Loss:{valid_loss}")
    # Keep only the checkpoint with the lowest validation loss so far.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "effnet.pth")
        print("saved...")

I want to recreate the model once, so I summarized what I have done so far.

Define the minimum error as best_loss. It starts at infinity so that the first epoch always counts as an improvement.

If the error in the evaluation data is smaller than the minimum error so far, update it. Then save the model.

By doing this, the saved file is overwritten only when the validation error improves, so at the end of training it holds the model with the smallest error.

This is the end of learning. What I introduced this time is at least what is necessary to build a model

7. Forecast

In [None]:
class TestDataset(Dataset):
    """Map-style dataset returning only the image for one row of ``df``.

    Args:
        df: DataFrame with a ``StudyInstanceUID`` column.
        image_dir: Directory containing ``<StudyInstanceUID>.jpg`` files.
        image_size: Height and width, in pixels, that every image is resized to.

    Raises:
        FileNotFoundError: from ``__getitem__`` when the image file is missing
            or cannot be decoded.
    """

    def __init__(self, df,
                 image_dir="../input/ranzcr-clip-catheter-line-classification/test",
                 image_size=300):
        self.df = df
        self.studyuid = df["StudyInstanceUID"].values
        self.image_dir = image_dir
        # Build the transforms once instead of on every __getitem__ call.
        self.resize = Resize(height=image_size, width=image_size)
        self.to_tensor = ToTensorV2()

    def __len__(self):
        """Number of rows (samples) in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load, colour-convert, resize and tensorise the image at ``idx``."""
        path = self.image_dir + "/" + self.studyuid[idx] + ".jpg"
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None; fail with a clear
        # message here rather than with an opaque error inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.resize(image=image)["image"]
        image = self.to_tensor(image=image)["image"]
        return image

Create a dataset for test. Almost the same as for learning.

Please note that the path is the path of test.

Also, since it does not have a correct label, the output is only image.

In [None]:
# The sample submission supplies the StudyInstanceUID column that TestDataset reads.
test = pd.read_csv("../input/ranzcr-clip-catheter-line-classification/sample_submission.csv")
test_dataset = TestDataset(test)
# shuffle is off so predictions stay in the same row order as `test`.
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle = False)

You have defined a dataset and a data loader.

Mainly doing the same as valid_loader

In [None]:
# Rebuild the architecture, then restore the best weights saved during training.
model = Effnet().to(DEVICE)
# map_location lets a checkpoint saved on a GPU be loaded in a CPU-only
# session (and vice versa).
model.load_state_dict(torch.load("./effnet.pth", map_location=DEVICE))

In [None]:
# Collect per-batch probabilities, then stack them into one (rows x labels) array.
model.eval()
submit_preds = []
with torch.no_grad():
    for X in test_loader:
        X = X.float().to(DEVICE)
        probs = model(X).sigmoid()
        # Move to the CPU before converting to NumPy.
        submit_preds.append(probs.to("cpu").numpy())
submit_preds = np.concatenate(submit_preds, axis=0)


Load the data from test_loader and pass it to the model.

Let's scale the output value from 0 to 1 with .sigmoid.

Tensors on the GPU cannot be converted to NumPy arrays directly, so move each prediction to the CPU with .to("cpu") first; otherwise an error will occur later.

Put the prediction result of each batch in the list (submit_preds), and finally join it in the row direction (axis = 0) with .concatenate of numpy.

You now have a forecast for submission.

In [None]:
# One column of predicted probabilities per label, in LABELS order.
submit = pd.DataFrame(submit_preds, columns = LABELS)
submit.head()

In [None]:
# Put the ID column first, followed by the label columns. Selecting the label
# columns by name (rather than by position) keeps this cell idempotent:
# re-running it no longer scrambles the column order.
submit = pd.concat([test[["StudyInstanceUID"]], submit[LABELS]], axis=1)
submit.to_csv("submission.csv", index=False)