In [None]:
# Long-format listing of ../input/planet-understanding-the-amazon-from-space.
# NOTE(review): later cells read from ../input/ directly — confirm which layout is current.
!ls ../input/planet-understanding-the-amazon-from-space -l

In [None]:
# Long-format listing (human-readable sizes) of everything under ../input/.
!ls ../input/ -lh

In [None]:
import numpy as np
import pandas as pd
import os
import gc
import glob

import matplotlib.image as mpimg
from tqdm import tqdm_notebook

In [None]:
# Long-format listing (human-readable sizes) of ../input/; same command as the
# listing cell near the top of the notebook.
!ls ../input/ -lh

# 数据读取

In [None]:
# Read the training labels and replace each bare image id with the relative
# path of its JPEG file, so later cells can open the image directly.
df_train = pd.read_csv('../input/train_v2.csv')
df_train = df_train.assign(
    image_name='../input/train-jpg/' + df_train['image_name'] + '.jpg'
)
df_train.head()

In [None]:
# Read the sample submission to obtain the list of test images, then replace
# each bare image id with the relative path of its JPEG file.
df_test = pd.read_csv('../input/sample_submission_v2.csv')
df_test['image_name'] = '../input/test-jpg/' + df_test['image_name'] + '.jpg'
# The original cell also overwrote df_test['tags'] with the *training* tags
# (aligned by row index). Those values are unrelated to the test images and
# nothing downstream reads them, so the column is left as loaded from the CSV.


In [None]:
# Split the space-separated tag string of every training image into a list.
df_train['tags2'] = df_train['tags'].apply(lambda x: x.split(' '))

# Flatten all per-image tag lists in a single pass (sum(list_of_lists, [])
# copies the accumulator on every step and is quadratic in the number of rows).
labels_list = [tag for tags in df_train['tags2'] for tag in tags]

# Sort the unique tags so the label <-> index mapping is identical on every
# kernel restart; iterating a bare set of strings gives a run-dependent order.
labels = sorted(set(labels_list))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [None]:
label_map

In [None]:
%pylab inline
import seaborn as sns

labels_s = pd.Series(labels_list).value_counts() # To sort them by count
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=labels_s, y=labels_s.index, orient='h')

In [None]:
# Pick one example image per label.
# Membership is tested on the split tag list ('tags2') rather than with
# str.contains on the raw string: a substring match would, for example, let
# 'cloudy' select images that are only tagged 'partly_cloudy'.
images_title = []
for i, label in enumerate(labels):
    has_label = df_train['tags2'].apply(lambda tags: label in tags)
    candidates = df_train.loc[has_label, 'image_name']
    # Keep the "i-th match" choice (it spreads the examples over different
    # images) but wrap around so a label with few images cannot raise IndexError.
    images_title.append(candidates.iloc[i % len(candidates)])

# Size the grid from the number of labels instead of assuming it fits in 5 x 4.
n_cols = 4
n_rows = max(5, -(-len(images_title) // n_cols))  # ceil division, at least 5 rows

plt.rc('axes', grid=False)
fig_examples, axs = plt.subplots(n_rows, n_cols, sharex='col', sharey='row', figsize=(15, 20))
axs = axs.ravel()

for ax, image_name, label in zip(axs, images_title, labels):
    ax.imshow(mpimg.imread(image_name))
    ax.set_title('{}'.format(label))

# Hide the grid cells that did not receive an image.
for ax in axs[len(images_title):]:
    ax.axis('off')

In [None]:
# df_train = df_train.iloc[:3000]

# 数据划分

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the per-image tag lists into a 0/1 indicator matrix with one column per
# tag; the column order is available afterwards as label_encoder.classes_.
label_encoder = MultiLabelBinarizer()
label_encoder.fit(df_train['tags2'])
Y = label_encoder.transform(df_train['tags2'])
Y = Y.astype(float)

In [None]:
# https://github.com/trent-b/iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
image_paths = df_train['image_name'].values

# Iterate over all five folds and print the per-label counts of each side so
# the stratification can be inspected. Every pass overwrites X_train / X_test /
# y_train / y_test, so the split left behind for the following cells is the one
# from the last fold.
for train_index, test_index in mskf.split(image_paths, Y):
    X_train = image_paths[train_index]
    X_test = image_paths[test_index]
    y_train = Y[train_index]
    y_test = Y[test_index]

    for fold_labels in (y_train, y_test):
        print(fold_labels.sum(0))
    print('')

# 定义dataset

In [None]:
import torchvision.datasets as datasets
from torch.utils.data.dataset import Dataset
from PIL import Image
import torch
from torch import nn

from efficientnet_pytorch import EfficientNet

class PlantDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from image file paths.

    Args:
        path: indexable collection of image file paths.
        label: indexable collection of per-image label vectors, aligned with
            ``path`` (``label[i]`` belongs to ``path[i]``).
        transform: optional callable applied to the RGB PIL image before it is
            returned; ``None`` returns the PIL image unchanged.
    """

    def __init__(self, path, label, transform=None):
        self.path = path
        self.label = label
        self.transform = transform

    def __getitem__(self, index):
        """Load image ``index`` as RGB, apply the transform, and return it with its label tensor."""
        # Image.open keeps the underlying file open; convert() produces a new
        # in-memory image, so the file can be closed as soon as it returns.
        # Without this, handles pile up until the garbage collector runs.
        with Image.open(self.path[index]) as raw:
            img = raw.convert('RGB')

        if self.transform is not None:
            img = self.transform(img)
        return img, torch.from_numpy(np.array(self.label[index]))

    def __len__(self):
        """Number of samples, i.e. number of image paths."""
        return len(self.path)
    
class PlantNet(nn.Module):
    """Pretrained EfficientNet whose classification head is replaced by a new linear layer.

    Args:
        num_classes: number of output logits. Defaults to 17, the value that
            was previously hard-coded.
        arch: model name passed to ``EfficientNet.from_pretrained``. Defaults
            to ``'efficientnet-b5'``, the variant that was previously hard-coded.

    The forward pass returns raw logits (no sigmoid is applied here).
    """

    def __init__(self, num_classes=17, arch='efficientnet-b5'):
        super(PlantNet, self).__init__()

        model = EfficientNet.from_pretrained(arch)
        # Take the input width from the existing head instead of hard-coding
        # 2048, so the layer still fits if a different variant is selected.
        model._fc = nn.Linear(model._fc.in_features, num_classes)
        # The attribute keeps its historical name `resnet` so that state_dicts
        # saved by earlier runs still load.
        self.resnet = model

    def forward(self, img):
        """Return the logits produced by the backbone for a batch of images."""
        return self.resnet(img)

In [None]:
# Training
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        model: network to optimise; put into training mode here.
        device: device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches exposing ``.dataset``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(output, target)``.
        epoch: epoch number, used only in the progress message.

    Progress is printed every 10 batches; nothing is returned.
    """
    model.train()  # training mode (e.g. dropout active)
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        output = model(data)  # forward pass
        # Labels built with numpy `astype(float)` arrive as float64 while the
        # model emits float32; cast to the output dtype so the loss and its
        # backward pass do not hit a dtype mismatch.
        target = target.to(device=device, dtype=output.dtype)
        loss = criterion(output, target)  # compute the loss
        loss.backward()  # back-propagate
        optimizer.step()  # update the parameters
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

# 测试
def test(model, device, test_loader, criterion):
    model.eval() 
    model_predict, model_target = [], []
    test_loss = 0
    with torch.no_grad():
        for data, target in tqdm_notebook(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            
            model_predict.append(output.sigmoid().data.cpu().numpy())
            model_target.append(target.data.cpu().numpy())
            
            test_loss += criterion(output, target).item()  # sum up batch loss

    test_loss /= len(test_loader.dataset)
    
    f2_socre = []
    model_predict = np.vstack(model_predict)
    model_target = np.vstack(model_target)
    
    for idx in range(len(model_predict)):
        f2_socre.append(fbeta_score(model_predict[idx,:]>0.5, model_target[idx,:], beta=2))
    f2_socre = np.mean(f2_socre)
    
    print('\nTest set: Average loss: {:.4f} F2 Score: {:.4f}\n'.format(test_loss, f2_socre))
    
    return test_loss, f2_socre, model_predict

In [None]:
import torchvision.transforms as transforms

# Pre-processing shared by both pipelines: resize to a fixed square and
# normalise each channel. (The mean/std values look like the usual ImageNet
# statistics — confirm they match the pretrained backbone.)
image_size = (256, 256)
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

# Training pipeline: adds light colour jitter and random flips as augmentation.
train_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ColorJitter(hue=.05, saturation=.05),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Validation pipeline: deterministic, no augmentation.
valid_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
    normalize,
])

train_dataset = PlantDataset(X_train, y_train, train_transform)
valid_dataset = PlantDataset(X_test, y_test, valid_transform)

# Note: `test_loader` here iterates the *validation* split; the training loop
# below uses it under that name.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)
test_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=20)

In [None]:
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = PlantNet().to(device)
# Multi-label objective: an independent sigmoid + binary cross-entropy per tag.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), 0.001)

In [None]:
from sklearn.metrics import fbeta_score

# Start from +inf so the first epoch always writes a checkpoint; with an
# arbitrary finite sentinel a run whose loss never drops below it would save
# nothing, and the later load of 'plant_net.pt' would fail.
best_loss = float('inf')
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test_loss, f2_socre, _ = test(model, device, test_loader, criterion)
    # Keep only the weights of the epoch with the lowest validation loss.
    if test_loss < best_loss:
        best_loss = test_loss
        torch.save(model.state_dict(), 'plant_net.pt')

# 提交结果

In [None]:
# The test images have no labels; feed all-zero placeholders so PlantDataset
# and test() can be reused. The width follows the fitted encoder rather than a
# hard-coded 17, so it cannot drift from the label matrix used for training.
n_classes = len(label_encoder.classes_)
test_dataset = PlantDataset(df_test['image_name'].values, np.zeros((df_test.shape[0], n_classes)),
                            transforms.Compose([
                            transforms.Resize((256, 256)),
                            transforms.ToTensor(),
                            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]))

# shuffle=False keeps the predictions in the same order as the rows of df_test.
# NOTE: this rebinds `test_loader`, which previously held the validation split;
# re-running the training cell after this one would validate on dummy labels.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=40, shuffle=False)

In [None]:
# Restore the best checkpoint. map_location makes the load work even if the
# weights were saved on a different device than the one in use now.
model.load_state_dict(torch.load('plant_net.pt', map_location=device))
# The labels in this loader are all-zero placeholders, so the loss and F2 that
# test() prints and returns here are meaningless; only the predictions are
# kept, and the validation values in test_loss / f2_socre are left untouched.
dummy_loss, dummy_f2, test_predict = test(model, device, test_loader, criterion)

In [None]:
# Convert each row of probabilities into a space-separated tag string.
pred_label = []
for pred in test_predict:
    pred_idx = np.where(pred > 0.5)[0]
    if pred_idx.size == 0:
        # Nothing cleared the threshold: submit the single most likely tag
        # instead of an empty string, which can never match any true tag.
        pred_idx = np.array([pred.argmax()])
    pred_label.append(' '.join(label_encoder.classes_[pred_idx]))

In [None]:
# Reload the sample submission (restoring the bare image ids that an earlier
# cell replaced with file paths) and overwrite its tags with the predictions.
df_test = pd.read_csv('../input/sample_submission_v2.csv').assign(tags=pred_label)

In [None]:
# Write the submission without the DataFrame index column. `index` is
# documented as a bool, so pass False explicitly rather than relying on None
# being falsy.
df_test.to_csv('tmp.csv', index=False)

# 阅读链接

- [1st place winner's interview](https://medium.com/kaggle-blog/planet-understanding-the-amazon-from-space-1st-place-winners-interview-bf66fb444bc2)
- [Cadene/pretrained-models.pytorch](https://github.com/Cadene/pretrained-models.pytorch)
- [lukemelas/EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)