# 熟悉Kaggle Kernel環境
* 這個環境是kaggle/python Docker image建起來的，有很多packages
* RAM, DISK, GPU
* paths

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Peek at the input directory: print at most the first file that sits
# directly inside /kaggle/input instead of listing the whole tree.
first_entry = next(os.walk('/kaggle/input'), None)
if first_entry is not None:
    dirname, _, filenames = first_entry
    if filenames:
        print(os.path.join(dirname, filenames[0]))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os

# Show the notebook's current working directory.
working_dir = os.getcwd()
print(working_dir)

# Show the top-level entries of the competition data folder.
dataset_entries = os.listdir("/kaggle/input/shopee-code-league-20/_DA_Product_Detection")
print(dataset_entries)

# Product Detection

# Problem Definition

https://www.kaggle.com/davydev/shopee-code-league-20/tasks?taskId=1550

* problem description
* submission example
* metrics

# Import Packages

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import torch
import torch.nn.functional as F

from PIL import Image
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms, models
from tqdm import tqdm

# Set Seeds

In [None]:
SEED = 5566

# Seed every RNG the notebook draws from (Python, NumPy, PyTorch) so that
# random_split, DataLoader shuffling and dropout are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Explicitly seed all CUDA devices as well; this is a no-op without a GPU.
torch.cuda.manual_seed_all(SEED)

# Seeding alone does not make GPU runs repeatable: cuDNN may pick
# non-deterministic kernels. Trade a little speed for reproducibility.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load data
* load csv
* dataset
* dataloader

## Load csv

In [None]:
# Root folder of the competition data.
DATA_PATH = "/kaggle/input/shopee-code-league-20/_DA_Product_Detection"

# Image folders (each split is nested one level deeper: train/train, test/test).
TRAIN_PATH = os.path.join(DATA_PATH, "train", "train")
TEST_PATH = os.path.join(DATA_PATH, "test", "test")

# CSV tables describing each split.
TRAIN_CSV_PATH = os.path.join(DATA_PATH, "train.csv")
TEST_CSV_PATH = os.path.join(DATA_PATH, "test.csv")

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Preview the first rows of both tables.
for frame in (train_df, test_df):
    display(frame.head())

In [None]:
# Distinct category labels in each table (train labels shown in sorted order).
train_categories = sorted(train_df["category"].unique())
print(train_categories)
print(test_df["category"].unique())

In [None]:
def _two_digit(label):
    """Render a category label as a string left-padded with zeros to width 2."""
    return str(label).zfill(2)


# Training images are addressed as "<two-digit category>/<filename>".
train_str_category = train_df.category.map(_two_digit)
train_df['file_path'] = train_str_category + os.sep + train_df.filename

# Test images are addressed by filename only.
# NOTE(review): test_str_category is computed but not used anywhere in the
# visible part of this notebook - confirm whether it can be dropped.
test_str_category = test_df.category.map(_two_digit)
test_df['file_path'] = test_df.filename

for frame in (train_df, test_df):
    display(frame)

In [None]:
# Open the image referenced by the first training row; as the last expression
# of the cell, the notebook renders it.
first_train_file = train_df.file_path.iloc[0]
train_sample_path = os.path.join(TRAIN_PATH, first_train_file)
Image.open(train_sample_path)

In [None]:
# Open the image referenced by the first test row; as the last expression of
# the cell, the notebook renders it.
first_test_file = test_df.file_path.iloc[0]
test_sample_path = os.path.join(TEST_PATH, first_test_file)
Image.open(test_sample_path)

## Dataset

* transforms
* 使用ImageFolder, https://pytorch.org/vision/0.8/_modules/torchvision/datasets/folder.html#ImageFolder

In [None]:
# Both pipelines resize every image to 224x224 and convert it to a tensor.
# NOTE(review): no Normalize step is applied although get_model() loads
# pretrained weights - confirm whether skipping mean/std normalisation is
# intentional before changing it (existing checkpoints were trained this way).
train_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

test_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

In [None]:
# Build the labelled training dataset from the class-per-folder layout under
# TRAIN_PATH, applying the training transforms to every image.
train_set = datasets.ImageFolder(root=TRAIN_PATH, transform=train_transforms)

In [None]:
# Sanity check: fetch the first sample and print the dataset size and its
# class-name <-> index mapping.
image, label = train_set[0]
report = (
    ("image", image),
    ("Label", label),
    ("Len", len(train_set)),
    ("train_data.classes", train_set.classes),
    ("train_data.class_to_idx", train_set.class_to_idx),
)
for tag, value in report:
    print(tag, value)

In [None]:
# Always split the full ImageFolder dataset. If this cell is re-run,
# train_set is already a Subset, so unwrap it first instead of splitting an
# 80% slice again.
full_train_set = (
    train_set.dataset
    if isinstance(train_set, torch.utils.data.Subset)
    else train_set
)

# random_split requires the lengths to add up to len(dataset) exactly, so
# derive them from the dataset itself rather than from the CSV row count
# (the two can disagree if the CSV and the image folder are out of sync).
num_samples = len(full_train_set)
split_idx = num_samples * 4 // 5  # 80% train / 20% validation

train_set, val_set = torch.utils.data.random_split(
    full_train_set, [split_idx, num_samples - split_idx]
)

# The test images carry no labels. ImageFolder is pointed at the parent of
# TEST_PATH so that the test folder itself is picked up as the only "class".
test_set = datasets.ImageFolder(os.path.join(TEST_PATH, ".."),
                                transform=test_transforms)

In [None]:
# Sanity check of the training split: first sample and number of samples.
image, label = train_set[0]
for tag, value in (("image", image), ("Label", label), ("Len", len(train_set))):
    print(tag, value)

In [None]:
# Sanity check of the validation split: first sample and number of samples.
image, label = val_set[0]
for tag, value in (("image", image), ("Label", label), ("Len", len(val_set))):
    print(tag, value)

In [None]:
# Sanity check of the test dataset: first sample, size and the class mapping
# that ImageFolder derived from the folder layout.
image, label = test_set[0]
report = (
    ("image", image),
    ("Label", label),
    ("Len", len(test_set)),
    ("test_set.classes", test_set.classes),
    ("test_set.class_to_idx", test_set.class_to_idx),
)
for tag, value in report:
    print(tag, value)

## DataLoader

In [None]:
# Only the training loader shuffles; the test loader serves one image at a time.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1)

# Pull one batch from each loader and show the tensor shapes.
for loader in (train_loader, val_loader, test_loader):
    images, labels = next(iter(loader))
    print(images.shape)
    print(labels.shape)

# Explore Datasets
* 看看資料長什麼樣子，有哪些種類
* 每個種類的分佈是不是一樣？

In [None]:
def plot_samples(df, data_root_path, category, num_imgs=20, num_cols=5):
    """Show a grid of randomly chosen images that belong to one category.

    Args:
        df: DataFrame with a ``category`` column and a ``file_path`` column
            holding image paths relative to ``data_root_path``.
        data_root_path: Directory the ``file_path`` values are joined to.
        category: Value of ``df.category`` to draw the samples from.
        num_imgs: Maximum number of images to show. If the category has
            fewer rows, all of them are shown instead of raising.
        num_cols: Number of columns in the grid.

    Returns:
        None. The figure is displayed with ``plt.show()``; nothing is drawn
        when the category has no rows.
    """
    candidates = df[df.category == category].file_path

    # Series.sample raises when asked for more rows than exist, so cap the
    # request at what is available.
    num_imgs = min(num_imgs, len(candidates))
    if num_imgs == 0:
        return

    paths = candidates.sample(num_imgs)

    # Ceiling division: a partially filled last row still needs a row.
    num_rows = -(-num_imgs // num_cols)

    fig = plt.figure(figsize=(num_cols * 4, num_rows * 4))
    for idx, path in enumerate(paths):
        ax = fig.add_subplot(num_rows, num_cols, idx + 1)
        ax.axis('off')

        im = cv2.imread(os.path.join(data_root_path, path))
        if im is None:
            # cv2.imread returns None instead of raising when a file is
            # missing or cannot be decoded; leave the cell empty but labelled.
            ax.set_title("unreadable: {}".format(path), fontsize=8)
            continue

        im_resized = cv2.resize(im, (224, 224), interpolation=cv2.INTER_LINEAR)
        # OpenCV loads images as BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(im_resized, cv2.COLOR_BGR2RGB))

    # Lay the grid out once, after every subplot exists.
    fig.tight_layout()
    plt.show()

In [None]:
# One row of five random training images per category, in label order.
train_categories = sorted(train_df.category.unique())
for category in train_categories:
    print(category)
    plot_samples(train_df, TRAIN_PATH, category, num_imgs=5, num_cols=5)

In [None]:
# One row of five random test images per category value found in test_df.
test_categories = sorted(test_df.category.unique())
for category in test_categories:
    print(category)
    plot_samples(test_df, TEST_PATH, category, num_imgs=5, num_cols=5)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the training labels, using as many bins as there are
# distinct categories.
category_labels = train_df.category
num_bins = len(category_labels.unique())

plt.figure()
plt.hist(category_labels, bins=num_bins)
plt.show()

# Exact number of training rows per category, ordered by label.
per_category_counts = category_labels.value_counts().sort_index()
print(per_category_counts)

# baseline模型
* 建立一個簡單的模型做為baseline

In [None]:
def get_model(device, num_classes=42, pretrained=True):
    """Build the baseline classifier: a frozen ResNet-50 with a new head.

    Every parameter of the ResNet-50 is frozen, then its final ``fc`` layer
    is replaced by a small trainable head that ends in ``LogSoftmax``, so the
    model outputs log-probabilities (pair it with ``nn.NLLLoss``).

    Args:
        device: Device the model is moved to before it is returned.
        num_classes: Number of output classes of the new head.
        pretrained: Forwarded to ``models.resnet50``. Pass ``False`` when the
            weights are about to be overwritten by a checkpoint anyway, to
            skip loading the pretrained weights.

    Returns:
        The model, already moved to ``device``.
    """
    model = models.resnet50(pretrained=pretrained)

    # Freeze the backbone. The head created below consists of fresh modules,
    # whose parameters require gradients by default.
    for param in model.parameters():
        param.requires_grad = False

    # Read the feature width from the layer being replaced instead of
    # hard-coding it.
    in_features = model.fc.in_features
    model.fc = nn.Sequential(nn.Linear(in_features, 512),
                             nn.ReLU(),
                             nn.Dropout(0.2),
                             nn.Linear(512, num_classes),
                             nn.LogSoftmax(dim=1))

    model.to(device)

    return model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = get_model(device)

# The model ends in LogSoftmax, so negative log-likelihood is the matching loss.
criterion = nn.NLLLoss()

# Only the replacement head (model.fc) is handed to the optimizer.
optimizer = optim.Adam(model.fc.parameters(), lr=0.003)

print(model)
print(device)

# Training
* Parameters
* Training model

In [None]:
num_epochs = 51          # epochs are numbered 0 .. num_epochs - 1
loss_interval = 100      # print the mean loss every `loss_interval` batches
LOAD_MODEL = True        # resume from a saved checkpoint instead of starting fresh

if LOAD_MODEL:
    LOAD_EPOCH = 10
    checkpoint_path = "../input/model-10/model_{}.pth".format(LOAD_EPOCH)
    # map_location lets a checkpoint that was saved on a GPU be loaded in a
    # CPU-only session (and vice versa) instead of failing on deserialisation.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epochs = range(checkpoint['epoch'] + 1, num_epochs) # start from new epoch

else:
    epochs = range(num_epochs)


model.train()
for epoch in tqdm(epochs):
    running_loss = 0.0

    for idx, (inputs, labels) in enumerate(train_loader):

        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Report the mean loss of the last `loss_interval` batches, then
        # start a new window.
        if idx % loss_interval == loss_interval - 1:
            print("loss: {}".format(running_loss / loss_interval))
            running_loss = 0.0

    # Checkpoint every 10th epoch, and always after the final epoch so the
    # end result is kept even when num_epochs - 1 is not a multiple of 10.
    if epoch % 10 == 0 or epoch == num_epochs - 1:

        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, "model_{}.pth".format(epoch))

# Testing

In [None]:
test_epoch = 50
checkpoint_path = "../input/model-10/model_{}.pth".format(test_epoch)
# map_location lets a checkpoint that was saved on a GPU be loaded in a
# CPU-only session instead of failing on deserialisation.
checkpoint = torch.load(checkpoint_path, map_location=device)

model = get_model(device)
model.load_state_dict(checkpoint['model_state_dict'])

NUM_CLASSES = len(train_df.category.unique())

# Per-class and overall tallies, filled in by the loop below.
class_correct = [0.] * NUM_CLASSES
class_total = [0.] * NUM_CLASSES

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for inputs, labels in tqdm(val_loader):

        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        # Keep the comparison 1-D. Squeezing it would collapse a final batch
        # of size 1 into a 0-d tensor, which cannot be indexed.
        hits = predicted == labels

        total += labels.size(0)
        correct += hits.sum().item()

        # Convert to Python lists once per batch rather than reading one
        # tensor element at a time.
        for label, hit in zip(labels.tolist(), hits.tolist()):
            class_correct[label] += hit
            class_total[label] += 1
    

In [None]:
# Overall accuracy in percent. Guard against an empty validation run.
if total:
    print('Accuracy: {}'.format(100 * correct / total))
else:
    print('Accuracy: n/a (no validation samples were evaluated)')

# Per-class accuracy in percent. A class that never occurred in the
# validation split has no defined accuracy, so report that instead of
# dividing by zero.
for i in range(NUM_CLASSES):
    if class_total[i]:
        print('Accuracy of class {}: {}'.format(i, 100 * class_correct[i] / class_total[i]))
    else:
        print('Accuracy of class {}: n/a (no validation samples)'.format(i))

# Get prediction

In [None]:
# Display the first entry of the test dataset's `samples` list; the
# submission cell reads the image file path from the first element of these
# entries.
test_loader.dataset.samples[0]

In [None]:
# No space after the comma: with ", " the header column becomes " category"
# and every value gains a leading space.
row_format = "{},{}\n"
header = row_format.format("filename", "category")

# test_loader does not shuffle, so the n-th prediction belongs to the n-th
# entry of the dataset's `samples` list. A running offset keeps that pairing
# correct for any batch size, not just batch_size=1.
samples = test_loader.dataset.samples
sample_idx = 0

# NOTE(review): predictions are written as plain integer class indices -
# confirm against the task's submission example whether another label format
# (e.g. zero-padded) is expected.
# The validation tallies (correct, total, class_correct, class_total) are
# deliberately not reset here; this cell does not use them.
model.eval()
with open("submission.csv", "w") as f_csv:

    f_csv.write(header)

    with torch.no_grad():
        for inputs, _ in tqdm(test_loader):

            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            for prediction in predicted.tolist():
                filename, _ = samples[sample_idx]
                row = row_format.format(os.path.basename(filename), prediction)
                f_csv.write(row)
                sample_idx += 1