In [None]:
# import zipfile
# import glob

# zip_file = glob.glob('../input/dogs-vs-cats/*.zip')  
# # print(zip_file)

# def extract_zip(file):
#     with zipfile.ZipFile(file,"r") as zip_ref:
#         zip_ref.extractall("temp")
        
# for files in zip_file:
#     extract_zip(files)

In [None]:
!pip install efficientnet_pytorch

In [None]:
import os
import re
import numpy as np
from torch.utils.data import Dataset
from PIL import Image
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from efficientnet_pytorch import EfficientNet

import sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [None]:
class CatDog(Dataset):
    """Image dataset over a flat directory of cat/dog picture files.

    File names are ordered by the first run of digits they contain (numeric,
    not lexicographic, so ``cat.2.jpg`` comes before ``cat.10.jpg``).

    Each item is an ``(image, label)`` pair where ``label`` is derived from the
    file name: 1 if it contains ``"dog"``, 0 if it contains ``"cat"``, and -1
    otherwise (e.g. unlabeled test files).

    Args:
        root: directory whose entries are the image files.
        transform: optional callable invoked as ``transform(image=array)`` and
            expected to return a mapping with an ``"image"`` key.

    Raises:
        IndexError: at construction time if a directory entry has no digits
            in its name (there is nothing to sort it by).
    """

    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        # Sort on the first integer found in each name so item order follows
        # the numeric id embedded in the file name.
        self.images = sorted(
            os.listdir(root),
            key=lambda name: int(re.findall(r"\d+", name)[0]),
        )

    def __len__(self):
        """Return the number of files found in ``root``."""
        return len(self.images)

    def __getitem__(self, index):
        """Load the ``index``-th image and return ``(image, label)``."""
        file = self.images[index]

        # Image.open is lazy and keeps the underlying file open; the context
        # manager releases the handle as soon as the pixels have been copied.
        with Image.open(os.path.join(self.root, file)) as handle:
            # Force three channels: grayscale / RGBA / palette files would
            # otherwise yield arrays that do not match a 3-channel Normalize.
            img = np.array(handle.convert("RGB"))

        if self.transform is not None:
            img = self.transform(image=img)["image"]

        if "dog" in file:
            label = 1
        elif "cat" in file:
            label = 0
        else:
            # No class encoded in the file name.
            label = -1

        return img, label

# Config 

In [None]:
# --- Runtime / data-loading settings ---------------------------------------
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
NUM_WORKERS = 4
BATCH_SIZE = 5
PIN_MEMORY = True

# --- Checkpoint / optimisation settings ------------------------------------
# NOTE(review): none of these are referenced by the cells visible in this
# notebook section; presumably kept for a fine-tuning variant — confirm.
LOAD_MODEL = True
SAVE_MODEL = True
CHECKPOINT_FILE = "b7.pth.tar"
WEIGHT_DECAY = 1e-4
LEARNING_RATE = 1e-4
NUM_EPOCHS = 1

# --- Augmentation -----------------------------------------------------------
# Deterministic preprocessing only: resize to 448x448, normalise each channel
# (the mean/std values are the usual ImageNet statistics), then convert the
# array to a torch tensor.
basic_transform = A.Compose([
    A.Resize(height=448, width=448),
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
    ),
    ToTensorV2(),
])

Đây là một kĩ thuật cơ bản khác trong Transfer Learning: **Feature Extraction**. Mình sẽ extract các feature từ pretrained model (ở đây là EfficientNet-B7), sau đó cho các feature qua một classifier bất kì.
Cách này chắc chắn sẽ nhanh hơn Fine Tuning vì không phải train thêm mạng CNN. Tuy nhiên, trong nhiều trường hợp kết quả sẽ không bằng được Fine Tuning.
Mọi người cố gắng hiểu ý tưởng của kĩ thuật này để phân biệt với Fine Tuning.

In [None]:
def save_feature_vectors(model, loader, output_size=(1, 1), file="trainb7"):
    """Run ``model`` over ``loader`` and save pooled features and labels to disk.

    For every batch, the output of ``model.extract_features`` is adaptively
    average-pooled to ``output_size`` and flattened to one row per sample.
    All rows are stacked and written to ``X_{file}.npy``; the labels yielded by
    the loader are stacked and written to ``y_{file}.npy``. Both files are
    created in the current working directory.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, even if an error occurs.

    Args:
        model: module exposing ``extract_features(batch)``.
        loader: iterable yielding ``(images, labels)`` batches.
        output_size: target spatial size passed to ``F.adaptive_avg_pool2d``.
        file: suffix used in the two output file names.

    Raises:
        ValueError: if ``loader`` yields no batches (nothing to save).
    """
    # Remember the caller's mode so it is restored rather than overwritten.
    was_training = model.training
    model.eval()
    feature_batches, label_batches = [], []

    try:
        # Inference only: one no_grad scope around the whole pass.
        with torch.no_grad():
            for x, y in tqdm(loader):
                x = x.to(DEVICE)
                features = model.extract_features(x)
                features = F.adaptive_avg_pool2d(features, output_size=output_size)
                # Flatten everything after the batch dimension into one row per sample.
                feature_batches.append(
                    features.reshape(x.shape[0], -1).detach().cpu().numpy()
                )
                label_batches.append(y.numpy())

        if not feature_batches:
            raise ValueError("loader produced no batches; no features to save")

        np.save(f"X_{file}.npy", np.concatenate(feature_batches, axis=0))
        np.save(f"y_{file}.npy", np.concatenate(label_batches, axis=0))
    finally:
        model.train(was_training)

In [None]:
# Pretrained EfficientNet-B7 used as a fixed feature extractor.
model = EfficientNet.from_pretrained("efficientnet-b7")
# NOTE(review): save_feature_vectors only calls model.extract_features, so this
# replacement head does not appear to influence the saved features — confirm
# before removing.
model._fc = nn.Linear(2560, 1)

train_dataset = CatDog(
    root="../input/cats-and-dogs-embedded-data/train/train",
    transform=basic_transform,
)
test_dataset = CatDog(
    root="../input/cats-and-dogs-embedded-data/test/test",
    transform=basic_transform,
)

# No training happens here, so shuffling buys nothing; keeping the dataset's
# own order makes the saved feature/label files identical from run to run.
train_loader = DataLoader(
    train_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
)
# The test loader must stay unshuffled so that row i of the saved features
# corresponds to the i-th file in the dataset's sorted order.
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
)

model = model.to(DEVICE)

# Writes X_train_b7.npy / y_train_b7.npy and X_test_b7.npy / y_test_b7.npy
# into the current working directory.
save_feature_vectors(model, train_loader, output_size=(1, 1), file="train_b7")
save_feature_vectors(model, test_loader, output_size=(1, 1), file="test_b7")

Sau khi đã có các vector feature thì đưa vào classifier. Có thể thử với Logistic Regression, SVC, hoặc các mô hình tree-based (ví dụ: Random Forest).

In [None]:
# Load the pre-computed EfficientNet-B7 feature vectors and labels.
# NOTE(review): these are read from ../input/data-features, while the extraction
# step earlier in the notebook writes its .npy files to the working directory —
# confirm that dataset is attached and matches the extraction output.
X = np.load('../input/data-features/X_train_b7.npy')
y = np.load('../input/data-features/y_train_b7.npy')
assert X.shape[0] == y.shape[0], "feature rows and labels are misaligned"

# Split data and train classifier
print(f"Training data shape: {X.shape}, labels shape: {y.shape}")
# Only 0.1% of the rows are held out, so the validation metrics are a quick
# sanity check rather than a reliable estimate; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.001, random_state=1337)
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)

In [None]:
# Validation
# Column 1 of predict_proba is the probability of clf.classes_[1]
# (assumes the training labels are 0/1 so this is the positive class — confirm).
val_preds = clf.predict_proba(X_val)[:, 1]
print("On validation set:")
print(f"Accuracy: {clf.score(X_val, y_val)}")
print(f"LOG LOSS: {log_loss(y_val, val_preds)} ")
print("%--------------------------------------------------%")

# Predict
print("Getting predictions for test set")
X_test = np.load('../input/data-features/X_test_b7.npy')
X_test_preds = clf.predict_proba(X_test)[:, 1]
df = pd.DataFrame({
    # Ids run 1..N where N is the number of test rows actually loaded, instead
    # of a hard-coded 12500 that fails if the feature file has another length.
    # NOTE(review): assumes row i of X_test belongs to test image i + 1 — confirm
    # against how the feature file was produced.
    'id': np.arange(1, len(X_test_preds) + 1),
    # Keep probabilities away from exactly 0 and 1 so a confident mistake cannot
    # produce an extreme log-loss penalty.
    'label': np.clip(X_test_preds, 0.005, 0.995),
})
df.to_csv("mysubmission.csv", index=False)
print("Done getting predictions!")