# Data creation

In [None]:
# Install the `supportlib` helper package from the TestPyPI index and import its
# `gettingdata` module (used below to unpack the label archive).
# NOTE(review): the install is unpinned and comes from the test index; pin a
# version if reproducible runs matter.
!pip install -i https://test.pypi.org/simple/ supportlib
import supportlib.gettingdata as getdata

In [2]:
# Download the training-image archive into the current working directory.
# NOTE(review): the URL carries `Expires=` and `Signature=` query parameters, so it
# is presumably a time-limited signed link that must be regenerated before this
# cell can be re-run; avoid committing such links.
import wget
wget.download('https://storage.googleapis.com/kaggle-competitions-data/kaggle/6322/train-tif-v2.tar.7z?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1554113835&Signature=iOdJjf1VkY5HCC7SxOnWDbHPBrn6qYILdLWrSIJ6fR6kwmNttSrriKrV%2FGY64%2F5irXjqcVA5HQa66ssT8nAPbGrXA9G%2FWJ2QFMSohnlPXeTwA%2Fr5UuuIZJmBH8I92eWFdDQYJwbKt0FgeCDPk0RlnQ8ND7b8x3oMQprdpVRLsYkwW5dYIMydijC%2Be0ouecEmvcHwX3UouQLbShiP4isGR9%2FM3vbscMA7GtbZsuFb8M2S%2BHcQknBIZ2G4VvoPk87uC8U9lyXfSKIzqedZwLp0GZEE5IAjz%2FHgGKi5eoxFcoNc0TxDqdHK0ramkDGKboQxewUs6%2BCtaZT0VvLfCT%2BDPQ%3D%3D')

'train-tif-v2.tar.7z'

In [3]:
# Download the zipped label CSV into the current working directory.
# NOTE(review): the URL carries `Expires=` and `Signature=` query parameters, so it
# is presumably a time-limited signed link that must be regenerated before re-running.
wget.download('https://storage.googleapis.com/kaggle-competitions-data/kaggle/6322/train_v2.csv.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1554113867&Signature=ElUSCwuLGZJgAfyrMgDPS2xe5qSfjLh5D0sdJjWPKhj5O73%2BBF6zpyvt%2FrCGke%2Bk9DaDW2mjutCuO9XTFtPl5NXVbLmj3474BvGXIn%2FquSKhi7H%2BkZwCQVuQxpwC4Ol30ARbPjXw1sm2gUthMfbveEymKjOvNn6SA5k%2BGQ75opXt%2F84gkpqQ0Zb9QnmgDZUSW94HTJA6iz1uCl7a3IkGc1f3N5L5f2b7fh8hQ%2FBuQpPGlwbVoRuQRrKBIaFTan5CSACkFr21TjivF7Fwr6reJbn8MCCevvYm5xTiVM3ky4od7NuWeGcioh38vwgObUpc0FgRauHitHhYF6XsqZgTOQ%3D%3D')

'train_v2.csv.zip'

In [5]:
# Unpack the downloaded label archive with the supportlib helper.
# NOTE(review): presumably this yields ./train_v2.csv, which is read further down;
# confirm against supportlib.gettingdata.zipextract.
getdata.zipextract('./train_v2.csv.zip')

In [None]:
!pip install patool
!pip install pyunpack
!mkdir train

In [0]:
# `Archive` comes from pyunpack (installed in the previous cell); import it here so
# the cell works on a fresh kernel instead of relying on leftover state.
from pyunpack import Archive

# Extract the image archive into ./train. The file name matches what the download
# cell reports ('train-tif-v2.tar.7z'); the previous ' (1)' suffix only exists when
# the archive was downloaded twice into the same directory.
Archive('./train-tif-v2.tar.7z').extractall('./train')

# Parameters

In [42]:
# Side length (pixels) every image is resized to by the transform pipeline.
im_size = 224
# Directory handed to Amazon_dataset as the image folder.
img_dir = './train-tif-v2'
# Batch size shared by every DataLoader in this notebook.
batch_size = 16
# Number of passes made by the training loop (`for i in range(epoch)`).
epoch = 10
# Fractions of the dataset used to size the validation and test index slices.
valid_size = 0.1
test_size = 0.2

# Data loader

In [41]:
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
import os
import cv2
from sklearn.preprocessing import MultiLabelBinarizer
import torchvision.models as models
# Load the label table; its 'tags' column is binarised in the next cell.
data = pd.read_csv('./train_v2.csv')

In [3]:
# Turn each row's whitespace-separated tag string into a list of tag names.
tags = data['tags'].str.split()

# Fit the binarizer on those lists and encode every row as a 0/1 indicator vector
# (one column per distinct tag seen during fitting).
mlb = MultiLabelBinarizer()
mlb.fit(tags)
y_train = mlb.transform(tags)

In [4]:
class Amazon_dataset(Dataset):
    """Dataset pairing image files in a directory with rows of a label matrix.

    Args:
        image_dir: directory that contains the image files.
        y_train: indexable collection of label rows (numpy arrays); row ``idx``
            is returned together with image ``idx``.
        transform: optional callable applied to the array returned by
            ``cv2.imread`` before the sample is returned.
        image_names: optional sequence of file names (relative to ``image_dir``),
            one per row of ``y_train``, fixing which file belongs to which row.
            When omitted, the sorted directory listing is used.

    NOTE(review): without ``image_names`` the image/label pairing relies on the
    sorted directory listing lining up with the row order of ``y_train`` —
    verify this for the data at hand, or pass ``image_names`` explicitly.
    """
    def __init__(self,image_dir,y_train,transform = None,image_names = None):

        self.img_dir = image_dir
        self.y_train = y_train
        self.transform = transform
        if image_names is None:
            # os.listdir returns entries in arbitrary order; sorting makes the
            # idx -> file mapping stable across runs and machines.
            self.id = sorted(os.listdir(self.img_dir))
        else:
            self.id = list(image_names)

    def __len__(self):
        # Use the snapshot taken in __init__ so the length always agrees with
        # the names __getitem__ indexes (and the directory is not re-listed on
        # every call).
        return len(self.id)

    def __getitem__(self,idx):
        img_name = os.path.join(self.img_dir, self.id[idx])
        image = cv2.imread(img_name)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here with the path instead of deep inside the transform.
            raise OSError('could not read image: {}'.format(img_name))
        if self.transform:
            image = self.transform(image)
        label = torch.from_numpy(self.y_train[idx])
        return image,label

In [5]:
# Data transform
transform = transforms.Compose([transforms.ToPILImage(),
                                transforms.Resize((im_size,im_size)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                    ])
#Data laoder
amazon_data = Amazon_dataset(img_dir,y_train,transform)
dataloader = DataLoader(amazon_data, batch_size)

In [45]:
import numpy as np

# Shuffle every dataset index once, then carve the shuffled list into three
# disjoint slices: validation, test, and (the remainder) train.
data_len = len(amazon_data)
indices = list(range(data_len))
np.random.shuffle(indices)

# split1 is where the validation slice ends. split2 is where the test slice ends,
# so it must be offset by split1; using floor(test_size * data_len) on its own
# made the test slice only (test_size - valid_size) of the data, and empty
# whenever valid_size >= test_size.
split1 = int(np.floor(valid_size * data_len))
split2 = split1 + int(np.floor(test_size * data_len))
valid_idx , test_idx, train_idx = indices[:split1], indices[split1:split2] , indices[split2:]

# One sampler/loader pair per subset; all three draw from the same dataset object.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
test_sampler = SubsetRandomSampler(test_idx)
train_loader = DataLoader(amazon_data, batch_size=batch_size , sampler=train_sampler)
valid_loader = DataLoader(amazon_data, batch_size=batch_size , sampler=valid_sampler)
test_loader = DataLoader(amazon_data, batch_size=batch_size , sampler=test_sampler)

## Calculate mean and standard deviation of custom dataset

In [None]:
from tqdm import tqdm

# Per-channel statistics of the dataset: sum each image's channel mean / std over
# all batches, then divide by the number of images.
# NOTE(review): `dataloader` is built on a transform that already applies
# Normalize(0.5, 0.5), so these are statistics of the normalised images; and the
# `std` computed here is the average of per-image stds, not the pooled std.
mean = 0.
std = 0.
nb_samples = len(amazon_data)
# The loop variable is deliberately not called `data`: that name holds the label
# DataFrame, and overwriting it would break a re-run of the tag-binarising cell.
for images, _ in tqdm(dataloader):
    batch_samples = images.size(0)
    # Collapse height and width so every image becomes (channels, pixels).
    images = images.view(batch_samples, images.size(1), -1)
    mean += images.mean(2).sum(0)
    std += images.std(2).sum(0)

mean /= nb_samples
std /= nb_samples

# Model 

In [6]:
class Classifier(nn.Module):
    """Pretrained ResNet-50 followed by a two-layer multi-label head.

    The backbone's own output vector is passed through a hidden linear layer with
    ReLU and a final linear layer with a sigmoid, giving one independent
    probability per label.

    Args:
        num_classes: number of output labels (default 17, the previous
            hard-coded value).
        hidden_dim: width of the hidden layer of the head (default 256, the
            previous hard-coded value).
    """
    def __init__(self, num_classes = 17, hidden_dim = 256):
        super(Classifier, self).__init__()
        self.resnet = models.resnet50(pretrained = True)
        # Kept for reference: input width of the backbone's final fc layer.
        self.num_ftrs = self.resnet.fc.in_features
        # The head consumes the backbone's fc output, so size it from the layer
        # itself rather than from a literal 1000.
        self.l1 = nn.Linear(self.resnet.fc.out_features , hidden_dim)
        self.l2 = nn.Linear(hidden_dim, num_classes)
    def forward(self, input):
        """Return per-label probabilities of shape (batch, num_classes)."""
        x = self.resnet(input)
        x = x.view(x.size(0),-1)
        x = F.relu(self.l1(x))
        # torch.sigmoid replaces the deprecated F.sigmoid (same result).
        x = torch.sigmoid(self.l2(x))
        return x

In [7]:
from torchsummary import summary

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier = Classifier().to(device)
# Summarise with the same input size the transform pipeline produces (im_size),
# instead of a separate hard-coded 224.
summary(classifier,(3,im_size,im_size))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,



In [8]:
# SGD with momentum over every parameter of the classifier (backbone and head).
optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.5)

# Train

In [None]:
# Training loop: one pass over the training subset per epoch, followed by an F1
# score computed from the predictions collected during that pass.
# NOTE: get_pred and get_fscore are defined in the cells below this one; run
# those cells before this loop.
for i in range(epoch):
    classifier.train()
    y_pred = []
    y_true = []
    # Iterate the training subset only; `dataloader` covers the whole dataset and
    # would leak the validation/test samples into training.
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Move to whichever device the model lives on (instead of forcing
        # torch.cuda.FloatTensor, which fails on a CPU-only machine).
        inputs = inputs.to(device=device, dtype=torch.float)
        targets = targets.to(device=device, dtype=torch.float)
        optimizer.zero_grad()
        output = classifier(inputs)
        loss = F.binary_cross_entropy(output, targets)
        loss.backward()
        optimizer.step()
        # Keep detached CPU copies for the epoch-level metric.
        y_pred.append(output.detach().cpu().numpy())
        y_true.append(targets.cpu().numpy())
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(i, batch_idx * len(inputs), len(train_idx),100. * batch_idx / len(train_loader), loss.item()))
    y_pred = get_pred(y_pred)
    f_score = get_fscore(y_true,y_pred)
    # Report the current epoch index, not the configured total.
    print('Train Epoch: {} \tf1_score: {:.6f}'.format(i , f_score))



In [164]:
def get_pred(y_pred, threshold=0.5):
    """Stack per-batch probabilities and binarise them.

    The final batch is dropped before stacking (it may be smaller than the
    others, which would prevent building a regular array); get_fscore drops the
    final batch of targets in the same way, so the two stay aligned.

    Args:
        y_pred: list of per-batch arrays of probabilities, each of shape
            (batch, n_labels).
        threshold: values greater than or equal to this become 1, all others 0.

    Returns:
        Array of shape (n_batches - 1, batch, n_labels) holding only 0s and 1s,
        in the dtype of the stacked input.
    """
    # The previous version read len(y_pred[i]) before `i` was assigned, which
    # raised UnboundLocalError on every call; the intent was to drop the last batch.
    stacked = np.asarray(y_pred[:-1])
    # Vectorised threshold; works for any batch size and label count.
    return (stacked >= threshold).astype(stacked.dtype)

In [165]:
def get_fscore(y_true,y_pred):
    """Weighted F1 score between collected targets and binarised predictions.

    Args:
        y_true: list of per-batch target arrays, each of shape (batch, n_labels).
            The final batch is dropped, mirroring what get_pred does to the
            predictions.
        y_pred: array of 0/1 predictions as returned by get_pred, holding the
            same number of elements as the retained targets.

    Returns:
        The F1 score computed with ``average='weighted'``.

    Raises:
        ValueError: if no targets remain after dropping the final batch.
    """
    # f1_score was never imported anywhere in the notebook; import it locally so
    # the function works on a fresh kernel.
    from sklearn.metrics import f1_score

    y_true = np.asarray(y_true[:-1])
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        raise ValueError('get_fscore needs at least two batches of targets')

    # Flatten (n_batches, batch, n_labels) to (n_samples, n_labels); the label
    # count is read from the data instead of being fixed to 16 x 17.
    n_labels = y_true.shape[-1]
    y_true = y_true.reshape(-1, n_labels).astype(int)
    y_pred = y_pred.reshape(-1, n_labels).astype(int)
    return f1_score(y_true=y_true, y_pred=y_pred, average='weighted')