# DataLoader

- PyTorch의 DataLoader, Dataset 활용하기
- Iris dataset 활용

Kaggle Iris dataset (아래 코드에서는 scikit-learn의 `load_iris`로 Iris 데이터를 불러옵니다)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

iris = load_iris()

# Build the feature table with named columns in one step, then append the
# integer class label as an extra 'class' column.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['class'] = iris.target

In [2]:
# Preview the first rows of the DataFrame (feature columns plus 'class').
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [4]:
## Prepare Dataset
# Convert the whole DataFrame (features and the class column) into a single
# float32 tensor.
data = torch.from_numpy(df.values).to(dtype=torch.float32)
# data.shape == torch.Size([150, 5])

In [6]:
# Separate the feature columns from the label column to build the x / y tensors.
x = data[:, :4]    # first four columns
y = data[:, [-1]]  # last column; list indexing keeps it 2-D
ratio = [.8, .2]

train_cnt = int(data.size(0) * ratio[0])
valid_cnt = data.size(0) - train_cnt
print(train_cnt, valid_cnt) # 120, 30

# Shuffle the row order before splitting. The previous index tensor was built
# from range(...), i.e. the identity permutation, so index_select left the rows
# untouched and the validation split was simply the last `valid_cnt` rows.
# NOTE(review): load_iris is documented as returning samples grouped by class,
# so that tail slice would presumably hold a single class only.
# Call torch.manual_seed(...) beforehand if a reproducible split is required.
indices = torch.randperm(data.size(0))
# Apply the same permutation to x and y so rows stay aligned, then cut each
# into a (train, valid) tuple.
x = torch.index_select(x, dim=0, index=indices).split([train_cnt, valid_cnt], dim=0)
y = torch.index_select(y, dim=0, index=indices).split([train_cnt, valid_cnt], dim=0)

120 30


In [14]:
# Inspect the dimensions of the full data tensor.
data.shape

torch.Size([150, 5])

In [7]:
from torch.utils.data import Dataset, DataLoader

# Subclass torch.utils.data.Dataset
class IrisDataset(Dataset):
    """Map-style dataset that pairs each sample with its label.

    Args:
        data: Sized, indexable collection of samples.
        labels: Sized, indexable collection of labels, one per sample.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        super().__init__()
        # Fail fast on misaligned inputs instead of hitting an IndexError (or
        # silently ignoring extra labels) in the middle of iteration.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

# DataLoader
# x and y are (train, valid) tuples: index 0 feeds the shuffled training
# loader, index 1 feeds the validation loader, which keeps its order.
train_loader = DataLoader(
    IrisDataset(x[0], y[0]),
    batch_size=64,
    shuffle=True,
)
valid_loader = DataLoader(
    IrisDataset(x[1], y[1]),
    batch_size=64,
    shuffle=False,
)

## More Information

- 사용자 정의 PyTorch DataLoader 참고 [링크](https://tutorials.pytorch.kr/recipes/recipes/custom_dataset_transforms_loader.html)

In [None]:
class MyDataset(Dataset):
    """Dataset over paired inputs and labels with an optional per-sample transform.

    Args:
        x_data: Indexable collection of inputs (the original author's note
            says a NumPy array is passed in).
        y_data: Sized, indexable collection of labels (likewise noted as a
            NumPy array); its length defines the dataset length.
        transform: Optional callable that receives the ``(x, y)`` tuple and
            returns the transformed sample. ``None`` disables transformation.
    """

    def __init__(self, x_data, y_data, transform=None):
        super().__init__()
        self.x_data = x_data
        self.y_data = y_data
        self.transform = transform
        self.len = len(y_data)

    def __getitem__(self, index):
        """Return the (optionally transformed) sample at ``index``."""
        sample = self.x_data[index], self.y_data[index]

        # Compare against None rather than relying on truthiness, so a callable
        # that happens to be falsy (e.g. an empty container-style transform) is
        # still applied.
        if self.transform is not None:
            sample = self.transform(sample)

        return sample

    def __len__(self):
        """Return the number of samples (length of ``y_data`` at construction)."""
        return self.len

In [None]:
# To plug into the dataset template above, each transform is written as a
# callable class: any preprocessing you need can be programmed in __call__.

# 1. Tensor conversion
class ToTensor:
    """Convert an ``(inputs, labels)`` sample into tensors.

    ``inputs`` is turned into a float32 tensor and its last axis is moved to
    the front via ``permute(2, 0, 1)``, so it must be 3-dimensional
    (presumably H x W x C image data becoming C x H x W — confirm with caller).
    ``labels`` is turned into an int64 tensor holding the same values.
    """

    def __call__(self, sample):
        inputs, labels = sample
        inputs = torch.FloatTensor(inputs)  # convert to a float tensor
        inputs = inputs.permute(2, 0, 1)  # reorder axes: last axis first
        # torch.LongTensor(n) with a bare integer is interpreted as a size and
        # returns n uninitialised elements, so scalar labels were corrupted.
        # as_tensor converts the value itself and handles scalars and arrays.
        return inputs, torch.as_tensor(labels, dtype=torch.long)

# 2. Linear map
class LinearTensor:
    """Apply ``slope * inputs + bias`` to the inputs of a sample.

    Args:
        slope: Multiplicative factor (default 1).
        bias: Additive offset (default 0).
    """

    def __init__(self, slope=1, bias=0):
        self.slope, self.bias = slope, bias

    def __call__(self, sample):
        """Return ``(slope * inputs + bias, labels)``; labels pass through unchanged."""
        features, target = sample
        scaled = self.slope * features
        return scaled + self.bias, target

# 원하는 전처리를 추가로 정의할 수 있다

In [None]:
# Chain the preprocessing steps: tensor conversion first, then the linear
# map 2x + 5.
# NOTE(review): `tr`, `train_images` and `train_labels` are not defined in the
# visible cells — presumably `tr` is torchvision.transforms; confirm they are
# set up before this cell runs.
trans = tr.Compose([ToTensor(), LinearTensor(slope=2, bias=5)])
dataset1 = MyDataset(x_data=train_images, y_data=train_labels, transform=trans)
train_loader1 = DataLoader(dataset=dataset1, batch_size=10, shuffle=True)