<a href="https://colab.research.google.com/github/rickiepark/MLQandAI/blob/main/supplementary/q10-random-sources/data-sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 샘플링과 셔플링

### 랜덤 시드 없이 데이터 분할하기

In [1]:
import numpy as np

# Toy dataset: nine float feature values (1.0 through 9.0), each paired with
# one binary class label at the same position.
x_toydata = np.arange(1, 10, dtype=np.float64)
y_labels = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0])

In [2]:
from sklearn.model_selection import train_test_split

# Stratified, shuffled split with test_size=0.3. No random_state is given,
# so the train/test membership is not reproducible between executions.
split_parts = train_test_split(
    x_toydata,
    y_labels,
    stratify=y_labels,
    shuffle=True,
    test_size=0.3,
)
X_train, X_test, y_train, y_test = split_parts

# Show only the feature values of each partition.
for split_name, split_values in (("X_train", X_train), ("X_test", X_test)):
    print(split_name, split_values)

X_train [3. 2. 5. 9. 4. 7.]
X_test [1. 6. 8.]


In [3]:
# Identical call to the previous cell. Because no seed is fixed, the printed
# partitions differ from the ones shown above.
unseeded_split = train_test_split(
    x_toydata,
    y_labels,
    test_size=0.3,
    stratify=y_labels,
    shuffle=True,
)
X_train, X_test, y_train, y_test = unseeded_split

for split_name, split_values in (("X_train", X_train), ("X_test", X_test)):
    print(split_name, split_values)

X_train [2. 6. 3. 5. 7. 8.]
X_test [9. 1. 4.]


### 랜덤 시드를 사용해 데이터 분할하기

In [4]:
# Same stratified split as before, but random_state=123 fixes the shuffle.
seeded_split = train_test_split(
    x_toydata,
    y_labels,
    random_state=123,
    stratify=y_labels,
    shuffle=True,
    test_size=0.3,
)
X_train, X_test, y_train, y_test = seeded_split

for split_name, split_values in (("X_train", X_train), ("X_test", X_test)):
    print(split_name, split_values)

X_train [9. 2. 8. 3. 5. 7.]
X_test [4. 1. 6.]


In [5]:
# Repeat the seeded split: with the same random_state=123 the printed
# partitions match the previous cell exactly.
repeated_split = train_test_split(
    x_toydata,
    y_labels,
    test_size=0.3,
    shuffle=True,
    stratify=y_labels,
    random_state=123,
)
X_train, X_test, y_train, y_test = repeated_split

for split_name, split_values in (("X_train", X_train), ("X_test", X_test)):
    print(split_name, split_values)

X_train [9. 2. 8. 3. 5. 7.]
X_test [4. 1. 6.]


### 랜덤 시드 없이 K-폴드 분할하기

In [6]:
from sklearn.model_selection import StratifiedKFold

# Three shuffled, stratified folds. No random_state is set, so the fold
# assignment is not fixed from one split() call to the next.
cv = StratifiedKFold(shuffle=True, n_splits=3)

fold_indices = cv.split(x_toydata, y_labels)
for train_idx, valid_idx in fold_indices:
    # Print the feature values that land in the training part of this fold.
    train_features = x_toydata[train_idx]
    print("특성 값", train_features)

특성 값 [2. 3. 4. 7. 8. 9.]
특성 값 [1. 2. 3. 5. 6. 8.]
특성 값 [1. 4. 5. 6. 7. 9.]


In [7]:
# Iterate the same unseeded splitter (`cv`) a second time; the training
# folds printed here differ from the previous pass.
for fold in cv.split(X=x_toydata, y=y_labels):
    train_idx, valid_idx = fold
    print("특성 값", x_toydata[train_idx])

특성 값 [2. 3. 4. 5. 6. 9.]
특성 값 [1. 2. 5. 6. 7. 8.]
특성 값 [1. 3. 4. 7. 8. 9.]


### 랜덤 시드를 사용해 K-폴드 분할하기

In [8]:
# Shuffled stratified 3-fold splitter with a fixed random_state, so the
# fold assignment is reproducible.
cv = StratifiedKFold(shuffle=True, n_splits=3, random_state=123)

seeded_folds = cv.split(x_toydata, y_labels)
for train_idx, valid_idx in seeded_folds:
    train_features = x_toydata[train_idx]
    print("특성 값", train_features)

특성 값 [3. 4. 5. 6. 8. 9.]
특성 값 [1. 2. 4. 5. 6. 7.]
특성 값 [1. 2. 3. 7. 8. 9.]


In [9]:
# Rebuild the splitter with the same random_state=123; the printed training
# folds are identical to those of the previous cell.
cv = StratifiedKFold(random_state=123, n_splits=3, shuffle=True)

for fold in cv.split(X=x_toydata, y=y_labels):
    train_idx, valid_idx = fold
    print("특성 값", x_toydata[train_idx])

특성 값 [3. 4. 5. 6. 8. 9.]
특성 값 [1. 2. 4. 5. 6. 7.]
특성 값 [1. 2. 3. 7. 8. 9.]


### 랜덤 시드 없이 데이터셋 로드하기

In [10]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


# MNIST training split stored under ./data (downloaded when missing), with
# every image passed through ToTensor().
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Shuffled mini-batches of 8. No seed is set, so the batch order is not fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Exhaust the loader so `labels` holds the targets of the final batch.
for batch in train_loader:
    inputs, labels = batch
print(labels)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 10639013.44it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 346639.77it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 3201749.69it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 5121109.88it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

tensor([1, 1, 7, 0, 8, 1, 3, 8])


In [11]:
# Fresh shuffled loader over the same dataset, still without a seed: the
# final batch printed here differs from the previous cell's.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Run through every batch; only the last one's labels are printed.
for batch in train_loader:
    inputs, labels = batch
print(labels)

tensor([8, 0, 6, 5, 6, 2, 0, 1])


### 랜덤 시드를 사용해 데이터셋 로드하기

In [12]:
import torch

# Seed torch's global RNG right before building and iterating the loader
# so the shuffle order, and therefore the final batch, is repeatable.
torch.manual_seed(123)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Exhaust the loader; `labels` ends up holding the last batch's targets.
for batch in train_loader:
    inputs, labels = batch
print(labels)

tensor([1, 8, 8, 7, 2, 5, 4, 1])


In [13]:
# Re-seed with the same value and repeat: the final batch printed below
# matches the previous cell's output.
torch.manual_seed(123)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

for batch in train_loader:
    inputs, labels = batch
print(labels)

tensor([1, 8, 8, 7, 2, 5, 4, 1])
