# Data Sampling and Shuffling

### Data Splitting without Random Seed

In [10]:
import numpy as np

# Nine toy feature values, 1.0 through 9.0, stored as float64.
x_toydata = np.arange(1, 10, dtype=np.float64)
# One binary class label per feature value (five 0s and four 1s).
y_labels = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0])

In [11]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split of the toy data. No random_state is passed, so the
# shuffle is not reproducible: re-running this cell can give a different split.
X_train, X_test, y_train, y_test = train_test_split(
    x_toydata,
    y_labels,
    test_size=0.3,
    stratify=y_labels,
    shuffle=True,
)

print("X_train", X_train)
print("X_test", X_test)

X_train [9. 2. 4. 6. 1. 5.]
X_test [7. 3. 8.]


In [12]:
# The same unseeded, stratified split issued a second time. Because no
# random_state is fixed, the resulting partition is free to change between calls.
X_train, X_test, y_train, y_test = train_test_split(
    x_toydata,
    y_labels,
    test_size=0.3,
    stratify=y_labels,
    shuffle=True,
)

print("X_train", X_train)
print("X_test", X_test)

X_train [5. 4. 6. 3. 1. 2.]
X_test [7. 8. 9.]


### Data Splitting with Random Seed

In [13]:
# Stratified 70/30 split with a fixed random_state, which pins the shuffle so
# that the same call always returns the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    x_toydata,
    y_labels,
    random_state=123,
    test_size=0.3,
    stratify=y_labels,
    shuffle=True,
)

print("X_train", X_train)
print("X_test", X_test)

X_train [9. 2. 8. 3. 5. 7.]
X_test [4. 1. 6.]


In [14]:
# Identical seeded call repeated: with random_state=123 the split should match
# the one produced by the previous seeded call.
X_train, X_test, y_train, y_test = train_test_split(
    x_toydata,
    y_labels,
    random_state=123,
    test_size=0.3,
    stratify=y_labels,
    shuffle=True,
)

print("X_train", X_train)
print("X_test", X_test)

X_train [9. 2. 8. 3. 5. 7.]
X_test [4. 1. 6.]


### K-fold without Random Seed

In [15]:
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified splitter that shuffles before splitting; no random_state is
# given, so the fold assignment is not pinned.
cv = StratifiedKFold(shuffle=True, n_splits=3)

# Show only the training portion of each fold (valid_idx is the held-out part).
for train_idx, valid_idx in cv.split(X=x_toydata, y=y_labels):
    print("Feature values", x_toydata[train_idx])

Feature values [2. 3. 4. 5. 6. 7.]
Feature values [1. 2. 4. 7. 8. 9.]
Feature values [1. 3. 5. 6. 8. 9.]


In [16]:
# Iterate the existing unseeded `cv` splitter a second time; without a fixed
# random_state the folds it yields may differ from the previous pass.
for train_idx, valid_idx in cv.split(X=x_toydata, y=y_labels):
    print("Feature values", x_toydata[train_idx])

Feature values [2. 3. 4. 7. 8. 9.]
Feature values [1. 2. 4. 5. 6. 9.]
Feature values [1. 3. 5. 6. 7. 8.]


### K-fold with Random Seed

In [17]:
# 3-fold stratified splitter with shuffling pinned by random_state=123, so the
# fold assignment is reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# Show only the training portion of each fold (valid_idx is the held-out part).
for train_idx, valid_idx in cv.split(X=x_toydata, y=y_labels):
    print("Feature values", x_toydata[train_idx])

Feature values [3. 4. 5. 6. 8. 9.]
Feature values [1. 2. 4. 5. 6. 7.]
Feature values [1. 2. 3. 7. 8. 9.]


In [18]:
# Build a fresh splitter with the same seed; the folds should be identical to
# those of any other splitter created with random_state=123.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# Show only the training portion of each fold (valid_idx is the held-out part).
for train_idx, valid_idx in cv.split(X=x_toydata, y=y_labels):
    print("Feature values", x_toydata[train_idx])

Feature values [3. 4. 5. 6. 8. 9.]
Feature values [1. 2. 4. 5. 6. 7.]
Feature values [1. 2. 3. 7. 8. 9.]


## Dataset Loading without Random Seed

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


# MNIST training split, downloaded into ./data when not already present; each
# image is converted to a tensor on access.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Shuffled mini-batches of 8 with no seed set beforehand.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Run through every batch; once the loop finishes, `labels` holds the labels of
# the final batch, which is what gets printed.
for inputs, labels in train_loader:
    pass
print(labels)

tensor([7, 6, 2, 5, 3, 8, 8, 6])


In [2]:
# Rebuild the shuffled loader, again without seeding, and repeat the full pass.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# After the loop, `labels` holds the labels of the final batch.
for inputs, labels in train_loader:
    pass
print(labels)

tensor([4, 6, 5, 2, 5, 2, 9, 3])


## Dataset Loading with Random Seed

In [3]:
import torch

# Seed PyTorch's global RNG before creating and iterating the shuffled loader
# so that the batch order is repeatable.
torch.manual_seed(123)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# After the loop, `labels` holds the labels of the final batch.
for inputs, labels in train_loader:
    pass
print(labels)

tensor([1, 8, 8, 7, 2, 5, 4, 1])


In [4]:
# Reset the global seed to the same value and rebuild the loader; the final
# batch should then match any other pass started from seed 123.
torch.manual_seed(123)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# After the loop, `labels` holds the labels of the final batch.
for inputs, labels in train_loader:
    pass
print(labels)

tensor([1, 8, 8, 7, 2, 5, 4, 1])
