# Data
> Classes and functions used to create PyTorch `Dataset`s and `DataLoader`s.

In [1]:
# default_exp data

In [2]:
# export
import torch
from torch.utils.data import Dataset, DataLoader
from fastai.data_block import DataBunch, DatasetType

from sklearn.model_selection import train_test_split

# import warnings

# torch.Tensor.ndim = property(lambda x: x.dim())
# tt = torch.Tensor

In [3]:
# export
class Data(Dataset):
    """
    Dataset over an arbitrary number of parallel, indexable sequences.

    Each positional argument is one sequence (for example inputs, targets).
    The dataset length is taken from the first sequence; the remaining
    sequences are assumed to be at least that long (this is not checked).
    """
    def __init__(self, *args):
        super().__init__()
        # Tuple of the sequences, in the order they were passed in.
        self.data = args

    def __len__(self):
        # The first sequence defines the number of samples.
        return len(self.data[0])

    def __getitem__(self, i):
        """
        Return sample `i` as a dict with one entry per stored sequence.

        The key for the k-th sequence is `'arg_<k>'`; the value is a
        `torch.Tensor` built from a one-element list holding that
        sequence's `i`-th element.
        """
        # The sequence position needs its own name: reusing `i` as the
        # comprehension variable would shadow the requested sample index and
        # make every lookup return `seq[position]` instead of `seq[i]`.
        return {f'arg_{pos}': torch.Tensor([seq[i]])
                for pos, seq in enumerate(self.data)}

In [4]:
# export
class TrainData(Dataset):
    """
    Paired dataset: sample `i` is the tuple `(x[i], y[i])`, with each
    element converted through `torch.tensor`.
    """
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        # Length follows `x`; `y` is not checked against it.
        return len(self.x)

    def __getitem__(self, i):
        inputs = torch.tensor(self.x[i])
        target = torch.tensor(self.y[i])
        return inputs, target
    

class TestData(Dataset):
    """
    Inputs-only dataset (no targets): sample `i` is `x[i]` converted
    through `torch.tensor`.
    """
    def __init__(self, x):
        super().__init__()
        self.x = x

    def __len__(self):
        # One sample per element of `x`.
        return len(self.x)

    def __getitem__(self, i):
        row = self.x[i]
        return torch.tensor(row)

In [7]:
import numpy as np

# Quick check of TrainData on 100 random samples (3 input columns, 1 target column).
x = np.random.randn(100, 3)
y = np.random.randn(100, 1)
train_ds = TrainData(x, y)

# 99 is the last valid index; the result is an (input, target) tuple of tensors.
train_ds[99]

(tensor([-0.9254, -2.5888, -0.7599], dtype=torch.float64),
 tensor([-0.0141], dtype=torch.float64))

In [8]:
# export
def create_db(x, y, train_size=0.8, bs=96, random_state=42):
    """
    Split `x`/`y` into training and validation sets and wrap them in a
    fastai `DataBunch`.

    Parameters
    ----------
    x, y : indexable sequences of equal length (inputs and targets),
        forwarded to `train_test_split` and then to `TrainData`.
    train_size : forwarded to `train_test_split`; the remaining samples
        form the validation set.
    bs : requested batch size. It is capped at the size of the training
        set, and the validation batch size is further capped at the size
        of the validation set.
    random_state : seed forwarded to `train_test_split` so that the split
        is reproducible. Pass `None` for a different split on every call.

    Returns
    -------
    DataBunch built from the training and validation `DataLoader`s.
    """
    # Forward the seed: without it the split changes on every call and the
    # `random_state` argument has no effect.
    X_train, X_valid, y_train, y_valid = train_test_split(
        x, y, train_size=train_size, random_state=random_state)

    train_ds = TrainData(X_train, y_train)
    val_ds = TrainData(X_valid, y_valid)

    # Never request a batch larger than the dataset it is drawn from.
    bs = min(bs, len(train_ds))
    val_bs = min(bs, len(val_ds))

    # Only the dataset and batch size are passed; all other DataLoader
    # options are left at their defaults.
    train_dl = DataLoader(train_ds, bs)
    val_dl = DataLoader(val_ds, val_bs)

    return DataBunch(train_dl, val_dl)

In [9]:
# Build a DataBunch from 100 random samples, requesting a batch size of 10
# (train_size is left at create_db's default).
x = np.random.randn(100, 3)
y = np.random.randn(100, 1)
db = create_db(x,y, bs=10)

In [10]:
# Shapes of the tensors in a single batch: one entry for inputs, one for targets.
[x.shape for x in db.one_batch()]

[torch.Size([10, 3]), torch.Size([10, 1])]

In [11]:
# hide
# Import only the name that is used instead of a wildcard, so it is clear
# where `notebook2script` comes from and the namespace stays clean.
from nbdev.export import notebook2script
# Regenerate the library modules from the project's notebooks.
notebook2script()

Converted 00_core.ipynb.
Converted data.ipynb.
Converted index.ipynb.
