# Data
> Functions used to create pytorch `DataSet`s and `DataLoader`s.

In [1]:
# default_exp data

In [2]:
# export
import torch
from torch.utils.data import Dataset, DataLoader
from fastai.data_block import DataBunch, DatasetType

from sklearn.model_selection import train_test_split

# import warnings

# torch.Tensor.ndim = property(lambda x: x.dim())
# tt = torch.Tensor

In [3]:
# export
class Data(Dataset):
    """
    Load raw x,y data
    """
    def __init__(self, *args):
        super().__init__()
        self.data = args
        
    def __len__(self):
        return len(self.data[0])
    
    def __getitem__(self, i):
        return {f'arg_{i}': torch.Tensor([x[i]]) 
                for i, x in enumerate(self.data)}

In [4]:
# export
class TestData(Dataset):
    """
    Load raw x,y data
    """
    def __init__(self, x):
        super().__init__()
        self.x = x
        self.x_type = self.__get_type__(x.dtype.name)
        
    def __get_type__(self, type):
        if type.startswith('float'):
            return torch.float32
        if type.startswith('int'):
            return torch.long
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, i):
        return torch.tensor(self.x[i], dtype=torch.float32)

class TrainData(TestData):
    """
    Load raw x,y data
    """
    def __init__(self, x, y):
        super().__init__(x)
        self.y = y
        self.y_type = self.__get_type__(y.dtype.name)
    
    def __getitem__(self, i):
        return torch.tensor(self.x[i], dtype=self.x_type), \
                torch.tensor(self.y[i], dtype=self.y_type)

In [5]:
import numpy as np

x = np.random.randn(100, 3)
y = np.random.randn(100, 1)
train_ds = TrainData(x, y)

x_elem, y_elem = train_ds[99]
print(x_elem, x_elem.dtype)
print(y_elem, y_elem.dtype)

tensor([-1.0572,  0.8800,  1.2648]) torch.float32
tensor([1.3466]) torch.float32


In [6]:
y = np.random.randint(0, 3, (100, 1))
train_ds = TrainData(x, y)

x_elem, y_elem = train_ds[99]
print(x_elem, x_elem.dtype)
print(y_elem, y_elem.dtype)

tensor([-1.0572,  0.8800,  1.2648]) torch.float32
tensor([1]) torch.int64


In [7]:
# export
def create_db(x, y, train_size=0.8, bs=96, random_state=42):
    """
    Take dataframe and convert to Fastai databunch
    """
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=train_size)
    
    train_ds = TrainData(X_train, y_train)
    val_ds = TrainData(X_test, y_test)
    
    bs = min(bs, len(train_ds))
    val_bs = min(bs, len(val_ds))
    
    train_dl = DataLoader(train_ds, bs)
    val_dl = DataLoader(val_ds, val_bs)

    return DataBunch(train_dl, val_dl)

In [8]:
x = np.random.randn(100, 3)
y = np.random.randn(100, 1)
db = create_db(x,y, bs=10)

In [9]:
[x.shape for x in db.one_batch()]

[torch.Size([10, 3]), torch.Size([10, 1])]

In [10]:
# hide
from nbdev.export import *
notebook2script()

Converted Activations.ipynb.
Converted Layers.ipynb.
Converted Model.ipynb.
Converted data.ipynb.
Converted index.ipynb.


In [11]:
!cat keraTorch/data.py

# AUTOGENERATED! DO NOT EDIT! File to edit: data.ipynb (unless otherwise specified).

__all__ = ['Data', 'TestData', 'TrainData', 'create_db']

# Cell
import torch
from torch.utils.data import Dataset, DataLoader
from fastai.data_block import DataBunch, DatasetType

from sklearn.model_selection import train_test_split


# torch.Tensor.ndim = property(lambda x: x.dim())
# tt = torch.Tensor

# Cell
class Data(Dataset):
    """
    Load raw x,y data
    """
    def __init__(self, *args):
        super().__init__()
        self.data = args

    def __len__(self):
        return len(self.data[0])

    def __getitem__(self, i):
        return {f'arg_{i}': torch.Tensor([x[i]])
                for i, x in enumerate(self.data)}

# Cell
class TestData(Dataset):
    """
    Load raw x,y data
    """
    def __init__(self, x):
        super().__init__()
        self.x = x
        self.x_type = self.__get_type__(x.dtype.name)

    def __get_type__(self, type