In [8]:
import numpy as np
import pandas as pd

### Abalone

In [2]:
column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
data = pd.read_csv("./abalone.data", names=column_names)
print("Number of samples: %d" % len(data))

## Combine Classes 1, 2, 3, 21-29
combine_classes = [1, 2, 3, 4] + [i for i in range(16, 30)]
Y = data.rings.values
for i in range(len(Y)):
    if Y[i] in combine_classes:
        Y[i] = 0
    else:
        Y[i] += -4

### Label Sex as I-0, F-1, M-2

data.sex[data.sex == "I"] = 0
data.sex[data.sex == "F"] = 1
data.sex[data.sex == "M"] = 2

data.rings = Y

full_data = data.values.astype(np.float)
np.random.shuffle(full_data)
X = full_data[:, :len(full_data[0])-1]
Y = full_data[:, -1].astype(int)
final_data = {'X': X, 'Y': Y}

# np.save("abalone_data.npy", final_data, allow_pickle=True)
print("DONE!!")

Number of samples: 4177
DONE!!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
len(np.unique(Y))

12

### Glass

In [7]:
data = pd.read_csv("./glass.data", header=None)
print("Number of samples: %d" % len(data))

full_data = data.values.astype(np.float)
np.random.shuffle(full_data)
X = full_data[:, 1:len(full_data[0])-1]
Y = full_data[:, -1].astype(int) - 1
Y[Y>2] = Y[Y>2]-1
final_data = {'X': X, 'Y': Y}

np.save("glass_data.npy", final_data, allow_pickle=True)
print("DONE!!")

Number of samples: 214
DONE!!


In [8]:
np.unique(Y)

array([0, 1, 2, 3, 4, 5])

In [9]:
X.shape

(214, 9)

### Page Blocks

In [10]:
data = pd.read_csv("./page-blocks.data", header=None)
print("Number of samples: %d" % len(data))

full_data = data.values.astype(np.float)
np.random.shuffle(full_data)
X = full_data[:, 1:len(full_data[0])-1]
Y = full_data[:, -1].astype(int) - 1
final_data = {'X': X, 'Y': Y}

# np.save("pageblocks_data.npy", final_data, allow_pickle=True)
print("DONE!!")

Number of samples: 5473
DONE!!


In [11]:
np.unique(Y)

array([0, 1, 2, 3, 4])

In [12]:
X.shape

(5473, 10)

### Adult

In [5]:
data_dict = np.load("adult_data.npy", allow_pickle=True).item()
# X = np.concatenate((data_dict['X_train'], data_dict['X_test']), axis = 0)
# Y = np.concatenate((data_dict['y_train'], data_dict['y_test']), axis = 0)
# final_data = {'X': X, 'Y': Y}

# np.save("adult_new_data.npy", final_data, allow_pickle=True)
# print("DONE!!")

X = data_dict['X']
Y = data_dict['Y']

In [6]:
X.shape

(48842, 123)

### Cifar 10

In [1]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [2]:
dict_batch = unpickle("./cifar-10-batches-py/data_batch_1")

In [4]:
for key in dict_batch:
    print(key)

b'batch_label'
b'labels'
b'data'
b'filenames'


In [5]:
dict_batch[b'data']

array([[ 59,  43,  50, ..., 140,  84,  72],
       [154, 126, 105, ..., 139, 142, 144],
       [255, 253, 253, ...,  83,  83,  84],
       ...,
       [ 71,  60,  74, ...,  68,  69,  68],
       [250, 254, 211, ..., 215, 255, 254],
       [ 62,  61,  60, ..., 130, 130, 131]], dtype=uint8)

In [15]:
data = []
labels = []

for i in range(1, 6):
    dict_batch = unpickle("./cifar-10-batches-py/data_batch_" + str(i))
    data.append(dict_batch[b'data'])
    labels.append(dict_batch[b'labels'])

dict_batch = unpickle("./cifar-10-batches-py/test_batch")
data.append(dict_batch[b'data'])
labels.append(dict_batch[b'labels'])

In [16]:
data_full = np.concatenate(data, axis = 0)
labels_full = np.concatenate(labels, axis = 0)

In [17]:
data_full.shape

(60000, 3072)

In [18]:
labels_full.shape

(60000,)

In [19]:
final_data = {'X': data_full, 'Y': labels_full}

np.save("cifar_data.npy", final_data, allow_pickle=True)
print("DONE!!")

DONE!!


In [21]:
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

In [22]:
mean = [0.4914, 0.4822, 0.4465]
std = [0.2023, 0.1994, 0.2010]

In [24]:
def train_dataloader(self):
    transform_train = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                          transforms.RandomHorizontalFlip(),
                                          transforms.ToTensor(),
                                          transforms.Normalize(self.mean, self.std)])
    dataset = CIFAR10(root=self.hparams.data_dir, train=True, transform=transform_train)
    dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=True, drop_last=True)
    return dataloader