In [45]:
import imageio
import torch
import csv
from pathlib import Path
import numpy as np

## Example 1 - Images

In [24]:
img_arr = imageio.imread('../data/p1ch4/image-dog/bobby.jpg')
img_arr.shape

(720, 1280, 3)

PyTorch requires: Color x Height x Weight

In [4]:
img = torch.from_numpy(img_arr)
out = img.permute(2,0,1)

In [5]:
batch_size = 3
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

Tensor: Batch x Color x Height x Weight

In [6]:
data_dir = Path('../data/p1ch4/image-cats/')
filenames = list(data_dir.glob('*.png'))

In [7]:
for idx, filename in enumerate(filenames):
    img_arr = imageio.imread(filename)
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2,0,1)
    img_t = img_t[:3]
    batch[idx] = img_t

In [8]:
# Normalize the data

Normalize between 0,1

In [9]:
batch = batch.float()
batch /= 255.0

In [10]:
batch

tensor([[[[0.7922, 0.7569, 0.7451,  ..., 0.0510, 0.0510, 0.0471],
          [0.7804, 0.7529, 0.7412,  ..., 0.0549, 0.0549, 0.0549],
          [0.7765, 0.7569, 0.7373,  ..., 0.0471, 0.0471, 0.0471],
          ...,
          [0.3647, 0.3216, 0.2980,  ..., 0.1412, 0.1412, 0.1412],
          [0.2941, 0.2667, 0.3961,  ..., 0.1412, 0.1412, 0.1451],
          [0.3333, 0.4039, 0.3529,  ..., 0.1412, 0.1451, 0.1490]],

         [[0.5922, 0.5451, 0.5216,  ..., 0.0353, 0.0353, 0.0314],
          [0.5922, 0.5490, 0.5255,  ..., 0.0431, 0.0431, 0.0431],
          [0.5961, 0.5608, 0.5255,  ..., 0.0431, 0.0431, 0.0431],
          ...,
          [0.2235, 0.1765, 0.1529,  ..., 0.1020, 0.1020, 0.1020],
          [0.1294, 0.1020, 0.2314,  ..., 0.1020, 0.1020, 0.1059],
          [0.1569, 0.2275, 0.1765,  ..., 0.1020, 0.1059, 0.1098]],

         [[0.2667, 0.2078, 0.1725,  ..., 0.0235, 0.0235, 0.0196],
          [0.2627, 0.2118, 0.1725,  ..., 0.0235, 0.0235, 0.0235],
          [0.2627, 0.2196, 0.1725,  ..., 0

Normalize between -1,1

In [11]:
n_channels = batch.shape[1]
for color in range(n_channels):
    mean = torch.mean(batch[:, color])
    std = torch.std(batch[:, color])
    batch[:, color] = (batch[:, color]-mean)/std

# Example 2: Volumetric Data

In [32]:
dir_path = Path("../data/p1ch4/volumetric-dicom/")
dir_path = [x for x in dir_path.iterdir() if x.is_dir()][0]

In [34]:
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

Reading DICOM (examining files): 1/99 files (1.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 76/99  (76.899/99  (100.0%)


(99, 512, 512)

In [35]:
## No room for a chanel so we need to unsqueeze a dimension (Add a dimension)

In [36]:
vol = torch.from_numpy(vol_arr).float()
vol = torch.unsqueeze(vol, 0)

In [38]:
vol.shape

torch.Size([1, 99, 512, 512])

Current channels = Color x Depth x Height x Weight

# Example 3 - Tabular Data

In [40]:
wine_path = Path("../data/p1ch4/tabular-wine/winequality-white.csv")

In [44]:
wineq_np = np.loadtxt(wine_path, dtype=np.float32, delimiter=";", skiprows=1)

In [46]:
col_list = next(csv.reader(open(wine_path), delimiter=';'))

In [47]:
wineq_np.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [48]:
wineq = torch.from_numpy(wineq_np)

In [49]:
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [51]:
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [56]:
target = wineq[:, -1].long()
target, target.shape

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]))

In [57]:
target_onehot = torch.zeros(target.shape[0], 10)
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

Ending in _ indicates the method will not return a new tensor, but will modify the tensor in place

In [74]:
target.unique()

tensor([3, 4, 5, 6, 7, 8, 9])

In [60]:
target.unsqueeze(1)

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

With unsqueeze we add a singleton dimension. So from a 1-D tensor (4898) to a 2D tensor (4898 x 1)

In [66]:
data_mean = torch.mean(data, dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [65]:
data_var = torch.var(data, dim=0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [68]:
data_normalized = (data - data_mean)/torch.sqrt(data_var)
data_normalized

tensor([[ 1.7209e-01, -8.1764e-02,  2.1325e-01,  ..., -1.2468e+00,
         -3.4914e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7991e-02,  ...,  7.3992e-01,
          1.3467e-03, -8.2418e-01],
        [ 1.4756e+00,  1.7448e-02,  5.4378e-01,  ...,  4.7502e-01,
         -4.3677e-01, -3.3662e-01],
        ...,
        [-4.2042e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3131e+00,
         -2.6152e-01, -9.0544e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0048e+00,
         -9.6250e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7502e-01,
         -1.4882e+00,  1.0448e+00]])

### Find thresholds

In [70]:
bad_indexes = target <=3
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [72]:
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [79]:
bad_data = data[target <=3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target >= 7]

In [80]:
bad_mean = torch.mean(bad_data, dim=0)
mid_mean = torch.mean(mid_data, dim=0)
good_mean = torch.mean(good_data, dim=0)

In [82]:
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print("{:2} {:20} {:6.2f} {:6.2f} {:6.2f}".format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


# Time Series

In [85]:
bikes_np = np.loadtxt("../data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
                      dtype=np.float32,
                      delimiter = ",",
                      skiprows = 1,
                      converters = {1: lambda x: float(x[8:10])}
                      )
bikes = torch.from_numpy(bikes_np)
bikes.shape

torch.Size([17520, 17])

In [86]:
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

Need tensor of N x C x L 

N = Number of observations 

C = Feature Column

L = Hours

In [87]:
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

17520 hours and 17 columns. Reshape so we have 3 axes -> day, hour and our 17 columns:

In [88]:
daily_bikes = bikes.view(-1, 24, bikes.shape[1])

In [89]:
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

Our call to view on a tensor returns a new tensor that changes the number of dimensions and stride info without changing the storage. Use -1 as a placeholder for "however many indexes are left, given the other dimensions and the original number of elements"

The stride is telling us that by advancing along the hour dimension requires us to advance by 17 places in storage. Advancing by the day dimension requires us to advance by a number of elements equal to the length of a row in the storage times 24 (408 = 17 x 24).

Currently aligned  N x L x C. (730, 24, 17). Need to alight it N x C x L. (730, 17, 24).

In [90]:
daily_bikes = daily_bikes.transpose(1,2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [91]:
first_day = bikes[:24].long()

In [92]:
weather_onehot = torch.zeros(first_day.shape[0], 4)

In [93]:
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [95]:
weather_onehot.scatter_(dim=1, 
                        index=first_day[:,9].unsqueeze(1).long() - 1, 
                        value=1.0)

tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 0., 1., 0.],
        [1., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [96]:
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [99]:
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [100]:
daily_weather_onehot.scatter_(1, daily_bikes[:,9,:].long().unsqueeze(1)-1, 1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

# Text

In [101]:
with open('../data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f: 
    text = f.read()

In [102]:
lines = text.split("\n")
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [104]:
letter_t = torch.zeros(len(line), 128)
letter_t.shape

torch.Size([70, 128])

In [105]:
for i , letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) < 128 else 0
    letter_t[i][letter_index] = 1

In [108]:
def clean_words(input_str):
    punc = '.,;:"!?”“_-'
    word_list = input_str.lower().replace('\n',' ').split() 
    word_list = [word.strip(punc) for word in word_list] 
    return word_list

In [109]:
words_in_line = clean_words(line)

In [110]:
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [111]:
word_list = sorted(set(clean_words(text)))
word2index_dict = {word:i for (i, word) in enumerate(word_list)}

len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [112]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i][word_index]=1
    print('{:2} {:4} {}'.format(i, word_index, word))

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him


In [113]:
print(word_t.shape)

torch.Size([11, 7261])
