In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.set_printoptions(edgeitems=2)
torch.manual_seed(123)

<torch._C.Generator at 0x7fdd22e71c90>

## Image and 3D volumetric data

In [2]:
import imageio

In [3]:
img_arr = imageio.imread("dlwpt-code/data/p1ch4/image-dog/bobby.jpg")
img_arr.shape

(720, 1280, 3)

In [4]:
## converting the img_arr to tensor
img = torch.from_numpy(img_arr)
##pytorch requires images to be in (channel, height, width) 
##but ours is (height, width, channel) so we shuffle
out = img.permute(2, 0, 1)

In [5]:
batch_size = 3
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

In [6]:
import os
data_dir = 'dlwpt-code/data/p1ch4/image-cats/'
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the slot each image occupies inside `batch` reproducible across runs and machines.
filenames = sorted(name for name in os.listdir(data_dir)
                   if os.path.splitext(name)[-1] == '.png')
# NOTE(review): `batch` is allocated with `batch_size` slots of shape (3, 256, 256);
# more PNG files than slots, or images of another size, would fail on the
# assignment below — confirm the directory contents match.
for i, filename in enumerate(filenames):
    img_arr = imageio.imread(os.path.join(data_dir, filename))
    img_t = torch.from_numpy(img_arr)
    # Move the channel axis first, as done for the dog image earlier in the notebook.
    img_t = img_t.permute(2, 0, 1)
    # Keep only the first three channels; any further channel may carry extra
    # information such as transparency, which is not needed here.
    img_t = img_t[:3]
    batch[i] = img_t

In [7]:
## normalization
batch = batch.float()
batch /= 255.0

In [8]:
# for 3d data
dir_path = r"dlwpt-code/data/p1ch4/volumetric-dicom/2-LUNG3.0B70f-04083/"
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

Reading DICOM (examining files): 1/99 files (1.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 73/99  (73.799/99  (100.0%)


(99, 512, 512)

In [9]:
vol = torch.from_numpy(vol_arr).float()
vol = torch.unsqueeze(vol, 0)
vol.shape

torch.Size([1, 99, 512, 512])

## CSV data

In [10]:
## using a csv file
import csv
wine_path = "dlwpt-code/data/p1ch4/tabular-wine/winequality-white.csv"
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";", skiprows=1)

wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [11]:
# Read only the header row to obtain the column names. The context manager
# closes the file handle deterministically; newline='' is the mode the csv
# module documentation recommends for files handed to csv.reader.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [12]:
wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [13]:
data = wineq[:,:-1]

In [14]:
target = wineq[:, -1].long()
#allows us to treat them as labels

In [15]:
target_onehot = torch.zeros(target.shape[0], 10)
target_onehot.scatter_(1, target.unsqueeze(1), 1)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [16]:
data_mean = torch.mean(data, dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [17]:
data_var = torch.var(data, dim=0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [18]:
data_normalised = (data-data_mean)/torch.sqrt(data_var)

In [19]:
data_normalised

tensor([[ 1.7208e-01, -8.1761e-02,  ..., -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  ...,  1.3422e-03, -8.2419e-01],
        ...,
        [-1.6054e+00,  1.1666e-01,  ..., -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  ..., -1.4882e+00,  1.0448e+00]])

In [20]:
bad_indexes = target <= 3
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [21]:
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [22]:
# Split the samples into three quality bands with boolean masks on the score.
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target >= 7]

# Per-column mean of every band (reduction over dim 0, the sample axis).
bad_mean, mid_mean, good_mean = (
    band.mean(dim=0) for band in (bad_data, mid_data, good_data)
)

# One printed row per column: index, column name, then the bad / mid / good means.
# zip stops at the shortest input, so the trailing name in col_list is not printed.
row_template = '{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'
for col_idx, (col_name, bad_val, mid_val, good_val) in enumerate(
        zip(col_list, bad_mean, mid_mean, good_mean)):
    print(row_template.format(col_idx, col_name, bad_val, mid_val, good_val))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [23]:
total_sulfur_threshold = 141.83
total_sulfur_data = data[:,6]
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [24]:
actual_indexes = target > 5
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [25]:
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## Time series data

In [26]:
bikes_numpy = np.loadtxt("dlwpt-code/data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
                        dtype=np.float32,
                        delimiter=",",
                        skiprows=1,
                        converters={1: lambda x: float(x[8:10])}) #last part converts the date string to a number corresponding to day of month

bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00,  ..., 1.3000e+01, 1.6000e+01],
        [2.0000e+00, 1.0000e+00,  ..., 3.2000e+01, 4.0000e+01],
        ...,
        [1.7378e+04, 3.1000e+01,  ..., 4.8000e+01, 6.1000e+01],
        [1.7379e+04, 3.1000e+01,  ..., 3.7000e+01, 4.9000e+01]])

In [27]:
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [28]:
#we want to make batches of 24 hours each to ensure that we look at bike usage per day
# the present data is already hourly so we can make each batch have 24 entries to accomplish this

daily_bikes = bikes.view(-1, 24, bikes.shape[1])
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [29]:
daily_bikes = daily_bikes.transpose(1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [30]:
first_day = bikes[:24].long()
weather_onehot = torch.zeros(first_day.shape[0], 4)
first_day[:,9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [31]:
weather_onehot.scatter_(
dim=1,
index=first_day[:,9].unsqueeze(1).long() - 1,
value=1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [32]:
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [33]:
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4,
daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [34]:
daily_weather_onehot.scatter_(
1, daily_bikes[:,9,:].long().unsqueeze(1) - 1, 1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [35]:
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [36]:
# Another way to treat the ordinal weather column (index 9): pretend it is
# continuous and rescale it by subtracting 1 and dividing by 3.
weather_col = daily_bikes[:, 9, :]
daily_bikes[:, 9, :] = (weather_col - 1.0) / 3.0

In [37]:
# Min-max rescaling of column 10: subtract the global minimum and divide by the
# global range, which maps the values into [0, 1].
temp = daily_bikes[:, 10, :]
temp_min, temp_max = temp.min(), temp.max()
daily_bikes[:, 10, :] = (temp - temp_min) / (temp_max - temp_min)

In [38]:
'''temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = ((daily_bikes[:, 10, :] - torch.mean(temp))
                        / torch.std(temp))'''
# subtracting mean and dividing by std dev. another way to rescale

'temp = daily_bikes[:, 10, :]\ndaily_bikes[:, 10, :] = ((daily_bikes[:, 10, :] - torch.mean(temp))\n                        / torch.std(temp))'

## Text Data

In [39]:
with open('dlwpt-code/data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [40]:
## since english we use only ascii to reduce the encoding size
## also we convert everything to lowercase and remove punctuation etc which aren't relevant to our scenario
lines = text.split('\n')
line = lines[800]
line

'at Mr. Darcy, “There is a fine old saying, which everybody here is of'

In [41]:
letter_t = torch.zeros(len(line), 128) #128 because that is the limit of ASCII
letter_t.shape

torch.Size([69, 128])

In [43]:
# One-hot encode each character of the lowercased, stripped line. The column is
# the character's code point; anything outside ASCII (>= 128) goes to column 0.
for row, char in enumerate(line.lower().strip()):
    code_point = ord(char)
    letter_index = code_point if code_point < 128 else 0
    letter_t[row, letter_index] = 1

In [50]:
# To do the same pre-processing for words, we first clean them up
def clean_words(input_str):
    """Split text into lowercase words with surrounding punctuation removed.

    The text is lowercased and split on whitespace (newlines included). Each
    resulting token then has any leading and trailing punctuation characters
    stripped; punctuation inside a token is kept.

    Args:
        input_str: The text to tokenize.

    Returns:
        A list of cleaned tokens in their original order. A token made up
        entirely of punctuation is returned as an empty string rather than
        being dropped.
    """
    strip_chars = '.,;:"!?”“_-'
    cleaned = []
    # str.split() with no separator already treats '\n' as whitespace.
    for token in input_str.lower().split():
        cleaned.append(token.strip(strip_chars))
    return cleaned

words_in_line = clean_words(line)
line, words_in_line

('at Mr. Darcy, “There is a fine old saying, which everybody here is of',
 ['at',
  'mr',
  'darcy',
  'there',
  'is',
  'a',
  'fine',
  'old',
  'saying',
  'which',
  'everybody',
  'here',
  'is',
  'of'])

In [55]:
# Build the vocabulary: every distinct cleaned word of the whole text, sorted
# alphabetically and mapped to its position. That position is the index used
# for the one-hot encoding.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))


In [56]:
# One-hot encode the words of the line: one row per word, one column per
# vocabulary entry, with a single 1 at the word's vocabulary index.
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for row, token in enumerate(words_in_line):
    word_index = word2index_dict[token]
    word_t[row, word_index] = 1
    print('{:2} {:4} {}'.format(row, word_index, token))
print(word_t.shape)

 0  633 at
 1 4305 mr
 2 1649 darcy
 3 6487 there
 4 3696 is
 5  155 a
 6 2655 fine
 7 4546 old
 8 5716 saying
 9 7084 which
10 2384 everybody
11 3189 here
12 3696 is
13 4519 of
torch.Size([14, 7261])


## extra exercises

In [57]:
# Load the three exercise images, given as paths relative to the working directory.
# NOTE(review): "gree.jpg" looks like a typo for "green.jpg" — confirm the file name on disk.
img_arr_red, img_arr_blue, img_arr_green = (
    imageio.imread(image_path)
    for image_path in ("red.jpg", "blue.jpg", "gree.jpg")
)

In [58]:
red_t = torch.from_numpy(img_arr_red)
blue_t = torch.from_numpy(img_arr_blue)
green_t = torch.from_numpy(img_arr_green)

In [59]:
##pytorch requires images to be in (channel, height, width) 
##but ours is (height, width, channel) so we shuffle
red_t = red_t.permute(2, 0, 1)
blue_t = blue_t.permute(2, 0, 1)
green_t = green_t.permute(2, 0, 1)

In [68]:
# Cast the red image to float before averaging, then take the mean over dim 2
# (the width axis after the permute to channel, height, width).
float_img = red_t.float()
float_img.mean(dim=2)

tensor([[127.8870, 253.8478, 253.7304, 253.8391, 253.8348, 252.8348, 253.7826,
         252.8696, 253.8696, 253.8696, 253.8000, 253.8522, 253.8565, 253.8348,
         253.8261, 253.4087, 251.6870, 250.8739, 249.0522, 248.7913, 247.8044,
         247.4783, 247.9739, 247.0870, 242.5957, 238.0522, 238.7174, 238.1044,
         237.4652, 237.4652, 237.5087, 237.5522, 237.1696, 235.8130, 236.3739,
         235.8783, 234.8956, 235.8348, 236.8174, 236.7174, 235.5000, 234.6261,
         233.9565, 234.2000, 233.9696, 233.7826, 234.2739, 235.5000, 236.0348,
         237.1435, 237.9826, 238.5043, 239.1956, 239.3652, 239.3304, 239.4043,
         239.0348, 239.1826, 238.6956, 238.7130, 239.4130, 239.3609, 238.8087,
         238.6609, 238.5000, 237.6609, 236.9783, 237.0043, 236.9087, 236.2652,
         235.8000, 235.8174, 235.0000, 234.3609, 233.7826, 233.1348, 232.5870,
         232.1565, 231.8870, 232.0522, 232.2261, 232.0478, 231.6739, 231.2174,
         230.8044, 230.4217, 230.3174, 230.2652, 229