# image data

In [1]:
import imageio
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Decode the sample JPEG into a NumPy array and display its dimensions.
dog_image_path = "./dlwpt-code-master/data/p1ch4/image-dog/bobby.jpg"
img_arr = imageio.imread(dog_image_path)
img_arr.shape

  img_arr = imageio.imread("./dlwpt-code-master/data/p1ch4/image-dog/bobby.jpg")


(720, 1280, 3)

In [3]:
# Wrap the NumPy image in a tensor, then reorder its axes so the last axis
# comes first and the other two keep their relative order.
img = torch.from_numpy(img_arr)
out = img.permute(2,0,1)

In [4]:
# Display the dimensions of the permuted tensor.
out.size()

torch.Size([3, 720, 1280])

In [5]:
import os

In [6]:
# Fill a pre-allocated uint8 batch of shape (batchsize, 3, 256, 256) with the PNG
# images found in `file_dir` (assumes each image is 256x256 — TODO confirm).
batchsize = 3
batch = torch.zeros(batchsize,3,256,256,dtype = torch.uint8)
file_dir = "./dlwpt-code-master/data/p1ch4/image-cats"
# os.path.splitext keeps the leading dot in the extension, so the comparison must be
# against ".png"; sorting makes the batch order independent of os.listdir's ordering.
files = sorted(file for file in os.listdir(file_dir) if os.path.splitext(file)[-1] == ".png")
for i, file in enumerate(files):
    img_arr = imageio.imread(os.path.join(file_dir,file))
    img = torch.from_numpy(img_arr)
    # permute returns a new tensor; the result has to be kept for the reordering to apply.
    img = img.permute(2,0,1)
    # Keep only the first three entries of the (now leading) last image axis,
    # which drops a fourth channel when one is present.
    img = img[:3]
    batch[i] = img

In [7]:
# Convert the batch to floating point, then standardise each index along
# axis 1 using the mean and standard deviation taken over that whole slice.
batch = batch.float()
n_channels = batch.shape[1]
for i in range(n_channels):
    channel = batch[:, i]
    mean = channel.mean()
    std = channel.std()
    batch[:, i] = (channel - mean) / std

In [8]:
# Read the DICOM directory as a single volume and convert it to a float tensor.
dir_path = "./dlwpt-code-master/data/p1ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083"
# 'DICOM' is passed as volread's second positional argument.
file = imageio.volread(dir_path,'DICOM')
vol = torch.from_numpy(file).float()

Reading DICOM (examining files): 1/99 files (1.0%23/99 files (23.2%49/99 files (49.5%74/99 files (74.7%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 95/99  (96.099/99  (100.0%)


In [9]:
# Prepend a singleton axis to the volume and display the new shape.
# NOTE: `vol` is rebound to its own result, so each re-run of this cell adds one more leading axis.
vol = vol.unsqueeze(0)
vol.shape

torch.Size([1, 99, 512, 512])

# tabular data

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Parse the semicolon-separated wine table as floats, print its column labels,
# and keep a NumPy copy of the values.
wine_csv_path = "./dlwpt-code-master/data/p1ch4/tabular-wine/winequality-white.csv"
df = pd.read_csv(wine_csv_path, sep=";", dtype=float)
colist = df.columns
print(colist)
wineq_numpy = df.to_numpy()

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [12]:
# Cast the array to 32-bit floats and display its shape.
wineq_numpy=wineq_numpy.astype(np.float32)
wineq_numpy.shape

(4898, 12)

In [13]:
# Wrap the float32 NumPy array in a tensor.
wineq = torch.from_numpy(wineq_numpy)

In [14]:
# Every column except the last is kept as input data; the last column is
# taken as the target and converted to 64-bit integers.
data = wineq[:, :-1]
target = wineq[:, -1].to(torch.long)

In [15]:
# Display the dimensions of the target vector.
target.size()

torch.Size([4898])

In [16]:
# Start from an all-zero (n_samples, 10) table and write 1.0 into the column
# selected by each sample's target value.
target_onehot = torch.zeros(len(target), 10)
target_onehot.scatter_(1, target[:, None], 1.0)
target_onehot

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [17]:
# Per-column mean and variance, reduced over the sample axis.
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)

In [18]:
# Standardise each column: subtract its mean and divide by the square root of its variance.
data_normalized = (data - data_mean) / data_var.sqrt()

In [19]:
# Boolean mask of samples whose target is at most 3, used to select the matching rows.
bad_indexs = torch.le(target, 3)
bad_data = data[bad_indexs]
bad_data.shape

torch.Size([20, 11])

In [20]:
# Split the samples into three bands by target value (<=3, 4..7, >7) and
# print the per-column mean of each band side by side.
bad_data = data[target<=3]
mid_data = data[(target>3) & (target<=7)]
good_data = data[target>7]
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)
# zip stops at the shortest input, so only columns that have a mean are printed.
for i, (col_name, bad, mid, good) in enumerate(zip(colist,bad_mean,mid_mean,good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, col_name, bad, mid, good))



 0 fixed acidity          7.60   6.86   6.68
 1 volatile acidity       0.33   0.28   0.28
 2 citric acid            0.34   0.33   0.33
 3 residual sugar         6.39   6.42   5.63
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.18  36.63
 6 total sulfur dioxide 170.60 138.70 125.88
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.19   3.22
 9 sulphates              0.47   0.49   0.49
10 alcohol               10.34  10.47  11.65


In [21]:
# Flag samples whose column-6 value is below the threshold; 138.70 matches the
# middle-band mean printed for column 6 by the preceding cell.
total_sulfur_threshold = 138.70
total_sulfur_data = data[:,6]
predicted_indexes = total_sulfur_data < total_sulfur_threshold
predicted_indexes.shape,predicted_indexes.dtype,predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2618))

In [22]:
# Boolean mask of samples whose target is strictly greater than 5.
actual_indexes = torch.gt(target, 5)
actual_indexes.shape,actual_indexes.dtype,actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [23]:
# Count samples flagged by both masks, then report that count as a share of
# the predicted set and as a share of the actual set.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()
n_matches, n_matches/n_predicted, n_matches/n_actual

(1935, 0.7391138273491215, 0.5939226519337016)

# time series 

In [24]:
# Load the hourly bike-sharing table. The converter for column index 1 keeps only
# characters 8-9 of the raw field and parses them as a float
# (presumably the day of month of a YYYY-MM-DD date — confirm against the CSV).
df = pd.read_csv("./dlwpt-code-master/data/p1ch4/bike-sharing-dataset/hour-fixed.csv",converters={1:lambda x: float(x[8:10])})
# The frame is converted to NumPy exactly once, cast to float32, and wrapped in a tensor.
bikes = torch.from_numpy(df.to_numpy().astype(np.float32))
bikes.shape,bikes.stride()

(torch.Size([17520, 17]), (1, 17520))

In [25]:
# Regroup the rows into chunks of 24 without copying: the result has one
# leading index per chunk, 24 rows per chunk, and the original column count.
daily_bikes = bikes.view(-1, 24, bikes.size(1))
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (24, 1, 17520))

In [26]:
# Build the 24-row grouping from `bikes` and swap its last two axes in one expression.
# Deriving the result from `bikes` (rather than from `daily_bikes` itself) keeps this
# cell idempotent: re-running it cannot flip the axes back.
daily_bikes = bikes.view(-1,24,bikes.shape[1]).transpose(1,2)
daily_bikes.shape,daily_bikes.stride()

(torch.Size([730, 17, 24]), (24, 17520, 1))

In [27]:
# Take the first 24 rows as 64-bit integers and allocate a zeroed
# four-column table with one row per selected row.
first_day = bikes[:24].to(torch.long)
weather_onehot = torch.zeros(len(first_day), 4)

In [28]:
# Column 9 of `first_day` selects which of the four columns gets a 1.0 in each row;
# it is shifted down by one to become a zero-based index
# (presumably the codes run 1-4 — confirm against the dataset).
weather_codes = first_day[:,9]
weather_onehot.scatter_(1, weather_codes.unsqueeze(1).long() - 1, 1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [29]:
# Append the one-hot columns to the first 24 rows and display only the first resulting row.
torch.cat([bikes[:24], weather_onehot], dim=1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [30]:
# Zeroed table with four slots along axis 1, matching `daily_bikes` on its first and last axes.
daily_weather_onehot = torch.zeros(daily_bikes.size(0), 4, daily_bikes.size(2))

In [31]:
# Index 9 along axis 1 of `daily_bikes`, shifted to zero-based, picks the slot
# along axis 1 that receives a 1.0 for every (first-axis, last-axis) position.
weather_index = daily_bikes[:,9,:].long().unsqueeze(1) - 1
daily_weather_onehot.scatter_(1, weather_index, 1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [32]:
# Append the one-hot slots to `daily_bikes` along axis 1.
# NOTE: `daily_bikes` is rebound to its own result, so each re-run of this cell appends four more slots.
daily_bikes = torch.cat([daily_bikes, daily_weather_onehot], dim=1)

In [33]:
# Standardise index 10 along axis 1 in place, using the mean and standard deviation
# of that slice. `temp` is a view into `daily_bikes`; the right-hand side is fully
# evaluated into a new tensor before it is written back into the slice.
temp = daily_bikes[:,10,:]
daily_bikes[:,10,:] = (temp - torch.mean(temp)) / torch.std(temp)

# text data

In [34]:
# Read the whole file into a single string, decoding it as UTF-8.
with open("./dlwpt-code-master/data/p1ch4/jane-austen/1342-0.txt",encoding="utf8") as f:
    text = f.read()

In [35]:
# Break the text at newline characters and use the line at index 200 as the working example.
lines = text.split("\n")
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [36]:
# Zeroed table with one row per character of `line` and 128 columns per row.
letter_t = torch.zeros(len(line),128)
letter_t.shape

torch.Size([70, 128])

In [37]:
# Mark, for each character of the lower-cased, stripped line, the column given by its code point.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    # Characters whose code point does not fit in the 128 columns fall back to column 0.
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

In [38]:
def clean_words(input_str):
    """Lower-case ``input_str``, split it on whitespace and trim punctuation from each token.

    Newlines count as word separators. The characters in ``punctuation`` are
    removed only from the two ends of a token, so punctuation inside a word is
    kept and a token made up entirely of punctuation becomes an empty string.

    Returns the list of trimmed tokens in their original order.
    """
    punctuation =  ".,;:!?\"_-“”"
    flattened = input_str.lower().replace('\n',' ')
    cleaned = []
    for token in flattened.split():
        cleaned.append(token.strip(punctuation))
    return cleaned

In [39]:
# Clean the example line and display the raw line next to its word list.
words_in_line = clean_words(line)
line,words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [40]:
# Build the vocabulary: the distinct cleaned words of the whole text in sorted
# order, each mapped to its position in that ordering.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))
len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [41]:
# One row per word of the example line, one column per vocabulary entry;
# each row gets a single 1 at the column of its word's vocabulary index.
word_t = torch.zeros(len(words_in_line),len(word2index_dict))
for i,word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    print('{:2} {:4} {}'.format(i,word_index,word))
print(word_t.shape)

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])
