<a href="https://colab.research.google.com/github/shineloveyc/Deep-Learning_Exercise/blob/main/data_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
import csv
wine_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

#using numpy to load data
wineq_numpy  = np.loadtxt(wine_path, dtype = np.float32, delimiter=";", skiprows=1)

In [3]:
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [4]:
wineq_numpy.shape

(4898, 12)

In [5]:
#deisplay columns
wineq_df = pd.read_csv(wine_path, delimiter=';')

wineq_df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [6]:
#convert numpy to tensor
wineq_tensor = torch.from_numpy(wineq_numpy)

In [7]:
wineq_tensor.shape, wineq_tensor.type()

(torch.Size([4898, 12]), 'torch.FloatTensor')

In [8]:
#training and target variable
data = wineq_tensor[:, :-1]

data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [9]:
target = wineq_tensor[:, -1]
target, target.shape

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [10]:
#transfer the target vaiable to integer
target = wineq_tensor[:,-1].long()
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [11]:
#one hot encoding target variable
target_onehot = torch.zeros(target.shape[0], 10)

#inplace update data
#send the element of 1 to the following column index of target.unsqueeze(1) along the column of target_onehot
#becasue target_onehot has two dimension(4898,10), so use target.unsqueeze(1) to produce 2 dim tensor
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [12]:
target.unsqueeze(1).shape

torch.Size([4898, 1])

In [13]:
#data explore
data_mean = torch.mean(data, dim = 0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [14]:
data_var = torch.var(data, dim = 0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [15]:
data_norimalized = (data - data_mean)/torch.sqrt(data_var)

data_norimalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [16]:
#use le advance index function to determine which rows correspond to a score less than or equal to 3
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [17]:
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [18]:
#group wine quality

bad_data = data[torch.le(target, 3)]

mid_data = data[torch.gt(target, 3) & torch.lt(target, 7)]

good_data = data[torch.ge(target, 7)]

#mean of grouped data
bad_mean = torch.mean(bad_data, dim = 0)
mid_mean = torch.mean(mid_data, dim = 0)
good_mean = torch.mean(good_data, dim =0)

for i, args in enumerate(zip(wineq_df.columns, bad_mean, mid_mean, good_mean)):
  print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [19]:
#find bad wine
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
predicated_index = torch.lt(total_sulfur_data, total_sulfur_threshold)

predicated_index.shape, predicated_index.dtype, predicated_index.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [20]:
#get index with good wine from real target
actual_indexes = torch.gt(target, 5)

actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [21]:
#since the threashold predict less good wine compare actual target, need to find th intersection
n_matches = torch.sum(actual_indexes & predicated_index).item()

n_predicted = torch.sum(predicated_index).item()

n_actual = torch.sum(actual_indexes).item()

n_matches, n_matches/n_predicted, n_matches/n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## Time Series

In [22]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'Colab Notebooks/'

Mounted at /content/gdrive


In [23]:
from pathlib import Path
import os

In [24]:
#set up the path to read data
path = Path(base_dir + 'data/Bike-Sharing-Dataset/')
print(path)

/content/gdrive/My Drive/Colab Notebooks/data/Bike-Sharing-Dataset


In [25]:
#check current working directory
os.chdir(path)

In [26]:
bike_numpy = np.loadtxt('hour.csv', dtype=np.float32, delimiter=",", skiprows=1, converters={1:lambda x : float(x[8:10])}) # convert data strings to numbers corresponding to the day to the month in column I

bikes = torch.from_numpy(bike_numpy)

bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [27]:
bikes.shape, bikes.stride()

(torch.Size([17379, 17]), (17, 1))

In [28]:
#reshape data to have three axes(day, hour, then 17 columns attribute)
daily_bikes = bikes.view(-1, 24, bikes.shape[1])

daily_bikes.shape, daily_bikes.stride()

RuntimeError: ignored