# Data Preprocessing

First, create a small CSV file that contains some missing values (`NA`), then load it with pandas.

In [22]:
import os

import pandas as pd

# Make sure ../data exists, then write a tiny housing table into it.
# The "NA" fields come back as missing values (NaN) when the file is read.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')

csv_text = '''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000'''
with open(data_file, 'w') as csv_out:
    csv_out.write(csv_text)

# Load the file back as a DataFrame and display it.
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,RoofType,Price
0,,,127500
1,2.0,,106000
2,4.0,Slate,178100
3,,,140000


In [23]:
# Split the table by column position: the first two columns become `inputs`,
# the third column becomes `outputs`.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]

For categorical input fields, 
we can treat `NaN` as a category of its own. 
Note that `iloc` stands for "integer location": it selects rows and columns by their integer position.

In [32]:
# Separate the target column (position 2) from the feature columns
# (positions 0 and 1), then one-hot encode the features.
# dummy_na=True adds an indicator column for missing values, so a NaN in a
# categorical column is treated as a category in its own right.
targets = data.iloc[:, 2]
inputs = pd.get_dummies(data.iloc[:, 0:2], dtype=int, dummy_na=True)
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,,0,1
1,2.0,0,1
2,4.0,1,0
3,,0,1


Replace the `NaN` entries with 
the mean value of the corresponding column

In [33]:
# Mean imputation: each remaining NaN is replaced by the mean of its own column.
column_means = inputs.mean()
inputs = inputs.fillna(column_means)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       3.0               0             1
1       2.0               0             1
2       4.0               1             0
3       3.0               0             1


Now that all the entries in `inputs` and `targets` are numerical,
we can load them into tensors

In [43]:
import torch

# Convert the preprocessed features and the target column into tensors.
# `targets` (assigned together with `inputs` in the one-hot encoding cell) is
# used here instead of the separate `outputs` alias, so this cell depends on
# the same split as the rest of the pipeline. Both hold data.iloc[:, 2].
# NOTE(review): X is created as float16 while y is float32. Operations that
# combine the two may need an explicit cast -- confirm the half-precision
# choice for X is intentional.
X = torch.tensor(inputs.values, dtype=torch.float16)
y = torch.tensor(targets.values, dtype=torch.float32)
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float16),
 tensor([127500., 106000., 178100., 140000.]))

In [5]:
import torch

# Go through NumPy with an explicit float dtype; the resulting tensors are
# float64, as the displayed output shows.
features_np = inputs.to_numpy(dtype=float)
targets_np = targets.to_numpy(dtype=float)
X, y = torch.tensor(features_np), torch.tensor(targets_np)
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

In [48]:
# Demonstrate that reshaping does not copy here: `b` is a 3x4 view onto the
# storage of `a`, so an in-place write through `b` is visible in `a`.
a = torch.arange(12)
b = a.reshape(3, 4)
b[...] = 3  # overwrite every element in place
a

tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])