## Adam optimizer - example

* Data : california-housing-prices.csv
* The data pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data. 

* reference : https://www.kaggle.com/camnugent/california-housing-prices?select=housing.csv

### Data load

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the housing table from the relative input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/california-housing-prices/housing.csv',
)

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values.
data.info()

### Missing values


```
(1) data.info()를 통해, total_bedrooms의 데이터가 결측됨을 확인할 수 있다.

(2) 결측된 데이터 처리 방법
  -  결측된 데이터가 너무 많은 경우 해당 열 전체 삭제
  -  결측된 데이터가 일부일 경우 그럴듯한 값으로 대체하기
```



 - missingno를 사용하여 결측값 위치 시각화로 확인


In [None]:
# 결측값 흰색으로 비어있음 
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

msno.matrix(data)
plt.show()

In [None]:
# Option 1: discard incomplete records.
# NOTE: as called here, dropna() removes every ROW that contains at least one
# missing value; it does not remove the affected column, even though the
# surrounding notes describe column removal.
data1 = data.dropna(axis=0, how='any')

# Re-draw the missingness matrix for the reduced table.
msno.matrix(df=data1)
plt.show()

In [None]:

# Option 2: when only some values are missing, replace them with a plausible
# value. SimpleImputer offers several strategies (e.g. mean, median,
# most_frequent); most_frequent is used here.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

# fit_transform hands back an unlabeled array, so re-attach the original
# column labels and row index instead of falling back to 0..N-1 column names.
# infer_objects() then turns the object-typed numeric columns back into
# numeric dtypes so that data.info()/describe() stay meaningful.
data = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).infer_objects()
data

In [None]:
# Re-check the missingness matrix after imputation; no gaps should remain.
msno.matrix(df=data)
plt.show()

### LabelEncoder

In [None]:
# Positional split: every column except the last goes to x; the last column
# goes to y. The -1: slice keeps y two-dimensional, shape (n_rows, 1).
x = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1:].to_numpy()

y

In [None]:
# Assign readable column labels. A flat list produces a regular Index; the
# previous nested list ([[...]]) built a one-level MultiIndex instead, which
# makes later label-based selection return tuples/frames unexpectedly.
data.columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]

In [None]:
# Preview the first five rows with the column labels in place.
data.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder expects a 1-D array of labels. y was sliced as a column vector
# of shape (n_rows, 1), so flatten it first; passing the column vector
# triggers a DataConversionWarning. fit_transform learns the classes and
# encodes them in a single pass.
encoder = LabelEncoder()
labels = encoder.fit_transform(y.ravel())

# Swap the text column for its integer codes, appended as the last column.
data = data.drop('ocean_proximity', axis=1)
data['ocean'] = labels

In [None]:
# A notebook cell only echoes its last expression, so the tail has to be
# printed explicitly; otherwise its output is silently discarded.
print(data.tail())

# Every column except the last one (the encoded label), as a raw array.
data.values[:, :-1]

### Data preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column except the last one; the last column holds the
# encoded label and is left out of scaling.
feature_block = data.values[:, :-1]
scaler = StandardScaler()
df = scaler.fit_transform(feature_block)

In [None]:
# Wrap the scaled array in a DataFrame and append the (unscaled) encoded
# labels as a final column named '9'.
df = pd.DataFrame(df).assign(**{'9': labels})
df.head(n=5)

### Modeling with PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Convert the prepared table into a float32 tensor for PyTorch.
data = torch.from_numpy(df.to_numpy()).to(torch.float32)

data.shape

In [None]:
# Inputs are all columns but the last; the target is the last column, kept
# two-dimensional by the -1: slice.
x, y = data[:, :-1], data[:, -1:]

print(x.shape, y.shape)
print(x.shape[0])

In [None]:
# Training hyperparameters.
n_epochs = 4_000       # number of full passes over the data
batch_size = 256       # rows per mini-batch
print_interval = 200   # report the loss every this many epochs
# learning_rate = 1e-2 is left disabled; no learning rate is defined here.

In [None]:
# Build the model: a small fully connected network with LeakyReLU activations
# that narrows step by step from the input width down to the output width.
model = nn.Sequential(
    # input_dim -> 6. The earlier note said "8->6", but x is built from all
    # scaled columns, which appears to be 9 features -- confirm with x.shape.
    nn.Linear(x.size(-1), 6),
    nn.LeakyReLU(),
    nn.Linear(6, 5),
    nn.LeakyReLU(),
    nn.Linear(5, 4),
    nn.LeakyReLU(),
    nn.Linear(4, 3),
    nn.LeakyReLU(),
    nn.Linear(3, y.size(-1)),
)

# A bare `model` expression in the middle of a cell is never echoed, so print
# the architecture explicitly.
print(model)

# Adam with its default settings (no learning rate is passed).
optimizer = optim.Adam(model.parameters())

In [None]:
# Mini-batch training loop.
# |x| = (total_size, input_dim)
# |y| = (total_size, output_dim)

for epoch in range(n_epochs):
    # Draw a fresh random permutation of the row indices every epoch so the
    # mini-batches are composed differently each time.
    indices = torch.randperm(x.size(0))
    x_batches = torch.index_select(x, dim=0, index=indices).split(batch_size, dim=0)
    y_batches = torch.index_select(y, dim=0, index=indices).split(batch_size, dim=0)
    # |x_batches[k]| = (batch_size, input_dim)
    # |y_batches[k]| = (batch_size, output_dim)

    total_loss = 0.0

    for x_i, y_i in zip(x_batches, y_batches):
        y_hat_i = model(x_i)
        loss = F.mse_loss(y_hat_i, y_i)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float. Adding the tensor itself would keep
        # each batch's autograd graph alive and leak memory.
        total_loss += loss.item()

    total_loss = total_loss / len(x_batches)
    if (epoch + 1) % print_interval == 0:
        print('Epoch %d : loss=%.4e' % (epoch + 1, total_loss))

# Predict once with the final weights, in the original row order, without
# tracking gradients. Compared with collecting per-batch outputs during the
# last epoch, this
#   * leaves `y` untouched, so x and y stay aligned if the cell is re-run,
#   * avoids holding one autograd graph per batch in memory, and
#   * still defines y_hat when n_epochs is 0.
with torch.no_grad():
    y_hat = model(x)



### Let's see the result!

In [None]:
# Put targets and predictions side by side and plot them against each other.
paired = torch.cat([y, y_hat], dim=1).detach().numpy()
df = pd.DataFrame(paired, columns=["y", "y_hat"])

sns.pairplot(df, height=4)
plt.show()

# Original observation: the value 5 is not predicted well.
# NOTE(review): presumably this refers to one of the encoded ocean_proximity
# classes -- confirm which label is meant.

### reference

* Missing value : https://continuous-development.tistory.com/165

* LabelEncoder : https://nicola-ml.tistory.com/62