In [None]:
import numpy as np
import pandas as pd
import time
import pickle
import os
import matplotlib.pyplot as plt
from impala.dbapi import connect

from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset

# One seed for the whole notebook. NumPy is seeded with it where the
# train/validation split is drawn; PyTorch is seeded here so that random
# weight initialisation is reproducible after Restart & Run All.
seed = 55
torch.manual_seed(seed)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Loader batch sizes and the split ratio.
TRAIN_BATCH_SIZE = 3000
VALID_BATCH_SIZE = 9999
# NOTE: despite its name this is the fraction of rows kept for TRAINING;
# the remaining rows form the validation set.
validation_split = 0.9

# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced by a trusted pipeline.
# Presumably a 2-D tensor/array with one sample per row -- TODO confirm.
with open('data_tensor.pickle', 'rb') as pp:
    df = pickle.load(pp)

n_rows = df.shape[0]
split = int(n_rows * validation_split)

# Seeded permutation of the row positions so the split is repeatable.
np.random.seed(seed)
split_idx = list(range(n_rows))
np.random.shuffle(split_idx)

train_idx = split_idx[:split]
valid_idx = split_idx[split:]
print('train vs val:', len(train_idx), len(valid_idx))

In [None]:
class TX_Dataset(Dataset):
    """Thin `Dataset` wrapper around an indexable container of samples.

    Each item is whatever ``df[idx]`` yields; no labels or transforms are
    applied.
    """

    def __init__(self, df):
        """Keep a reference to the backing container (it is not copied)."""
        self.df = df

    def __len__(self):
        """Number of samples, i.e. ``len`` of the backing container."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.df[idx]

In [None]:
# Wrap the two index subsets of the loaded data in Dataset objects.
dataset_train = TX_Dataset(df[train_idx])
dataset_valid = TX_Dataset(df[valid_idx])

# Training batches are reshuffled every epoch; without `shuffle=True` the
# DataLoader would replay the exact same batch sequence each epoch.
train_loader = DataLoader(dataset_train,
                          batch_size=TRAIN_BATCH_SIZE,
                          shuffle=True,
                          num_workers=1,
                          pin_memory=True)

# Validation order does not influence the averaged loss, so it stays fixed.
valid_loader = DataLoader(dataset_valid,
                          batch_size=VALID_BATCH_SIZE,
                          num_workers=1,
                          pin_memory=True)

# Number of batches per epoch for each loader.
print('length of loaders:', len(train_loader), len(valid_loader))

In [None]:
class EncoDeco(nn.Module):
    """Dense autoencoder: input -> hidden -> bottleneck -> hidden -> input.

    The forward pass returns ``log_softmax`` of the reconstruction over the
    last dimension, i.e. log-probabilities.

    Args:
        input_size: Width of each input row and of the reconstruction.
        linear_01: Width of the hidden layer used by both encoder and decoder.
        linear_out: Width of the bottleneck produced by the encoder.

    The defaults reproduce the originally hard-coded architecture
    (464 -> 150 -> 30 -> 150 -> 464).
    """

    def __init__(self, input_size: int = 464, linear_01: int = 150,
                 linear_out: int = 30):
        super(EncoDeco, self).__init__()
        self.input_size = input_size
        self.linear_01 = linear_01
        self.linear_out = linear_out

        # Layer attribute names are kept as-is so saved state_dicts still load.
        self.enco_lin_01 = nn.Linear(self.input_size, self.linear_01)
        self.enco_lin_03 = nn.Linear(self.linear_01, self.linear_out)

        self.deco_lin_01 = nn.Linear(self.linear_out, self.linear_01)
        self.deco_lin_03 = nn.Linear(self.linear_01, self.input_size)

    def encoder(self, x):
        """Map ``(..., input_size)`` inputs to ``(..., linear_out)`` codes.

        No activation is applied to the bottleneck output.
        """
        x = self.enco_lin_01(x)
        x = F.relu(x)
        x = self.enco_lin_03(x)
        return x

    def decoder(self, x):
        """Map ``(..., linear_out)`` codes back to ``(..., input_size)``."""
        x = self.deco_lin_01(x)
        x = F.relu(x)
        x = self.deco_lin_03(x)
        # NOTE(review): this ReLU clamps the values fed to log_softmax in
        # `forward` to be non-negative. Kept to preserve existing behaviour;
        # confirm it is intentional.
        x = F.relu(x)
        return x

    def forward(self, x):
        """Encode then decode ``x``; return log-probabilities over the last dim."""
        x = self.encoder(x)
        x_gen = self.decoder(x)
        return F.log_softmax(x_gen, -1)


def init_weights(m):
    """Initialise one module in place; intended for ``model.apply(init_weights)``.

    ``nn.Linear`` modules (including subclasses) get Kaiming-normal weights
    and, when they have a bias, a constant bias of 0.01. Every other module
    type is left untouched.

    Args:
        m: The module handed over by ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.kaiming_normal_(m.weight)
    # Linear layers built with bias=False expose `bias` as None.
    if m.bias is not None:
        with torch.no_grad():
            m.bias.fill_(0.01)

In [None]:
# Build the autoencoder, move it to the selected device and initialise it.
endn_dense = EncoDeco()
endn_dense = endn_dense.to(device)
endn_dense.apply(init_weights)

LEARNING_RATE = 0.001
optimizer = optim.Adam(endn_dense.parameters(), lr = LEARNING_RATE)

# KLDivLoss takes log-probabilities as input (the model ends in log_softmax)
# and probabilities as target.
# NOTE(review): assumes every data row is a probability distribution -- confirm.
loss_func = nn.KLDivLoss(reduction='batchmean')

beg = time.time()
NUM_EPOCH = 2000

# Per-epoch mean batch losses, kept for plotting.
train_loss_plot = []
valid_loss_plot = []
for epoch in range(NUM_EPOCH):

    # The validation pass below switches the model to eval mode, so training
    # mode has to be restored at the start of every epoch, not once up front.
    endn_dense.train()

    train_losses = 0
    valid_losses = 0
    for x_tr in train_loader:
        x_tr = x_tr.to(device)

        optimizer.zero_grad()

        # Autoencoder objective: the input is also the target.
        gen_x = endn_dense(x_tr)
        loss = loss_func(gen_x, x_tr)

        loss.backward()
        optimizer.step()

        train_losses += loss.item()

    # Validation: no gradients, model in eval mode.
    endn_dense.eval()
    with torch.no_grad():
        for x in valid_loader:

            x = x.to(device)
            gen_x_val = endn_dense(x)

            val_loss = loss_func(gen_x_val, x)
            valid_losses += val_loss.item()

    # Mean of the per-batch losses (each batch weighted equally).
    epoch_train_loss = train_losses/len(train_loader)
    epoch_valid_loss = valid_losses/len(valid_loader)
    train_loss_plot.append(epoch_train_loss)
    valid_loss_plot.append(epoch_valid_loss)

    print('Epoch:', epoch,
          'Train Loss: {:.8f}'.format(epoch_train_loss),
          'Valid Loss: {:.8f}'.format(epoch_valid_loss))

# End-of-training timestamp; elapsed seconds are `tookme - beg`.
tookme = time.time()