In [1]:
import torch
import pandas as pd
import sklearn
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
from torch.utils.data import Dataset,DataLoader

from models.ViT import ViT_LRP_copy

In [2]:
# One pickled feature table per time step (L2Y1 ... L2Y6), listed in sequence order.
X_datapaths = [
    './preprocessed/prepared/nan/L2Y1.pkl',
    './preprocessed/prepared/nan/L2Y2.pkl',
    './preprocessed/prepared/nan/L2Y3.pkl',
    './preprocessed/prepared/nan/L2Y4.pkl',
    './preprocessed/prepared/nan/L2Y5.pkl',
    './preprocessed/prepared/nan/L2Y6.pkl',
]
# Pickled label table that belongs to the feature tables above.
label_datapath = './preprocessed/prepared/nan/label.pkl'

In [3]:
# Load every feature table and the label table from their pickle files.
# reset_index() moves the stored row index into an ordinary column of each frame.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
input_datas = [pd.read_pickle(path).reset_index() for path in X_datapaths]
label_data = pd.read_pickle(label_datapath).reset_index()

# Number of time steps in the sequence = number of feature tables loaded.
seq_len = len(input_datas)

In [4]:
# Shift every label value down by one (presumably so that grades 1-9 become the
# class ids 0-8 used as CLS2IDX keys - confirm against the label file).
# NOTE(review): not idempotent - each re-run of this cell subtracts 1 again, so
# re-run the loading cell first if this cell has to be executed a second time.
# NOTE(review): label_data went through reset_index() in the loading cell, so the
# 'index' column it added is shifted as well; make_splited_data drops that column.
label_data = label_data - 1

In [5]:
# Maps a class id (0-8) to its display name. Despite the name, the direction is
# index -> class label.
CLS2IDX = dict(enumerate((
    '1등급',
    '2등급',
    '3등급',
    '4등급',
    '5등급',
    '6등급',
    '7등급',
    '8등급',
    '9등급',
)))

In [6]:
# testlist = []
# for data in input_datas:
#     #print(data.shape)
#     #data.iloc[:,0]
#     testlist.append(data.iloc[:,1])

In [7]:
# test = pd.concat(testlist,axis=1)
# t5 = test.iloc[:,5].isna().to_numpy()

# test_t5 = test.iloc[t5]
# t4 = test_t5.iloc[:,4].isna().to_numpy()
# test_t4 = test_t5.iloc[t4]
# t3 = test_t4.iloc[:,3].isna().to_numpy()
# test_t3 = test_t4.iloc[t3]
# test_t3

In [8]:
class Embedding(nn.Module):
    """Two-layer MLP that projects one time step's features to a fixed embedding size.

    Parameters
    ----------
    in_features : int
        Width of the input feature vector.
    hidden_features : int, optional
        Width of the hidden layer; falls back to ``in_features`` when omitted.
    out_features : int, optional
        Width of the produced embedding; falls back to ``in_features`` when omitted.
    act_layer : callable
        Zero-argument factory for the activation module (default ``nn.ReLU``).
    dropout : float
        Dropout probability applied to the hidden activations.
    """
    def __init__(self,in_features,hidden_features=None,out_features=None,act_layer=nn.ReLU,dropout=0.1):
        super().__init__()
        # Both optional widths default to the input width; previously a missing
        # hidden_features reached nn.Linear as None and crashed.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features,hidden_features)
        self.act = act_layer()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self,x):
        """Return the embedding of ``x`` (last dimension must be ``in_features``)."""
        x = self.fc1(x)
        x = self.act(x)
        # The dropout layer was constructed but never used before; apply it to the
        # hidden activations (it is a no-op in eval mode).
        x = self.dropout(x)
        return self.fc2(x)

In [9]:
def batch_to_embbedings(datas,networks):
    """
    Embed each per-year slice of a batch with its own network.

    datas    : indexable of per-year inputs; element ``i`` is passed to ``networks[i]``.
               The elements may have different feature sizes.
    networks : iterable of callables (embedding networks), one per element of ``datas``.

    Returns a list with one embedding per network, in network order.
    No NaN mask is produced here.
    """
    return [net(datas[i]) for i, net in enumerate(networks)]

        

In [10]:
def make_splited_data(input_datas,label_data,is_regression=False,train_size=0.8,random_state=None):
    """Split the per-year feature tables and the labels into train/test parts and min-max scale them.

    Parameters
    ----------
    input_datas : list of pandas.DataFrame
        One feature table per time step; any number of tables is accepted. All
        tables and ``label_data`` must have the same number of rows.
    label_data : pandas.DataFrame
        Label table, split with the same row assignment as the features.
    is_regression : bool
        When truthy the labels are min-max scaled as well; otherwise they are
        returned unscaled.
    train_size : float
        Forwarded to ``train_test_split`` (default 0.8, the previous fixed value).
    random_state : int or None
        Forwarded to ``train_test_split``; ``None`` keeps the previous unseeded shuffle.

    Returns
    -------
    X_trains, X_tests : list of pandas.DataFrame
        Scaled train / test feature tables, one per input table.
    y_train, y_test : pandas.DataFrame
        Train / test labels.
    """
    # Columns produced by reset_index() calls (here or by the caller); they carry
    # row bookkeeping, not features.
    bookkeeping_columns = ['level_0','index']

    def tidy(frame):
        # Renumber the shuffled rows from 0 and remove leftover index columns.
        return frame.reset_index(drop=True).drop(columns=bookkeeping_columns, errors='ignore')

    def scale_pair(train_frame, test_frame):
        # Fit on the training part only and reuse those statistics for the test
        # part, so both parts share one scale and nothing leaks from test data.
        scaler = MinMaxScaler().fit(train_frame)
        scaled = [
            pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
            for frame in (train_frame, test_frame)
        ]
        return scaled[0], scaled[1]

    # train_test_split returns [a_train, a_test, b_train, b_test, ...] in argument order;
    # the labels are passed last, so they occupy the final two slots.
    parts = train_test_split(*input_datas, label_data, train_size=train_size, random_state=random_state)
    parts = [tidy(part) for part in parts]

    X_trains = parts[0:-2:2]
    X_tests = parts[1:-2:2]
    y_train, y_test = parts[-2], parts[-1]

    for i in range(len(X_trains)):
        X_trains[i], X_tests[i] = scale_pair(X_trains[i], X_tests[i])

    if is_regression:
        y_train, y_test = scale_pair(y_train, y_test)

    return X_trains, X_tests, y_train, y_test # return list of sequences and a label


In [11]:
# False -> labels are left unscaled by make_splited_data and later served as
# integer class ids by the dataset; True -> labels are min-max scaled floats.
is_regression = False
# X_trains / X_tests are lists with one table per time step; y_train / y_test are the label tables.
X_trains, X_tests, y_train, y_test = make_splited_data(input_datas,label_data,is_regression=is_regression)

In [12]:
def make_split_list(year_datas):
    """Return the column offsets at which a concatenated batch must be cut.

    ``year_datas`` is a sequence of 2-D array-likes (anything with ``.shape``).
    The result holds the cumulative column counts of all but the last element,
    which is the index list ``torch.tensor_split(batch, split_list, dim=1)`` needs
    to recover the per-year blocks. An empty input yields an empty list.
    """
    widths = [data.shape[1] for data in year_datas]
    split_list = []
    offset = 0
    # The end of the last block is the end of the batch, so it needs no cut point.
    for width in widths[:-1]:
        offset += width
        split_list.append(offset)
    return split_list



In [13]:
def batch_to_splited_datas(batch,split_list):
    """Cut a concatenated batch back into its per-year blocks.

    ``batch`` is split along dim 1 at the column offsets in ``split_list``
    (as produced by ``make_split_list``). Returns the tuple of tensors from
    ``torch.tensor_split``.
    """
    # Plain return: the former local was named `list`, shadowing the builtin.
    return torch.tensor_split(batch,split_list,dim=1)

In [14]:
class KELSDataSet(Dataset):
    """Dataset that serves one row of all yearly feature tables concatenated, plus three targets.

    Parameters
    ----------
    year_datas : list
        One table per time step (pandas objects or array-likes), all with the
        same number of rows. The list passed in is NOT modified.
    label : pandas object with ``to_numpy()``
        Label table; the first three columns are returned as the three targets
        (named y_E, y_K, y_M here - presumably three subjects, confirm with the data).
    is_regression : bool
        Truthy -> targets are float32 tensors; otherwise int64 class ids.

    Attributes
    ----------
    split_list : column offsets for ``torch.tensor_split`` to undo the concatenation.
    seq_len    : number of time steps (tables).
    data_len   : number of rows.
    data       : 2-D numpy array with all tables concatenated along axis 1.
    """
    def __init__(self,year_datas,label,is_regression=False):
        # Convert into a new list: overwriting the caller's list in place turned
        # its tables into arrays and made a second construction from it fail.
        arrays = [
            data.to_numpy() if hasattr(data, 'to_numpy') else np.asarray(data)
            for data in year_datas
        ]
        self.split_list = make_split_list(arrays) # used after getitem of dataloader.
        self.is_regression = is_regression
        self.label = label.to_numpy()
        self.seq_len = len(arrays)
        self.data_len = arrays[0].shape[0]
        self.data = np.concatenate(arrays,axis=1)


    def __len__(self):
        return self.data_len

    def __getitem__(self,idx):
        """Return ``(x, (y_E, y_K, y_M))`` for row ``idx``; ``x`` is a float32 vector."""
        x = torch.tensor(self.data[idx], dtype=torch.float32)
        # Regression targets are floats, classification targets are class ids.
        # (The regression branch previously referenced the non-existent torch.FloadTensor.)
        target_dtype = torch.float32 if self.is_regression else torch.long
        targets = torch.tensor(self.label[idx], dtype=target_dtype)
        y_E,y_K,y_M = targets[0],targets[1],targets[2]

        return (x,(y_E,y_K,y_M))
        


In [15]:
# Wrap the train / test splits; each dataset concatenates its yearly tables
# column-wise and remembers the cut points in .split_list.
train_dataset = KELSDataSet(X_trains,y_train,is_regression=is_regression)
test_dataset = KELSDataSet(X_tests,y_test,is_regression=is_regression)


In [16]:
# Rows per batch for the train / test DataLoaders.
batch_size = 32
# Hidden-layer width of every per-year Embedding network.
hidden_features = 100

In [17]:
# Output width of every Embedding network (passed as out_features below);
# the transformer models are constructed with the same value for embed_dim.
embbed_dim = 72

In [18]:
def make_embbeding_networks(train_dataset,batch_size=None,hidden_features = 100, out_features = 72, dropout=0.1):
    """Create one Embedding network per time step, sized to that step's feature count.

    Parameters
    ----------
    train_dataset : dataset exposing ``split_list`` (column cut points) and
        ``data`` (2-D array of all steps concatenated along axis 1).
    batch_size : unused; kept so existing calls keep working. The feature widths
        are read from the dataset itself, so no sample batch has to be drawn.
    hidden_features, out_features, dropout : forwarded to every ``Embedding``.

    Returns
    -------
    list of Embedding, in time-step order.

    NOTE(review): the networks are returned as a plain Python list, so their
    parameters are not registered with any parent module; whoever builds the
    optimizer has to collect them explicitly.
    TODO (from the original notes): NaN inputs still need a per-batch
    (batch x seq) NaN indicator alongside the embeddings.
    """
    total_width = train_dataset.data.shape[1]
    # Consecutive boundaries [0, cut_1, ..., cut_k, total] delimit each step's columns.
    boundaries = [0, *train_dataset.split_list, total_width]
    return [
        Embedding(stop - start,hidden_features=hidden_features,out_features=out_features,dropout=dropout)
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]

    

In [19]:
# One embedding MLP per time step, all projecting to embbed_dim features.
embbeding_networks = make_embbeding_networks(train_dataset,batch_size=batch_size,hidden_features = hidden_features,out_features=embbed_dim)

In [20]:
# Training batches are reshuffled on every pass; test batches keep dataset order.
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False)
# Column cut points used to split each concatenated batch back into per-year blocks.
split_list = train_dataset.split_list

In [33]:


# TODO: wrap this cell into a train_net function.
# NOTE(review): make_attn_mask is defined in a cell further down the notebook; it
# must have been executed before this cell, so this fails on Restart & Run All
# until that definition is moved above this point.
def build_grade_classifier():
    """Create one transformer classifier; all three targets use identical hyper-parameters."""
    return ViT_LRP_copy.VisionTransformer(seq_len=seq_len, num_classes=len(CLS2IDX), embed_dim=embbed_dim, depth=8,
                 num_heads=6, mlp_ratio=4., qkv_bias=False, mlp_head=False, drop_rate=0.1, attn_drop_rate=0.1)

# One independent model per target (E, K, M).
model_E = build_grade_classifier()
model_K = build_grade_classifier()
model_M = build_grade_classifier()

# Forward pass only: no loss, backward or optimizer step happens inside this loop,
# so after it finishes the score / label variables hold the final batch.
for concated_data,(label_E,label_K,label_M) in train_loader:
    datas = batch_to_splited_datas(concated_data,split_list)
    emb_batch_list= batch_to_embbedings(datas,embbeding_networks) # can be used for contrastive loss
    # emb_batch_list is a list of per-step embeddings; stack it into a
    # (batch, seq, feature) tensor.
    emb_batched_seq = torch.stack(emb_batch_list).transpose(0,1)
    # make_attn_mask also zeroes NaN embeddings inside emb_batched_seq in place.
    attn_mask = make_attn_mask(emb_batched_seq)
    (E_score,K_score,M_score) = model_E(emb_batched_seq,attn_mask),model_K(emb_batched_seq,attn_mask),model_M(emb_batched_seq,attn_mask)
    

In [35]:
# Cross-entropy between the class scores and the integer class labels.
criterion = nn.CrossEntropyLoss()

# NOTE(review): the *_score and label_* variables are whatever the forward loop
# left behind, i.e. these losses cover only the final batch of train_loader.
loss_E = criterion(E_score,label_E)
loss_K = criterion(K_score,label_K)
loss_M = criterion(M_score,label_M)


In [28]:
# Inspect the (batch, seq, feature) shape of the last embedded batch left over from the forward loop.
emb_batched_seq.shape

torch.Size([14, 6, 72])

In [32]:
def make_attn_mask(emb_seq_batch):
    """
    Build a boolean attention mask from a batch of embedded sequences.

    emb_seq_batch : tensor of shape (batch, seq_len, embedding_size).
        SIDE EFFECT: every NaN entry is overwritten with 0 in place; callers
        rely on this and pass the same tensor on to the model afterwards.

    Returns a bool tensor of shape (batch, 1, seq_len + 1, seq_len + 1).
    Along the last dimension, position 0 is an extra slot that is always False
    (presumably the model's class token - confirm against the model) and
    position i + 1 is True when step i's embedding was NaN. A step counts as
    NaN when its first embedding feature is NaN.
    """
    # Take both sizes from the tensor itself rather than from a notebook global.
    batch_size, n_steps = emb_seq_batch.shape[0], emb_seq_batch.shape[1]

    nan_entries = torch.isnan(emb_seq_batch)
    emb_seq_batch[nan_entries] = 0

    step_is_nan = nan_entries[:,:,0]
    # Always-False column for the extra leading position, created on the same
    # device as the input so the concatenation also works off-CPU.
    leading = torch.zeros(batch_size, 1, dtype=torch.bool, device=emb_seq_batch.device)
    attn_mask = torch.cat((leading,step_is_nan),dim=1)

    # Repeat the per-position mask for every query row, then add a singleton
    # dimension at index 1.
    attn_mask = attn_mask.unsqueeze(1).expand(-1,n_steps+1,-1)
    attn_mask = attn_mask.unsqueeze(1)
    return attn_mask

ValueError: Found input variables with inconsistent numbers of samples: [6, 5218]

In [None]:


# Wrap the data in a DataLoader; __getitem__ should return a list of (batch, feature_size) tensors. The list holds 6 elements.
#

# Define 6 embedding networks; each network must be built from a sample batch:
#
# Select a single sample batch.
# for i, sample in enumerate(sample_list):
#       emb_net[i] = MLP(in_feature, hidden_feature, out_feature)  # there must be dropout
# 

# Make a NaN list for each batch, of shape (B, nanlist); the nanlist matches the sequence length.
# After embedding, the input should have shape (batch, sequence_length, feature_size).

In [None]:
# Define training and evaluation.