In [None]:
import os
# Import the Google Drive helper (only available inside Google Colab).
from google.colab import drive
# Mount the user's Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

# Project working directory. The path is absolute (anchored at the mount point
# used above) so that re-running this cell after the working directory has
# already changed does not fail with FileNotFoundError.
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/cheng_ta/final/dataset')

In [1]:
import torch
import torch.nn as nn
from torch import optim
import numpy as np
import pandas as pd
import os
import pickle
import random
import time
import importlib
import sys
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import predictModels as pm

In [2]:
# Load the pickled dataset; later cells index it as a sequence of dicts with
# 'graphLabel' and 'nodesLabel' entries.
# NOTE: pickle can execute arbitrary code - only load files you trust.
with open('./dataset/datas.pickle', 'rb') as dataset_file:
    datas = pickle.load(dataset_file)

In [3]:
# Graph labels: one label per entry of `datas`.
graph_labels = [sample['graphLabel'] for sample in datas]

# Distinct labels in order of first appearance (dict keys keep insertion order).
classes = list(dict.fromkeys(graph_labels))

# Per-graph targets as a long tensor, the dtype CrossEntropyLoss expects.
Gclasses = torch.tensor(graph_labels, dtype=torch.long)
classes, Gclasses.shape

([0, 1], torch.Size([2000]))

In [15]:
# Node labels: concatenate the per-graph node-label arrays into one flat tensor.
Nclasses = np.concatenate([sample['nodesLabel'] for sample in datas], axis=0)
Nclasses = torch.tensor(Nclasses, dtype=torch.long)

# The weight vector has one entry per node class (38 classes, labels 0..37).
# Fail fast on an out-of-range label instead of silently growing the vector.
if Nclasses.numel() > 0 and int(torch.max(Nclasses)) >= 38:
    raise ValueError('node label out of range: expected labels in [0, 37]')

# Normalised inverse-frequency class weights.
# torch.bincount counts every label in one vectorised call instead of a
# Python-level loop over each tensor element.
classCount = torch.bincount(Nclasses, minlength=38).to(torch.float)
classFreq = classCount / torch.sum(classCount)
# A class with no samples would yield 1/0 = inf and turn every weight into NaN
# after normalisation; give such classes weight 0 instead.
classWeight = torch.where(classCount > 0, 1 / classFreq, torch.zeros_like(classFreq))
classWeight = classWeight / torch.sum(classWeight)

Nclasses.shape, torch.max(Nclasses)

(torch.Size([31385]), tensor(37))

# Graph model 1: training

In [4]:
# Load the precomputed graph-level attributes (one row per graph).
# NOTE: pickle can execute arbitrary code - only load files you trust.
with open('./Attributes/graphAttributes_1.pickle', 'rb') as attr_file:
    graphAttributes_1 = pickle.load(attr_file)
graphAttributes_1.shape

torch.Size([2000, 36])

In [59]:
# Model parameters (passed positionally to pm.predModel_1; presumably the
# input, hidden and output sizes - confirm against predictModels).
inputD, hD, outputD = 36, 18, 2

# Output locations for the trained weights and the per-epoch metrics.
modelSave = './models/pre1/graphModel_1.pt'
lossSave = './models/pre1/graphLoss_1.pickle'

# Training hyperparameters.
size_batch = 16
epochs = 200
lr = 1e-2
weight_decay = 0
shuffle_data = True

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [60]:
# Build the graph-level classifier from the project module `predictModels`.
# The arguments are presumably (input size, hidden size, output size) - TODO confirm
# against predModel_1's signature.
model = pm.predModel_1(inputD, hD, outputD)

In [72]:
def train_graphAttributes(model,
                          graphAttributes_1,
                          labels,
                          size_batch: int = 500,
                          epochs: int = 100,
                          lr: float = 1e-1,
                          weight_decay: float = 0,
                          shuffle_data: bool = True,
                          device=None):
    """
    Train a graph-level classifier on precomputed graph attributes.

    The samples are randomly split 80/20 into a training and a validation
    subset. Every epoch runs mini-batch Adam updates with cross-entropy loss on
    the training subset and then measures accuracy on the validation subset.
    After each epoch the metric history is pickled to the module-level path
    ``lossSave`` and the model's ``state_dict`` is written to the module-level
    path ``modelSave``; both names are read at call time, so they must be
    defined before this function is called.

    Args:
        model: module called as ``model(x)`` on a batch of attribute rows; its
            output is passed to ``nn.CrossEntropyLoss``
        graphAttributes_1: tensor indexed as ``[sample, feature]``
        labels: tensor of class indices, one entry per sample
        size_batch: number of training samples per optimizer step
        epochs: number of passes over the training subset
        lr: learning rate of the Adam optimizer
        weight_decay: weight of the l2-norm regularization of parameters
        shuffle_data: whether to reshuffle the training indices every epoch
        device: device the model and tensors are moved to; ``None`` leaves
            everything where it is

    Raises:
        ValueError: if fewer than two samples are given, so that the
            training or validation subset would be empty
    """
    if device is not None:
        print(device)
        model.to(device)
        graphAttributes_1 = graphAttributes_1.to(device)
        labels = labels.to(device)

    # Create the optimizer after the model has been moved to its device.
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    lossfun = nn.CrossEntropyLoss()

    num_samples = graphAttributes_1.shape[0]
    if num_samples < 2:
        raise ValueError('need at least 2 samples to build a train/validation split')
    index_samples = list(range(num_samples))
    index_split = int(num_samples * 0.8)

    # Random 80/20 train/validation split.
    random.shuffle(index_samples)
    index_train = index_samples[:index_split]
    index_val = index_samples[index_split:]

    epoch_metric = {'loss': [], 'val': []}

    t_start = time.time()

    model.train()
    for epoch in range(epochs):
        batch_loss = []

        if shuffle_data:
            random.shuffle(index_train)

        for start in range(0, len(index_train), size_batch):
            indexes = index_train[start:start + size_batch]
            x = graphAttributes_1[indexes, :]
            y = model(x)
            l = labels[indexes]
            loss = lossfun(y, l)
            batch_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        el = sum(batch_loss) / len(batch_loss)
        epoch_metric['loss'].append(el)

        # Validate in eval mode so layers such as dropout / batch norm (if the
        # model has any) behave deterministically, then restore train mode.
        model.eval()
        with torch.no_grad():
            y = model(graphAttributes_1[index_val, :])
            pred = torch.argmax(y, dim=1).cpu().numpy()
            true = labels[index_val].cpu().numpy()
        model.train()
        # scikit-learn's convention is (y_true, y_pred).
        accu = accuracy_score(true, pred)
        epoch_metric['val'].append(accu)
        print('epoch loss: {},   val_accu:{},   epoch:{}/{},   time:{}'.format(el, accu, epoch, epochs, time.time()-t_start))

        # Checkpoint after every epoch so an interrupted run keeps its progress.
        with open(lossSave, 'wb') as f:
            pickle.dump(epoch_metric, f)
        torch.save(model.state_dict(), modelSave)

In [99]:
# Train the graph-level model on the graph attributes and graph labels using the
# hyperparameters from the config cell; the function writes the per-epoch metrics
# to `lossSave` and the model weights to `modelSave`.
train_graphAttributes(model=model,
                      graphAttributes_1=graphAttributes_1,
                      labels=Gclasses,
                      size_batch=size_batch,
                      epochs=epochs,
                      lr=lr,
                      weight_decay=weight_decay,
                      shuffle_data=shuffle_data,
                      device=device
                      )

cuda
epoch loss: 0.3565731942653656,   val_accu:0.97,   epoch:0/100,   time:0.007979393005371094


In [100]:
# Reload the metric history that train_graphAttributes pickled during training.
with open('./models/pre1/graphLoss_1.pickle', 'rb') as loss_file:
    graphLoss_1 = pickle.load(loss_file)

In [101]:
# Display the reloaded training history (per-epoch 'loss' and 'val' lists).
graphLoss_1

{'loss': [0.3565731942653656], 'val': [0.97]}

# Node model 1: training

In [125]:
# Load the pickled sequence of node-attribute tensors and stack them along
# dim 0 into a single tensor.
# NOTE: pickle can execute arbitrary code - only load files you trust.
with open('./Attributes/nodeAttributes_1.pickle', 'rb') as attr_file:
    nodeAttributes_1 = torch.cat(pickle.load(attr_file), dim=0)
nodeAttributes_1.shape

torch.Size([31385, 5040])

In [None]:
# Model parameters (passed positionally to pm.predModel_1; presumably the
# input, hidden and output sizes - confirm against predictModels).
inputD, hD, outputD = 5040, 2500, 38

# Output locations for the trained weights and the per-epoch metrics.
modelSave = './models/pre1/nodeModel_1.pt'
lossSave = './models/pre1/nodeLoss_1.pickle'

# Training hyperparameters.
size_batch = 500
epochs = 100
lr = 1e-2
weight_decay = 0
shuffle_data = True

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Build the node-level classifier from the project module `predictModels`; this
# rebinds `model`, replacing the graph-level model created earlier in the notebook.
# The arguments are presumably (input size, hidden size, output size) - TODO confirm.
model = pm.predModel_1(inputD, hD, outputD)

In [None]:
def train_nodeAttributes(model,
                        nodeAttributes_1,
                        labels,
                        size_batch: int = 500,
                        epochs: int = 100,
                        lr: float = 1e-1,
                        weight_decay: float = 0,
                        shuffle_data: bool = True,
                        device=None):
    """
    Train a node-level classifier on precomputed node attributes.

    The samples are randomly split 80/20 into a training and a validation
    subset. Every epoch runs mini-batch Adam updates with class-weighted
    cross-entropy loss on the training subset and then measures accuracy and
    macro F1 on the validation subset. After each epoch the metric history is
    pickled to the module-level path ``lossSave`` and the model's
    ``state_dict`` is written to the module-level path ``modelSave``. The class
    weights come from the module-level tensor ``classWeight``. All three names
    are read at call time, so they must be defined before this function is
    called.

    Args:
        model: module called as ``model(x)`` on a batch of attribute rows; its
            output is passed to ``nn.CrossEntropyLoss``
        nodeAttributes_1: tensor indexed as ``[sample, feature]``
        labels: tensor of class indices, one entry per sample
        size_batch: number of training samples per optimizer step
        epochs: number of passes over the training subset
        lr: learning rate of the Adam optimizer
        weight_decay: weight of the l2-norm regularization of parameters
        shuffle_data: whether to reshuffle the training indices every epoch
        device: device the model and tensors are moved to; ``None`` leaves
            everything where it is

    Raises:
        ValueError: if fewer than two samples are given, so that the
            training or validation subset would be empty
    """
    # Work on a local copy of the class weights so the module-level
    # `classWeight` tensor is not silently rebound to another device.
    class_weight = classWeight

    if device is not None:
        print(device)
        model.to(device)
        nodeAttributes_1 = nodeAttributes_1.to(device)
        labels = labels.to(device)
        class_weight = class_weight.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    lossfun = nn.CrossEntropyLoss(weight=class_weight)

    num_samples = nodeAttributes_1.shape[0]
    if num_samples < 2:
        raise ValueError('need at least 2 samples to build a train/validation split')
    index_samples = list(range(num_samples))
    index_split = int(num_samples * 0.8)

    # Random 80/20 train/validation split.
    random.shuffle(index_samples)
    index_train = index_samples[:index_split]
    index_val = index_samples[index_split:]

    epoch_metric = {'loss': [], 'accu': [], 'f1': []}

    t_start = time.time()

    model.train()
    for epoch in range(epochs):
        batch_loss = []

        if shuffle_data:
            random.shuffle(index_train)

        for start in range(0, len(index_train), size_batch):
            indexes = index_train[start:start + size_batch]
            x = nodeAttributes_1[indexes, :]
            y = model(x)
            l = labels[indexes]
            loss = lossfun(y, l)
            batch_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        el = sum(batch_loss) / len(batch_loss)
        epoch_metric['loss'].append(el)

        # Validate in eval mode so layers such as dropout / batch norm (if the
        # model has any) behave deterministically, then restore train mode.
        model.eval()
        with torch.no_grad():
            y = model(nodeAttributes_1[index_val, :])
            pred = torch.argmax(y, dim=1).cpu().numpy()
            true = labels[index_val].cpu().numpy()
        model.train()
        # scikit-learn's convention is (y_true, y_pred).
        accu = accuracy_score(true, pred)
        f1 = f1_score(true, pred, average='macro')
        epoch_metric['accu'].append(accu)
        epoch_metric['f1'].append(f1)
        print('epoch loss: {}, accu/f1:{}/{}, epoch:{}/{}, time:{}'.format(el, accu, f1, epoch, epochs, time.time()-t_start))

        # Checkpoint after every epoch so an interrupted run keeps its progress.
        with open(lossSave, 'wb') as f:
            pickle.dump(epoch_metric, f)
        torch.save(model.state_dict(), modelSave)

In [None]:
# Train the node-level model on the node attributes and node labels using the
# hyperparameters from the config cell; the function writes the per-epoch metrics
# to `lossSave` and the model weights to `modelSave`.
train_nodeAttributes(model=model,
                      nodeAttributes_1=nodeAttributes_1,
                      labels=Nclasses,
                      size_batch=size_batch,
                      epochs=epochs,
                      lr=lr,
                      weight_decay=weight_decay,
                      shuffle_data=shuffle_data,
                      device=device
                      )

In [None]:
# Reload the metric history that train_nodeAttributes pickled during training.
with open('./models/pre1/nodeLoss_1.pickle', 'rb') as f:
    nodeLoss_1 = pickle.load(f)
# The original cell stored the node metrics under the name `graphLoss_1`;
# keep that name as an alias so any later cell that still reads it keeps working.
# Prefer `nodeLoss_1` in new code.
graphLoss_1 = nodeLoss_1