In [1]:
%load_ext autoreload
%autoreload 2

In [31]:
import time
import argparse
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from math import ceil

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

from utils import load_file, preprocessing, get_vocab, load_embeddings, create_gows, accuracy, generate_batches, AverageMeter, train,train_simsiam, load_R8, dotdict, knn
from models import MPAD, SimSiam
from scipy import sparse

### Set-up Model

In [3]:
args = {}
args['path_to_dataset'] = '../datasets/R8.txt'
args['path_to_embeddings'] = "../GoogleNews-vectors-negative300.bin"
args['no_cuda'] = False
args['epochs'] = 200
args['lr'] = 0.001
args['hidden'] = 64
args['penultimate'] = 64
args['message_passing_layers']=2
args['window_size'] = 2
args['directed'] = True
args['use_master_node'] = True
args['normalize'] = True
args['dropout'] = 0.5
args['batch_size'] = 128
args['patience'] = 20
args['rand_node_drop'] = 0.3
args['dataset'] = 'subjectivity'
args['use_unique'] = True
args = dotdict(args)

We are saving a different file for every epoch of NLP augmentations => Therefore, we create a utility loader to process each of these files instead of loading up front. 
LabelEncoder, Vocab and Embeddings are shared across augmented files. We need to initialize the vocab and embeddings over the superset of augmented sentences. 

In [4]:
docs, _ = load_file("../datasets/subjectivity/unique/consolidated.txt")
#docs, _ = load_file("../datasets/subjectivity.txt")
docs = preprocessing(docs)

In [5]:
vocab = get_vocab(docs)
embeddings = load_embeddings("../GoogleNews-vectors-negative300.bin", vocab)

Vocabulary size:  36104
Existing vectors: 29329


In [6]:
l_enc = LabelEncoder()

In [7]:
def load_epoch_txt(epoch_num,l_enc,args):
    file_name = "../datasets/{dataset}/{unique}/{epoch_num}.txt".format(dataset=args.dataset,unique ="unique" if args.use_unique else "non_unique",epoch_num = epoch_num)
    docs, class_labels = load_file(file_name)
    docs = preprocessing(docs)
    class_labels = l_enc.fit_transform(class_labels)
    nclass = np.unique(class_labels).size
    y = list()
    for i in range(len(class_labels)):
        t = np.zeros(1)
        t[0] = class_labels[i]
        y.append(t)
        
    adj, features, _ = create_gows(docs, vocab, args.window_size, args.directed, args.normalize, args.use_master_node)
    return adj,features,y

In [9]:
%%time
adj,features,y = load_epoch_txt(epoch_num=0,l_enc=l_enc,args=args)

CPU times: user 18.3 s, sys: 261 ms, total: 18.6 s
Wall time: 18.5 s


### Create training splits

These splits will be shared over all epochs!

In [10]:
kf = KFold(n_splits=2, shuffle=True)
it = 0
accs = list()
train_index, test_index =  kf.split(y)
train_index = train_index[0]
test_index = test_index[0]

In [11]:
idx = np.random.permutation(train_index)
train_index = idx[:int(idx.size*0.9)].tolist()
val_index = idx[int(idx.size*0.9):].tolist()

In [12]:
n_train = len(train_index)
n_val = len(val_index)
n_test = len(test_index)

print("TRAIN: ",n_train)
print("VAL: ",n_val)
print("TEST: ",n_test)

TRAIN:  4497
VAL:  500
TEST:  4998


### Extract Val,Test Sets

In [13]:
adj, features, _ = load_epoch_txt(0,l_enc,args)

In [14]:
%%time

adj_val = [adj[i] for i in val_index]
features_val = [features[i] for i in val_index]
y_val = [y[i] for i in val_index]
adj_val, features_val, batch_n_graphs_val, y_val = generate_batches(adj_val, features_val, y_val, args.batch_size, args.use_master_node)

adj_test = [adj[i] for i in test_index]
features_test = [features[i] for i in test_index]
y_test = [y[i] for i in test_index]
adj_test, features_test, batch_n_graphs_test, y_test = generate_batches(adj_test, features_test, y_test, args.batch_size, args.use_master_node)

CPU times: user 11.3 s, sys: 200 ms, total: 11.5 s
Wall time: 11.5 s


### Intialize Model

In [16]:
if args.cuda:
    model.cuda()
    adj_train = [x.cuda() for x in adj_train]
    features_train = [x.cuda() for x in features_train]
    batch_n_graphs_train = [x.cuda() for x in batch_n_graphs_train]
    y_train = [x.cuda() for x in y_train]
    adj_val = [x.cuda() for x in adj_val]
    features_val = [x.cuda() for x in features_val]
    batch_n_graphs_val = [x.cuda() for x in batch_n_graphs_val]
    y_val = [x.cuda() for x in y_val]
    adj_test = [x.cuda() for x in adj_test]
    features_test = [x.cuda() for x in features_test]
    batch_n_graphs_test = [x.cuda() for x in batch_n_graphs_test]
    y_test = [x.cuda() for x in y_test]

In [17]:
args.cuda = True

In [27]:
n_class = 2
model = MPAD(embeddings.shape[1], 
             args.message_passing_layers, 
             args.hidden,
             args.penultimate, 
             n_class, 
             args.dropout,
             embeddings, 
             args.use_master_node)

model.embedding_dim = 64

sim = SimSiam(backbone=model,project_dim=64,bottle_neck_dim=10)

parameters = filter(lambda p: p.requires_grad, sim.parameters())

In [28]:
args['lr'] = 0.001
optimizer = optim.Adam(parameters, lr=args.lr)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=8, eta_min=1e-5,verbose=True)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

Adjusting learning rate of group 0 to 1.0000e-03.


In [None]:
# args['lr'] = 0.03
# optimizer = optim.SGD(parameters, lr=args.lr)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=8, eta_min=1e-5,verbose=True)
# #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

In [24]:
args.epochs = 50
n_train_batches = ceil(n_train/args.batch_size)
n_val_batches = ceil(n_val/args.batch_size)
n_test_batches = ceil(n_test/args.batch_size)

In [32]:
best_acc = 0

for epoch in range(args.epochs):
    
    """
    LOAD TWO VIEWS
    """
    adj_1,features_1,y_1 = load_epoch_txt(epoch_num=epoch,
                                          l_enc=l_enc,
                                          args=args)
    adj_2,features_2,y_2 = load_epoch_txt(epoch_num=args.epochs -epoch,
                                          l_enc=l_enc,
                                          args=args)
    
    
    #extract training samples!
    adj_1 = [adj_1[i] for i in train_index]
    features_1 = [features_1[i] for i in train_index]
    y_1 = [y_1[i] for i in train_index]

    adj_2 = [adj_2[i] for i in train_index]
    features_2 = [features_2[i] for i in train_index]
    y_2 = [y_2[i] for i in train_index]
    
    #generate batches
    adj_1, features_1, batch_n_graphs_1, _ = generate_batches(adj_1,features_1, y_1, args.batch_size, args.use_master_node,shuffle=False)
    adj_2, features_2, batch_n_graphs_2, _ = generate_batches(adj_2,features_2, y_2, args.batch_size, args.use_master_node,shuffle=False)

   
    start = time.time()
    model.train()
    running_loss = 0.0
    # Train for one epoch
    for i in range(n_train_batches):
        
        loss = train_simsiam(sim,
                             optimizer,
                             epoch, 
                             adj_1[i],
                             features_1[i],
                             batch_n_graphs_1[i],
                             adj_2[i],
                             features_2[i])
        running_loss += loss.item()
    print("="*50)
    print('Epoch {0} Loss: {1:.6f}'.format(epoch, running_loss/i))
    if scheduler is not None:
        scheduler.step()

    
    sim.eval()
    
    
    val_embeds = []
    for i in range(n_val_batches):
        with torch.no_grad():
            val_embeds += sim.encoder[0](features_val[i], adj_val[i], batch_n_graphs_val[i])[1]
    val_embeds = torch.stack(val_embeds).numpy()
gmail.c
    test_embeds = []
    for i in range(n_test_batches):
        with torch.no_grad():
            test_embeds += sim.encoder[0](features_test[i], adj_test[i], batch_n_graphs_test[i])[1]
    test_embeds = torch.stack(test_embeds).numpy()
    
    k = knn(val_embeds,torch.cat(y_val),test_embeds, torch.cat(y_test))

Epoch Loss:  -0.38832876767430985
Adjusting learning rate of group 0 to 8.5502e-04.
KNN Acc:  0.8395358143257303
Epoch Loss:  -0.544455223424094
Adjusting learning rate of group 0 to 6.9443e-04.
KNN Acc:  0.8457382953181273
Epoch Loss:  -0.6798791902405875
Adjusting learning rate of group 0 to 5.0500e-04.
KNN Acc:  0.8617446978791516
Epoch Loss:  -0.7834587165287563
Adjusting learning rate of group 0 to 3.1557e-04.
KNN Acc:  0.8625450180072028
Epoch Loss:  -0.827249881199428
Adjusting learning rate of group 0 to 1.5498e-04.
KNN Acc:  0.8641456582633054
Epoch Loss:  -0.8456495642662049
Adjusting learning rate of group 0 to 4.7680e-05.
KNN Acc:  0.8613445378151261
Epoch Loss:  -0.8482923201152257
Adjusting learning rate of group 0 to 1.0000e-05.
KNN Acc:  0.8555422168867547
Epoch Loss:  -0.8494300331388202
Adjusting learning rate of group 0 to 4.7680e-05.
KNN Acc:  0.85734293717487
Epoch Loss:  -0.8501284956932068
Adjusting learning rate of group 0 to 1.5498e-04.
KNN Acc:  0.854341736694

KeyboardInterrupt: 