In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import ConfigParser
from tqdm import tqdm
from time import time
import cPickle as pickle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ranking_metrics import compute_mrr, precision_at_k, compute_map
import sys

np.random.seed(0)
#torch.manual_seed(0)

config = ConfigParser.ConfigParser()
config.readfp(open(r'../src/config.ini'))
SAVE_PATH = config.get('paths', 'save_path')
DATA_FILE_NAME = config.get('paths', 'extracted_data_file_name')
TRAIN_TEST_FILE_NAME = config.get('paths', 'train_test_file_name')
SAVE_NAME = config.get('cnn_params', 'save_name')
NUM_NEGATIVE = int(config.get('data_params', 'NUM_NEGATIVE')) 

MAX_TITLE_LEN = int(config.get('data_params', 'MAX_TITLE_LEN'))
MAX_BODY_LEN = int(config.get('data_params', 'MAX_BODY_LEN'))

data_filename = SAVE_PATH + DATA_FILE_NAME
train_test_filename = SAVE_PATH + TRAIN_TEST_FILE_NAME

print "loading pickled data..."
tic = time()
with open(data_filename) as f:  
    train_text_df, train_idx_df, dev_idx_df, test_idx_df, embeddings, word_to_idx = pickle.load(f)
f.close()
with open(train_test_filename) as f:
    train_data, val_data, test_data = pickle.load(f)
f.close()
toc = time()
print "elapsed time: %.2f sec" %(toc - tic)

loading pickled data...
elapsed time: 20.98 sec


In [2]:
#training parameters
num_epochs = 2 #16
batch_size = 16 

#model parameters
embed_num = len(word_to_idx)
embed_dim = len(embeddings[0])
kernel_num = 100  #TODO: tune
kernel_sizes = range(2,6)
learning_rate = 1e-3 
weight_decay = 1e-5

class  CNN(nn.Module):
    def __init__(self, embed_num, embed_dim, kernel_num, kernel_sizes):
        super(CNN,self).__init__()
        V = embed_num
        D = embed_dim
        Ci = 1            #input channel
        Co = kernel_num   #depth
        Ks = kernel_sizes #height of each filter

        self.embed = nn.Embedding(V, D)
        self.embed.requires_grad = False
        self.embed.weight.data = torch.from_numpy(embeddings)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

    def forward(self, x):
        x = self.embed(x) # (N,W,D)
        x = x.unsqueeze(1) # (N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

model = CNN(embed_num, embed_dim, kernel_num, kernel_sizes)

use_gpu = torch.cuda.is_available()
if use_gpu:
    print "found CUDA GPU..."
    model = model.cuda()

print model

#define loss and optimizer
criterion = nn.MultiMarginLoss(p=1, margin=0.4, size_average=True)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = StepLR(optimizer, step_size=4, gamma=0.5) #half learning rate every 4 epochs

print "training..."
print "train data size:" + str(len(train_data))
print "val data size:" + str(len(val_data))
print "test data size:" + str(len(test_data))

CNN (
  (embed): Embedding(100406, 200)
  (convs1): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(2, 200), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 200), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 200), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(5, 200), stride=(1, 1))
  )
)
training...
train data size:36
val data size:20
test data size:20


In [None]:
np.sort(val_data[1].keys())

In [11]:
def model_cnn(is_training_phase, num_epochs, data_to_load, idx_df, batch_size, number_negative_examples, model, criterion, optimizer, scheduler, model_name, use_gpu, save_model_at):
    print "Model invoked"
    loss_per_epoch = []
    learning_rate_schedule = []
    patience_cnt = 0

    data_loader = torch.utils.data.DataLoader(
        data_to_load, 
        batch_size = batch_size,
        shuffle = True,
        num_workers = 4, 
        drop_last = True)
    
    if not is_training_phase:
        num_epochs = 1

    for epoch in range(num_epochs):
        print "epoch value: " + str(epoch)
        loss_over_batches = 0.0
        
        if is_training_phase:
            model.train()
            scheduler.step()
        else:
            model.eval()

        for batch in tqdm(data_loader):
            query_idx = batch['query_idx']
            query_title = Variable(batch['query_title'])
            query_body = Variable(batch['query_body'])
            similar_title = Variable(batch['similar_title'])
            similar_body = Variable(batch['similar_body'])

            random_title_list = []
            random_body_list = []
            for ridx in range(number_negative_examples): #number of random negative examples
                random_title_name = 'random_title_' + str(ridx)
                random_body_name = 'random_body_' + str(ridx)
                random_title_list.append(Variable(batch[random_title_name]))
                random_body_list.append(Variable(batch[random_body_name]))

            if use_gpu:
                query_title, query_body = query_title.cuda(), query_body.cuda()
                similar_title, similar_body = similar_title.cuda(), similar_body.cuda()
                random_title_list = map(lambda item: item.cuda(), random_title_list)
                random_body_list = map(lambda item: item.cuda(), random_body_list)

            if is_training_phase:
                optimizer.zero_grad()

            nn_query_title = model(query_title)
            nn_query_body = model(query_body)
            nn_query = (nn_query_title + nn_query_body)/2.0

            nn_similar_title = model(similar_title)
            nn_similar_body = model(similar_body)
            nn_similar = (nn_similar_title + nn_similar_body)/2.0

            nn_random_list = []
            for ridx in range(len(random_title_list)):
                nn_random_title = model(random_title_list[ridx])
                nn_random_body = model(random_body_list[ridx])
                nn_random = (nn_random_title + nn_random_body)/2.0
                nn_random_list.append(nn_random)
            #end for

            cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
            score_pos = cosine_similarity(nn_query, nn_similar)
        
            score_list = []
            score_list.append(score_pos)
            for ridx in range(len(nn_random_list)):
                score_neg = cosine_similarity(nn_query, nn_random_list[ridx])
                score_list.append(score_neg)

            X_scores = torch.stack(score_list, 1) #[batch_size, K=101]
            print X_scores
            y_targets = Variable(torch.zeros(X_scores.size(0)).type(torch.LongTensor)) #[batch_size]
            if use_gpu:
                y_targets = y_targets.cuda()
            loss = criterion(X_scores, y_targets) #y_target=0
            
            if is_training_phase:
                loss.backward()
                optimizer.step()
            
            loss_over_batches += loss.cpu().data[0]
            
            #save scores to data-frame
            nn_query_idx = query_idx.numpy()
            nn_retrieved_scores = X_scores.data.numpy()[:,1:] #skip positive score
            for row, qidx in enumerate(nn_query_idx):
                idx_df.loc[idx_df['query_id'] == qidx, model_name] = " ".join(nn_retrieved_scores[row,:].astype('str'))
    
        #end for-batch
        loss_per_epoch.append(loss_over_batches)
    
        if is_training_phase:
            early_stop = False
            learning_rate_schedule.append(scheduler.get_lr())
            print "epoch: %4d, training loss: %.4f" %(epoch+1, loss_over_batches)
        
            torch.save(model, save_model_at)

            #early stopping
            patience = 4
            min_delta = 0.1
            if epoch > 0 and (loss_per_epoch[epoch-1] - loss_per_epoch[epoch] > min_delta):
                patience_cnt = 0
            else:
                patience_cnt += 1

            if patience_cnt > patience:
                print "early stopping..."
                early_stop = True
        
            if early_stop:
                if is_training_phase:
                    return loss_per_epoch, idx_df, learning_rate_schedule
                else:
                    return loss_per_epoch, idx_df, []
  
    #end for-epoch
    if is_training_phase:
        return loss_per_epoch, idx_df, learning_rate_schedule
    else:
        return loss_per_epoch, idx_df, []


In [12]:
training_loss, train_idx_df, learning_rate_schedule = model_cnn(True, num_epochs, train_data, train_idx_df, batch_size, 40,  model, criterion, optimizer, scheduler, 'CNN_train', use_gpu, (SAVE_PATH + SAVE_NAME))

print "testing..."
running_test_loss, test_idx_df, _ = model_cnn(False, num_epochs, test_data, test_idx_df, batch_size, 20,  model, criterion, optimizer, scheduler, 'CNN_test', use_gpu, (SAVE_PATH + SAVE_NAME + "test"))

    
print "total test loss: ", running_test_loss
print "number of NaN: \n", test_idx_df.isnull().sum()
#test_idx_df = test_idx_df.dropna() #NaNs are due to restriction: range(100)

  0%|          | 0/2 [00:00<?, ?it/s]

Model invoked
epoch value: 0
Variable containing:

Columns 0 to 9 
 0.9736  0.8707  0.8590  0.8882  0.9234  0.9430  0.9152  0.9042  0.8974  0.9013
 0.9236  0.7630  0.7479  0.7354  0.7019  0.8110  0.8447  0.7390  0.6661  0.7679
 0.9812  0.8867  0.8608  0.9017  0.9035  0.9089  0.8521  0.9018  0.8834  0.8799
 0.8944  0.7630  0.7479  0.7354  0.7019  0.8110  0.8447  0.7390  0.6661  0.7679
 0.9536  0.8367  0.8264  0.8265  0.8593  0.8467  0.8536  0.8599  0.8427  0.8782
 0.9725  0.7845  0.7676  0.7618  0.7812  0.7302  0.7873  0.7473  0.7749  0.7859
 0.9129  0.7630  0.7479  0.7354  0.7019  0.8110  0.8447  0.7390  0.6661  0.7679
 0.9116  0.7630  0.7479  0.7354  0.7019  0.8110  0.8447  0.7390  0.6661  0.7679
 0.9589  0.8364  0.7804  0.8199  0.8427  0.7776  0.7440  0.7879  0.8565  0.8299
 0.9615  0.9004  0.9467  0.9258  0.9222  0.9023  0.9113  0.9361  0.9293  0.9416
 0.9728  0.9020  0.9255  0.9418  0.9193  0.8957  0.8910  0.9235  0.9320  0.9125
 0.9519  0.7630  0.7479  0.7354  0.7019  0.8110  0.84

 50%|█████     | 1/2 [00:15<00:15, 15.54s/it]

Variable containing:

Columns 0 to 9 
 0.9554  0.8156  0.8404  0.7984  0.7835  0.8119  0.8098  0.7864  0.8209  0.8565
 0.9463  0.6938  0.6687  0.6589  0.6200  0.7592  0.8015  0.6620  0.5722  0.6977
 0.9729  0.9077  0.8565  0.8517  0.8726  0.8732  0.8547  0.8529  0.8462  0.8130
 0.9824  0.8461  0.8261  0.8629  0.9060  0.9359  0.8940  0.8864  0.8741  0.8802
 0.9764  0.8868  0.9305  0.8048  0.8816  0.9298  0.8696  0.8881  0.8588  0.8411
 0.9778  0.8863  0.8492  0.8796  0.8928  0.8560  0.8694  0.8869  0.8347  0.8817
 0.9624  0.8868  0.9305  0.8048  0.8816  0.9298  0.8696  0.8881  0.8588  0.8411
 0.9685  0.7696  0.8304  0.8240  0.8301  0.8087  0.8305  0.7464  0.7961  0.8198
 0.9396  0.7823  0.7673  0.7661  0.8143  0.7917  0.8054  0.8163  0.7907  0.8357
 0.9302  0.6938  0.6687  0.6589  0.6200  0.7592  0.8015  0.6620  0.5722  0.6977
 0.9468  0.8969  0.9135  0.8840  0.8857  0.9054  0.8905  0.8926  0.8867  0.9014
 0.9634  0.7241  0.7019  0.6951  0.7208  0.6598  0.7328  0.6802  0.7198  0.7286
 0

100%|██████████| 2/2 [00:32<00:00, 15.99s/it]


epoch:    1, training loss: 0.5042


  0%|          | 0/2 [00:00<?, ?it/s]

epoch value: 1
Variable containing:

Columns 0 to 9 
 0.9660  0.7064  0.7873  0.7794  0.7890  0.7566  0.7875  0.6757  0.7441  0.7754
 0.9555  0.8588  0.9113  0.7502  0.8477  0.9141  0.8341  0.8611  0.8187  0.8003
 0.8622  0.6171  0.5772  0.5704  0.5325  0.7031  0.7523  0.5785  0.4708  0.6153
 0.9747  0.8271  0.7860  0.8429  0.8442  0.8551  0.7653  0.8426  0.8046  0.8091
 0.9735  0.6566  0.6277  0.6216  0.6525  0.5836  0.6724  0.6044  0.6576  0.6623
 0.9059  0.6171  0.5772  0.5704  0.5325  0.7031  0.7523  0.5785  0.4708  0.6153
 0.9785  0.8565  0.8073  0.8517  0.8688  0.8180  0.8359  0.8563  0.7945  0.8523
 0.9676  0.8333  0.7726  0.7679  0.8124  0.8041  0.8262  0.8283  0.7721  0.8049
 0.9730  0.8588  0.9113  0.7502  0.8477  0.9141  0.8341  0.8611  0.8187  0.8003
 0.9459  0.7422  0.6489  0.7215  0.7583  0.6411  0.6029  0.6610  0.7797  0.7301
 0.9489  0.8713  0.8892  0.8499  0.8530  0.8773  0.8564  0.8623  0.8535  0.8733
 0.9739  0.8563  0.8939  0.9156  0.8755  0.8484  0.8372  0.8832  0.

 50%|█████     | 1/2 [00:15<00:15, 15.64s/it]

Variable containing:

Columns 0 to 9 
 0.9295  0.6396  0.6081  0.6053  0.6884  0.6440  0.6652  0.7016  0.6492  0.7121
 0.9591  0.8133  0.8987  0.8548  0.8553  0.8343  0.8424  0.8771  0.8605  0.8862
 0.9539  0.7178  0.7693  0.6987  0.6761  0.7276  0.7235  0.6715  0.7340  0.7968
 0.9577  0.7791  0.7269  0.7877  0.8480  0.9157  0.8264  0.8332  0.8069  0.8154
 0.9415  0.7723  0.7890  0.7745  0.8057  0.7245  0.7466  0.8117  0.7740  0.8089
 0.9593  0.5790  0.5450  0.5394  0.5748  0.5007  0.6040  0.5201  0.5910  0.5871
 0.9151  0.5418  0.4863  0.4867  0.4516  0.6487  0.7027  0.5010  0.3816  0.5332
 0.9777  0.7511  0.7611  0.6226  0.7827  0.7951  0.7706  0.8067  0.7708  0.8426
 0.9654  0.8473  0.7632  0.7485  0.7930  0.7809  0.7731  0.7612  0.7519  0.6951
 0.8739  0.5418  0.4863  0.4867  0.4516  0.6487  0.7027  0.5010  0.3816  0.5332
 0.9491  0.5418  0.4863  0.4867  0.4516  0.6487  0.7027  0.5010  0.3816  0.5332
 0.9709  0.8435  0.8218  0.8489  0.8421  0.8482  0.8972  0.8795  0.7873  0.7862
 0

100%|██████████| 2/2 [00:36<00:00, 17.06s/it]


epoch:    2, training loss: 0.3483


  0%|          | 0/1 [00:00<?, ?it/s]

testing...
Model invoked
epoch value: 0


100%|██████████| 1/1 [00:03<00:00,  3.75s/it]

Variable containing:

Columns 0 to 9 
 0.7789  0.8584  0.7789  0.8240  0.8274  0.7922  0.8320  0.8275  0.8332  0.7893
 0.9031  0.9031  0.8858  0.8846  0.8807  0.8743  0.8837  0.8809  0.8835  0.8362
 0.8788  0.8676  0.8706  0.8468  0.8788  0.6917  0.8502  0.8643  0.8668  0.8276
 0.8494  0.9180  0.8750  0.8810  0.8757  0.8383  0.8771  0.9265  0.8735  0.9041
 0.9122  0.9122  0.9013  0.9091  0.8980  0.9177  0.8993  0.9161  0.9264  0.8153
 0.8810  0.8810  0.8610  0.8901  0.8876  0.8571  0.7584  0.7137  0.8894  0.8644
 0.8806  0.8817  0.8806  0.8790  0.8701  0.8650  0.9105  0.9009  0.8948  0.8594
 0.9313  0.9313  0.9047  0.9179  0.9077  0.9405  0.9223  0.8931  0.9194  0.8988
 0.9037  0.8946  0.8726  0.8385  0.9037  0.7448  0.8987  0.8341  0.8635  0.8294
 0.9078  0.9078  0.9113  0.9185  0.9027  0.9292  0.9241  0.8992  0.9423  0.9123
 0.8808  0.8895  0.8808  0.9119  0.8919  0.7728  0.8759  0.8764  0.8263  0.9091
 0.8220  0.7868  0.8057  0.7760  0.8174  0.8115  0.8220  0.8189  0.7808  0.7993
 0




NameError: name 'nn_retrieved_scores' is not defined

In [14]:
#save scored data frame
#test_idx_df.to_csv(SAVE_PATH + '/test_idx_df_scored_cnn.csv', header=True)

print "computing ranking metrics..."
cnn_mrr_test = compute_mrr(test_idx_df, score_name='CNN_test')
print "cnn MRR (test): ", np.mean(cnn_mrr_test)

cnn_pr1_test = precision_at_k(test_idx_df, K=1, score_name='CNN_test')
print "cnn P@1 (test): ", np.mean(cnn_pr1_test)

cnn_pr5_test = precision_at_k(test_idx_df, K=5, score_name='CNN_test')
print "cnn P@5 (test): ", np.mean(cnn_pr5_test)

cnn_map_test = compute_map(test_idx_df, score_name='CNN_test')
print "cnn map (test): ", np.mean(cnn_map_test)


#generate plots
plt.figure()
plt.plot(training_loss, label='Adam')
plt.title("CNN Model Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.legend()
plt.savefig('../figures/cnn_training_loss1.png')

plt.figure()
plt.plot(learning_rate_schedule, label='learning rate')
plt.title("CNN learning rate schedule")
plt.xlabel("Epoch")
plt.ylabel("Learning rate")
plt.legend()
plt.savefig('../figures/cnn_learning_rate_schedule1.png')

"""
plt.figure()
plt.plot(validation_loss, label='Adam')
plt.title("CNN Model Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")
plt.legend()
plt.savefig('../figures/cnn_validation_loss.png')
"""

        

computing ranking metrics...


AttributeError: 'float' object has no attribute 'split'