In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import ConfigParser
from tqdm import tqdm
from time import time
import cPickle as pickle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ranking_metrics import compute_mrr, precision_at_k, compute_map
import sys

# Seed NumPy's global RNG. torch's own RNG is not seeded here (the call below
# is disabled), so anything drawing from torch's generator is not pinned.
np.random.seed(0)
#torch.manual_seed(0)

# Read paths and data-shape settings from the project config file.
# The file is opened in a `with` block so the handle is closed after parsing.
config = ConfigParser.ConfigParser()
with open(r'../src/config.ini') as config_file:
    config.readfp(config_file)
SAVE_PATH = config.get('paths', 'save_path')
DATA_FILE_NAME = config.get('paths', 'extracted_data_file_name')
TRAIN_TEST_FILE_NAME = config.get('paths', 'train_test_file_name')
SAVE_NAME = config.get('cnn_params', 'save_name')
NUM_NEGATIVE = int(config.get('data_params', 'NUM_NEGATIVE'))

MAX_TITLE_LEN = int(config.get('data_params', 'MAX_TITLE_LEN'))
MAX_BODY_LEN = int(config.get('data_params', 'MAX_BODY_LEN'))

# Paths are built by plain concatenation, so SAVE_PATH must already end with
# a path separator.
data_filename = SAVE_PATH + DATA_FILE_NAME
train_test_filename = SAVE_PATH + TRAIN_TEST_FILE_NAME

print("loading pickled data...")
tic = time()
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
# Pickle streams are binary data, so the files are opened with 'rb'. The
# `with` statement closes each file; no explicit close() is needed.
with open(data_filename, 'rb') as f:
    train_text_df, train_idx_df, dev_idx_df, test_idx_df, embeddings, word_to_idx = pickle.load(f)
with open(train_test_filename, 'rb') as f:
    train_data, val_data, test_data = pickle.load(f)
toc = time()
print("elapsed time: %.2f sec" % (toc - tic))

loading pickled data...
elapsed time: 156.45 sec


In [11]:
# --- training hyper-parameters ---
num_epochs = 2   # alternative value noted by the author: 16
batch_size = 32

# --- optimisation hyper-parameters ---
learning_rate = 1e-3
weight_decay = 1e-5

# --- model hyper-parameters ---
embed_num = len(word_to_idx)     # number of entries in the word -> index map
embed_dim = len(embeddings[0])   # length of one embedding row
kernel_num = 150                 # filters per kernel size  TODO: tune
kernel_sizes = [2, 3, 4, 5]      # filter heights, one conv layer per entry

class CNN(nn.Module):
    """Convolutional text encoder on top of a frozen, pre-trained embedding table.

    One Conv2d layer is created per entry of ``kernel_sizes``; each spans the
    full embedding width, so it slides along the token axis only. The outputs
    of all layers are tanh-activated, average-pooled over the token axis and
    concatenated.

    Args:
        embed_num: number of rows of the embedding table.
        embed_dim: width of one embedding vector.
        kernel_num: number of filters (output channels) per kernel size.
        kernel_sizes: iterable of filter heights (tokens covered per filter).
        pretrained_embeddings: optional 2-D numpy array used to initialise the
            embedding table. When omitted, the module-level ``embeddings``
            array is used, which keeps existing four-argument calls working.
    """

    def __init__(self, embed_num, embed_dim, kernel_num, kernel_sizes, pretrained_embeddings=None):
        super(CNN, self).__init__()
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the notebook-level array.
            pretrained_embeddings = embeddings

        V = embed_num
        D = embed_dim
        Ci = 1            #input channel
        Co = kernel_num   #depth
        Ks = kernel_sizes #height of each filter

        self.embed = nn.Embedding(V, D)
        # The embedding table is frozen: it is excluded from gradient updates.
        self.embed.weight.requires_grad = False
        # Cast to float32 so the table matches the dtype of the conv layers
        # even if the numpy array was stored as float64 (no-op for float32).
        self.embed.weight.data = torch.from_numpy(pretrained_embeddings).float()
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

    def forward(self, x):
        """Encode a batch of token-index sequences.

        Args:
            x: integer index tensor of shape (N, W).

        Returns:
            Tensor of shape (N, kernel_num * len(kernel_sizes)).
        """
        x = self.embed(x) # (N,W,D)
        x = x.unsqueeze(1) # (N,Ci,W,D)
        # Each conv covers the whole embedding width, leaving a trailing
        # dimension of size 1 that is squeezed away.
        x = [F.tanh(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W-K+1), ...]*len(Ks)
        # Average over every remaining position of the token axis.
        x = [F.avg_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

# Instantiate the encoder and move it to the GPU when one is available.
model = CNN(embed_num, embed_dim, kernel_num, kernel_sizes)

use_gpu = torch.cuda.is_available()
if use_gpu:
    print("found CUDA GPU...")
    model = model.cuda()

print(model)

#define loss and optimizer
# These three objects are passed to model_cnn() further down, so they must
# exist after a fresh Restart & Run All.
criterion = nn.MultiMarginLoss(p=1, margin=0.4)  # loss is averaged over the batch by default
# The embedding table is frozen (requires_grad=False), so only the remaining
# parameters are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)
scheduler = StepLR(optimizer, step_size=4, gamma=0.5) #half learning rate every 4 epochs

print("training...")
print("train data size:" + str(len(train_data)))
print("val data size:" + str(len(val_data)))
print("test data size:" + str(len(test_data)))

CNN (
  (embed): Embedding(100406, 200)
  (convs1): ModuleList (
    (0): Conv2d(1, 150, kernel_size=(2, 200), stride=(1, 1))
    (1): Conv2d(1, 150, kernel_size=(3, 200), stride=(1, 1))
    (2): Conv2d(1, 150, kernel_size=(4, 200), stride=(1, 1))
    (3): Conv2d(1, 150, kernel_size=(5, 200), stride=(1, 1))
  )
)
training...
train data size:22853
val data size:189
test data size:186


In [12]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
cnn_num_params = sum([np.prod(p.size()) for p in model_parameters])
print "number of trainable params: ", cnn_num_params

number of trainable params:  420600


In [None]:
def _encode_question(model, title, body):
    """Encode one question as the mean of the model outputs for its title and body."""
    return (model(title) + model(body)) / 2.0


def model_cnn(is_training_phase, num_epochs, data_to_load, idx_df, batch_size, number_negative_examples, model, criterion, optimizer, scheduler, model_name, use_gpu, save_model_at):
    """Train or evaluate `model` on a ranking task and record per-query scores.

    Every batch is expected to be a dict with the keys 'query_idx',
    'query_title', 'query_body', 'similar_title', 'similar_body' and, for each
    i in range(number_negative_examples), 'random_title_<i>' and
    'random_body_<i>'. Each question is encoded as the mean of its title and
    body encodings. The query is compared by cosine similarity with the
    similar question (column 0) and with every random question (columns 1..).

    Args:
        is_training_phase: True to train (backprop, scheduler step, model
            checkpoint per epoch, early stopping); False to run exactly one
            evaluation pass.
        num_epochs: maximum number of training epochs; ignored in evaluation.
        data_to_load: dataset handed to torch.utils.data.DataLoader.
        idx_df: DataFrame with a 'query_id' column. It is modified in place:
            column `model_name` receives the space-joined scores of the random
            (negative) candidates for every processed query.
        batch_size: DataLoader batch size.
        number_negative_examples: number of random candidates read per query.
        model: module mapping an index tensor to an encoding.
        criterion: loss called as criterion(scores, targets) with all targets 0.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training epoch.
        model_name: name of the score column written into `idx_df`.
        use_gpu: when True, batch tensors are moved to CUDA.
        save_model_at: path passed to torch.save after every training epoch.

    Returns:
        (loss_per_epoch, idx_df, learning_rate_schedule). The schedule holds
        one scheduler.get_lr() entry per training epoch and is an empty list
        in evaluation.
    """
    print("Model invoked")
    loss_per_epoch = []
    learning_rate_schedule = []

    #early stopping settings
    patience = 4
    min_delta = 0.1
    patience_cnt = 0

    # Shuffling and dropping the incomplete final batch are only wanted while
    # training. In evaluation every query must be scored; otherwise up to
    # batch_size - 1 rows of idx_df never receive a score.
    data_loader = torch.utils.data.DataLoader(
        data_to_load,
        batch_size=batch_size,
        shuffle=is_training_phase,
        num_workers=4,
        drop_last=is_training_phase)

    if not is_training_phase:
        num_epochs = 1

    # Stateless module, so it is built once instead of once per batch.
    cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

    for epoch in range(num_epochs):
        print("epoch value: " + str(epoch))
        loss_over_batches = 0.0

        if is_training_phase:
            model.train()
            scheduler.step()
        else:
            model.eval()

        for batch in tqdm(data_loader):
            query_idx = batch['query_idx']
            query_title = Variable(batch['query_title'])
            query_body = Variable(batch['query_body'])
            similar_title = Variable(batch['similar_title'])
            similar_body = Variable(batch['similar_body'])

            #random negative examples
            random_title_list = [Variable(batch['random_title_' + str(ridx)])
                                 for ridx in range(number_negative_examples)]
            random_body_list = [Variable(batch['random_body_' + str(ridx)])
                                for ridx in range(number_negative_examples)]

            if use_gpu:
                query_title, query_body = query_title.cuda(), query_body.cuda()
                similar_title, similar_body = similar_title.cuda(), similar_body.cuda()
                # List comprehensions keep these indexable lists on Python 3 too.
                random_title_list = [item.cuda() for item in random_title_list]
                random_body_list = [item.cuda() for item in random_body_list]

            if is_training_phase:
                optimizer.zero_grad()

            nn_query = _encode_question(model, query_title, query_body)
            nn_similar = _encode_question(model, similar_title, similar_body)
            nn_random_list = [_encode_question(model, title, body)
                              for title, body in zip(random_title_list, random_body_list)]

            # Column 0 is the positive score, the rest are the negatives.
            score_list = [cosine_similarity(nn_query, nn_similar)]
            score_list.extend(cosine_similarity(nn_query, nn_random) for nn_random in nn_random_list)

            X_scores = torch.stack(score_list, 1) #[batch, 1 + number_negative_examples]
            y_targets = Variable(torch.zeros(X_scores.size(0)).type(torch.LongTensor)) #[batch]
            if use_gpu:
                y_targets = y_targets.cuda()
            loss = criterion(X_scores, y_targets) #y_target=0

            if is_training_phase:
                loss.backward()
                optimizer.step()

            loss_over_batches += loss.cpu().data[0]

            #save scores to data-frame
            nn_query_idx = query_idx.numpy()
            # .cpu() is required before .numpy() when the scores live on the GPU.
            nn_retrieved_scores = X_scores.data.cpu().numpy()[:, 1:] #skip positive score
            for row, qidx in enumerate(nn_query_idx):
                idx_df.loc[idx_df['query_id'] == qidx, model_name] = " ".join(nn_retrieved_scores[row, :].astype('str'))

        #end for-batch
        loss_per_epoch.append(loss_over_batches)

        if is_training_phase:
            learning_rate_schedule.append(scheduler.get_lr())
            print("epoch: %4d, training loss: %.4f" % (epoch + 1, loss_over_batches))

            torch.save(model, save_model_at)

            #early stopping: the counter resets whenever the epoch loss drops
            #by more than min_delta; the first epoch has no predecessor and
            #therefore counts as a non-improving epoch.
            if epoch > 0 and (loss_per_epoch[epoch-1] - loss_per_epoch[epoch] > min_delta):
                patience_cnt = 0
            else:
                patience_cnt += 1

            if patience_cnt > patience:
                print("early stopping...")
                break

    #end for-epoch
    # learning_rate_schedule is only appended to while training, so it is
    # already the empty list in evaluation.
    return loss_per_epoch, idx_df, learning_rate_schedule


In [None]:
training_loss, train_idx_df, learning_rate_schedule = model_cnn(True, num_epochs, train_data, train_idx_df, batch_size, 40,  model, criterion, optimizer, scheduler, 'CNN_train', use_gpu, (SAVE_PATH + SAVE_NAME))

print "testing..."
running_test_loss, test_idx_df, _ = model_cnn(False, num_epochs, test_data, test_idx_df, batch_size, 20,  model, criterion, optimizer, scheduler, 'CNN_test', use_gpu, (SAVE_PATH + SAVE_NAME + "test"))

    
print "total test loss: ", running_test_loss
print "number of NaN: \n", test_idx_df.isnull().sum()
#test_idx_df = test_idx_df.dropna() #NaNs are due to restriction: range(100)

In [None]:
#save scored data frame
#test_idx_df.to_csv(SAVE_PATH + '/test_idx_df_scored_cnn.csv', header=True)

print "computing ranking metrics..."
cnn_mrr_test = compute_mrr(test_idx_df, score_name='CNN_test')
print "cnn MRR (test): ", np.mean(cnn_mrr_test)

cnn_pr1_test = precision_at_k(test_idx_df, K=1, score_name='CNN_test')
print "cnn P@1 (test): ", np.mean(cnn_pr1_test)

cnn_pr5_test = precision_at_k(test_idx_df, K=5, score_name='CNN_test')
print "cnn P@5 (test): ", np.mean(cnn_pr5_test)

cnn_map_test = compute_map(test_idx_df, score_name='CNN_test')
print "cnn map (test): ", np.mean(cnn_map_test)


#generate plots
plt.figure()
plt.plot(training_loss, label='Adam')
plt.title("CNN Model Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.legend()
plt.savefig('../figures/cnn_training_loss1.png')

plt.figure()
plt.plot(learning_rate_schedule, label='learning rate')
plt.title("CNN learning rate schedule")
plt.xlabel("Epoch")
plt.ylabel("Learning rate")
plt.legend()
plt.savefig('../figures/cnn_learning_rate_schedule1.png')

"""
plt.figure()
plt.plot(validation_loss, label='Adam')
plt.title("CNN Model Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")
plt.legend()
plt.savefig('../figures/cnn_validation_loss.png')
"""

        