In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import ConfigParser
from tqdm import tqdm
from time import time
import cPickle as pickle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ranking_metrics import compute_mrr, precision_at_k, compute_map

np.random.seed(0)
#torch.manual_seed(0)

config = ConfigParser.ConfigParser()
config.readfp(open(r'../src/config.ini'))
SAVE_PATH = config.get('paths', 'save_path')
DATA_FILE_NAME = config.get('paths', 'extracted_data_file_name')
TRAIN_TEST_FILE_NAME = config.get('paths', 'train_test_file_name')
SAVE_NAME = config.get('cnn_params', 'save_name')
NUM_NEGATIVE = int(config.get('data_params', 'NUM_NEGATIVE')) 

MAX_TITLE_LEN = int(config.get('data_params', 'MAX_TITLE_LEN'))
MAX_BODY_LEN = int(config.get('data_params', 'MAX_BODY_LEN'))

data_filename = SAVE_PATH + DATA_FILE_NAME
train_test_filename = SAVE_PATH + TRAIN_TEST_FILE_NAME

print "loading pickled data..."
tic = time()
with open(data_filename) as f:  
    train_text_df, train_idx_df, dev_idx_df, test_idx_df, embeddings, word_to_idx = pickle.load(f)
f.close()
with open(train_test_filename) as f:
    train_data, val_data, test_data = pickle.load(f)
f.close()
toc = time()
print "elapsed time: %.2f sec" %(toc - tic)

loading pickled data...
elapsed time: 60.41 sec


In [20]:
# ---- training hyper-parameters ----
num_epochs = 2   # original note beside this value: 16
batch_size = 16

# ---- model hyper-parameters ----
embed_num = len(word_to_idx)     # number of entries in the word -> index map
embed_dim = len(embeddings[0])   # length of a single embedding row
kernel_num = 100                 # TODO: tune
kernel_sizes = [2, 3, 4, 5]      # filter heights, in tokens
learning_rate = 1e-3
weight_decay = 1e-5

class CNN(nn.Module):
    """Convolutional text encoder: embed -> parallel convs -> max-over-time pool.

    One Conv2d is created per entry of ``kernel_sizes``; each spans the full
    embedding width, so it slides only along the token axis. The pooled
    outputs of all convolutions are concatenated into one vector per input.
    """

    def __init__(self, embed_num, embed_dim, kernel_num, kernel_sizes,
                 pretrained_embeddings=None):
        """Build the embedding table and the convolution bank.

        Args:
            embed_num: number of rows requested for the embedding table.
            embed_dim: embedding width; also the width of every conv filter.
            kernel_num: number of output channels per convolution.
            kernel_sizes: iterable of filter heights (in tokens).
            pretrained_embeddings: optional 2-D array of initial embedding
                weights. When omitted, the notebook-level ``embeddings``
                variable is used, which is what the original code always did.
        """
        super(CNN, self).__init__()
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the global loaded
            # from the pickle in the first cell.
            pretrained_embeddings = embeddings

        in_channels = 1  # the embedded sentence is treated as a 1-channel image

        self.embed = nn.Embedding(embed_num, embed_dim)
        # np.array(...) makes a private float32 copy. torch.from_numpy shares
        # memory with its argument, so without the copy every in-place weight
        # update on CPU would also rewrite the caller's numpy array, and a
        # float64 array would give weights that do not match the conv layers.
        self.embed.weight.data = torch.from_numpy(
            np.array(pretrained_embeddings, dtype=np.float32))
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(in_channels, kernel_num, (K, embed_dim)) for K in kernel_sizes])

    def forward(self, x):
        """Encode a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, shape (N, W).

        Returns:
            Tensor of shape (N, kernel_num * len(kernel_sizes)).
        """
        x = self.embed(x)   # (N, W, D)
        x = x.unsqueeze(1)  # (N, 1, W, D)
        # Each filter covers the whole embedding width, so the last dimension
        # collapses to 1 and is squeezed away: (N, Co, W-K+1) per filter size.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        # Max over the token axis: (N, Co) per filter size.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        return x

model = CNN(embed_num, embed_dim, kernel_num, kernel_sizes)

use_gpu = torch.cuda.is_available()
if use_gpu:
    print "found CUDA GPU..."
    model = model.cuda()

print model

#define loss and optimizer
criterion = nn.MultiMarginLoss(p=1, margin=0.4, size_average=True)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = StepLR(optimizer, step_size=4, gamma=0.5) #half learning rate every 4 epochs

learning_rate_schedule = [] 
training_loss, validation_loss, test_loss = [], [], []

print "training..."
for epoch in range(num_epochs):
    
    running_train_loss = 0.0
    
    train_data_loader = torch.utils.data.DataLoader(
        train_data, 
        batch_size = batch_size,
        shuffle = True,
        num_workers = 4, 
        drop_last = True)
        
    model.train()
    scheduler.step()
        
    for batch in tqdm(train_data_loader):
    
        query_title = Variable(batch['query_title'])
        query_body = Variable(batch['query_body'])
        similar_title = Variable(batch['similar_title'])
        similar_body = Variable(batch['similar_body'])

        random_title_list = []
        random_body_list = []
        for ridx in range(NUM_NEGATIVE): #number of random negative examples
            random_title_name = 'random_title_' + str(ridx)
            random_body_name = 'random_body_' + str(ridx)
            random_title_list.append(Variable(batch[random_title_name]))
            random_body_list.append(Variable(batch[random_body_name]))

        if use_gpu:
            query_title, query_body = query_title.cuda(), query_body.cuda()
            similar_title, similar_body = similar_title.cuda(), similar_body.cuda()
            random_title_list = map(lambda item: item.cuda(), random_title_list)
            random_body_list = map(lambda item: item.cuda(), random_body_list)
        
        optimizer.zero_grad()

        cnn_query_title = model(query_title)
        cnn_query_body = model(query_body)
        cnn_query = (cnn_query_title + cnn_query_body)/2.0

        cnn_similar_title = model(similar_title)
        cnn_similar_body = model(similar_body)
        cnn_similar = (cnn_similar_title + cnn_similar_body)/2.0

        cnn_random_list = []
        for ridx in range(len(random_title_list)):
            cnn_random_title = model(random_title_list[ridx])
            cnn_random_body = model(random_body_list[ridx])
            cnn_random = (cnn_random_title + cnn_random_body)/2.0
            cnn_random_list.append(cnn_random)
        #end for
           
        cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
        score_pos = cosine_similarity(cnn_query, cnn_similar)

        score_list = []
        score_list.append(score_pos)
        for ridx in range(len(cnn_random_list)):
            score_neg = cosine_similarity(cnn_query, cnn_random_list[ridx])
            score_list.append(score_neg)

        X_scores = torch.stack(score_list, 1) #[batch_size, K=101]
        print X_scores
        y_targets = Variable(torch.zeros(X_scores.size(0)).type(torch.LongTensor)) #[batch_size]
        if use_gpu:
            y_targets = y_targets.cuda()
        loss = criterion(X_scores, y_targets) #y_target=0
        loss.backward()
        optimizer.step()
                
        running_train_loss += loss.cpu().data[0]        
        
    #end for

    training_loss.append(running_train_loss)
    learning_rate_schedule.append(scheduler.get_lr())
    print "epoch: %4d, training loss: %.4f" %(epoch+1, running_train_loss)
    
    torch.save(model, SAVE_PATH + SAVE_NAME)

    #early stopping
    patience = 4
    min_delta = 0.1
    if epoch == 0:
        patience_cnt = 0
    elif epoch > 0 and training_loss[epoch-1] - training_loss[epoch] > min_delta:
        patience_cnt = 0
    else:
        patience_cnt += 1

    if patience_cnt > patience:
        print "early stopping..."
        break
#end for
"""
print "loading pre-trained model..."
model = torch.load(SAVE_PATH)
if use_gpu:
    print "found CUDA GPU..."
    model = model.cuda()
"""

  0%|          | 0/2 [00:00<?, ?it/s]

CNN (
  (embed): Embedding(100406, 200)
  (convs1): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(2, 200), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 200), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 200), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(5, 200), stride=(1, 1))
  )
)
training...
Variable containing:

Columns 0 to 9 
 0.9621  0.9415  0.9465  0.9624  0.9668  0.9639  0.9668  0.9589  0.9559  0.9505
 0.9795  0.9415  0.9465  0.9624  0.9668  0.9639  0.9668  0.9589  0.9559  0.9505
 0.9492  0.9496  0.9457  0.9468  0.9227  0.9410  0.9430  0.9284  0.9321  0.9416
 0.9450  0.9214  0.9060  0.9234  0.9141  0.9122  0.9036  0.9037  0.9300  0.9222
 0.9385  0.9280  0.9089  0.9163  0.8936  0.9048  0.9171  0.9190  0.9182  0.9267
 0.9184  0.9496  0.9457  0.9468  0.9227  0.9410  0.9430  0.9284  0.9321  0.9416
 0.9550  0.9496  0.9457  0.9468  0.9227  0.9410  0.9430  0.9284  0.9321  0.9416
 0.9734  0.9513  0.9327  0.9553  0.9564  0.9539  0.9418  0.9520  0.9506  0.

 50%|█████     | 1/2 [00:37<00:37, 37.66s/it]

Variable containing:

Columns 0 to 9 
 0.9644  0.9547  0.9553  0.9572  0.9592  0.9618  0.9531  0.9423  0.9494  0.9435
 0.9706  0.9449  0.9570  0.9666  0.9511  0.9459  0.9333  0.9597  0.9605  0.9443
 0.9658  0.9509  0.9694  0.9273  0.9601  0.9690  0.9519  0.9574  0.9511  0.9267
 0.9609  0.9277  0.9319  0.9339  0.9347  0.9304  0.9360  0.9184  0.9245  0.9399
 0.9649  0.9509  0.9694  0.9273  0.9601  0.9690  0.9519  0.9574  0.9511  0.9267
 0.9627  0.9392  0.9357  0.8998  0.9544  0.9572  0.9392  0.9523  0.9482  0.9592
 0.9424  0.9199  0.9165  0.9091  0.8859  0.9167  0.9272  0.8871  0.8876  0.9131
 0.9632  0.9639  0.9535  0.9545  0.9240  0.9209  0.9489  0.9662  0.9523  0.9447
 0.9550  0.9327  0.9333  0.9332  0.9359  0.9161  0.9304  0.9249  0.9268  0.9278
 0.9667  0.9298  0.9382  0.9363  0.9481  0.9176  0.9325  0.9436  0.9256  0.9448
 0.9498  0.9266  0.9318  0.9207  0.9237  0.9280  0.9217  0.9314  0.9290  0.9404
 0.9622  0.9327  0.9333  0.9332  0.9359  0.9161  0.9304  0.9249  0.9268  0.9278
 0

100%|██████████| 2/2 [01:13<00:00, 37.03s/it]


epoch:    1, training loss: 0.7513


  0%|          | 0/2 [00:00<?, ?it/s]

Variable containing:

Columns 0 to 9 
 0.9753  0.9106  0.9152  0.9035  0.9068  0.9112  0.9056  0.9184  0.9153  0.9276
 0.9761  0.9480  0.9440  0.9456  0.9522  0.9525  0.9408  0.9294  0.9360  0.9291
 0.9806  0.9109  0.9148  0.9389  0.9505  0.9635  0.9482  0.9385  0.9290  0.9304
 0.9396  0.8829  0.8780  0.8651  0.8415  0.8872  0.9058  0.8394  0.8378  0.8785
 0.9546  0.8829  0.8780  0.8651  0.8415  0.8872  0.9058  0.8394  0.8378  0.8785
 0.9892  0.9109  0.9148  0.9389  0.9505  0.9635  0.9482  0.9385  0.9290  0.9304
 0.9651  0.9011  0.9006  0.9014  0.9081  0.8801  0.9016  0.8942  0.9012  0.9014
 0.9531  0.9478  0.9524  0.9415  0.9412  0.9150  0.9380  0.9259  0.9383  0.9395
 0.9734  0.9273  0.9237  0.8811  0.9438  0.9461  0.9266  0.9418  0.9369  0.9507
 0.9577  0.9106  0.9152  0.9035  0.9068  0.9112  0.9056  0.9184  0.9153  0.9276
 0.9670  0.9430  0.9644  0.9096  0.9503  0.9653  0.9427  0.9496  0.9393  0.9141
 0.9779  0.9360  0.9517  0.9633  0.9443  0.9365  0.9256  0.9538  0.9562  0.9375
 0

 50%|█████     | 1/2 [00:19<00:19, 19.72s/it]

Variable containing:

Columns 0 to 9 
 0.9261  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9088  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9655  0.8672  0.8211  0.8671  0.8600  0.8401  0.8106  0.8294  0.8848  0.8639
 0.9574  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9726  0.9504  0.9377  0.9430  0.8952  0.8873  0.9286  0.9610  0.9364  0.9222
 0.9737  0.8985  0.9046  0.9081  0.9270  0.8798  0.9053  0.9193  0.8923  0.9198
 0.9501  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9416  0.9295  0.9533  0.9478  0.9449  0.9203  0.9239  0.9550  0.9397  0.9531
 0.9372  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9802  0.9322  0.9444  0.9312  0.9195  0.9372  0.9301  0.9264  0.9173  0.9435
 0.9520  0.8426  0.8330  0.8161  0.7908  0.8516  0.8795  0.7842  0.7822  0.8403
 0.9806  0.9355  0.9259  0.9326  0.9288  0.9305  0.9459  0.9420  0.9049  0.9037
 0

100%|██████████| 2/2 [00:36<00:00, 18.92s/it]


epoch:    2, training loss: 0.6498


'\nprint "loading pre-trained model..."\nmodel = torch.load(SAVE_PATH)\nif use_gpu:\n    print "found CUDA GPU..."\n    model = model.cuda()\n'

In [24]:
print "scoring test questions..."
running_test_loss = 0.0

test_data_loader = torch.utils.data.DataLoader(
    test_data, 
    batch_size = batch_size,
    shuffle = False,
    num_workers = 4, 
    drop_last = True)
        
model.eval()

for batch in tqdm(test_data_loader):
    print "here!"
    query_idx = batch['query_idx']
    query_title = Variable(batch['query_title'])
    query_body = Variable(batch['query_body'])
    similar_title = Variable(batch['similar_title'])
    similar_body = Variable(batch['similar_body'])

    random_title_list = []
    random_body_list = []
    for ridx in range(20): #number of retrieved (bm25) examples
        random_title_name = 'random_title_' + str(ridx)
        random_body_name = 'random_body_' + str(ridx)
        random_title_list.append(Variable(batch[random_title_name]))
        random_body_list.append(Variable(batch[random_body_name]))

    if use_gpu:
        query_title, query_body = query_title.cuda(), query_body.cuda()
        similar_title, similar_body = similar_title.cuda(), similar_body.cuda()
        random_title_list = map(lambda item: item.cuda(), random_title_list)
        random_body_list = map(lambda item: item.cuda(), random_body_list)
    
    cnn_query_title = model(query_title)
    cnn_query_body = model(query_body)
    cnn_query = (cnn_query_title + cnn_query_body)/2.0

    cnn_similar_title = model(similar_title)
    cnn_similar_body = model(similar_body)
    cnn_similar = (cnn_similar_title + cnn_similar_body)/2.0

    cnn_random_list = []
    for ridx in range(len(random_title_list)):
        cnn_random_title = model(random_title_list[ridx])
        cnn_random_body = model(random_body_list[ridx])
        cnn_random = (cnn_random_title + cnn_random_body)/2.0
        cnn_random_list.append(cnn_random)
    #end for
           
    cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    score_pos = cosine_similarity(cnn_query, cnn_similar)

    score_list = []
    score_list.append(score_pos)
    for ridx in range(len(cnn_random_list)):
        score_neg = cosine_similarity(cnn_query, cnn_random_list[ridx])
        score_list.append(score_neg)
    
    print "haha!"
    X_scores = torch.stack(score_list, 1) #[batch_size, K=101]
    print X_scores
    y_targets = Variable(torch.zeros(X_scores.size(0)).type(torch.LongTensor)) #[batch_size]
    if use_gpu:
        y_targets = y_targets.cuda()
    loss = criterion(X_scores, y_targets) #y_target=0
    running_test_loss += loss.cpu().data[0]        
    
    #save scores to data-frame
    cnn_query_idx = query_idx.numpy()
    cnn_retrieved_scores = X_scores.data.numpy()[:,1:] #skip positive score
    for row, qidx in enumerate(cnn_query_idx):
        test_idx_df.loc[test_idx_df['query_id'] == qidx, 'cnn_score'] = " ".join(cnn_retrieved_scores[row,:].astype('str'))
#end for        
 

  0%|          | 0/1 [00:00<?, ?it/s]

scoring test questions...
here!


100%|██████████| 1/1 [00:03<00:00,  3.10s/it]

Variable containing:
 0.9475
 0.9597
 0.9268
 0.9597
 0.9587
 0.9488
 0.9318
 0.9811
 0.9560
 0.9441
 0.9574
 0.9556
 0.9496
 0.9635
 0.9411
 0.9426
[torch.FloatTensor of size 16]

Variable containing:
 0.9371
 0.9567
 0.9806
 0.9457
 0.9532
 0.9634
 0.9439
 0.9590
 0.9528
 0.9332
 0.9440
 0.9572
 0.9582
 0.9401
 0.9571
 0.9437
[torch.FloatTensor of size 16]

Variable containing:
 0.9271
 0.9363
 0.9626
 0.9423
 0.9533
 0.9524
 0.9372
 0.9616
 0.9573
 0.9460
 0.9368
 0.9520
 0.9548
 0.9388
 0.9510
 0.9510
[torch.FloatTensor of size 16]

Variable containing:
 0.9515
 0.9599
 0.9756
 0.9404
 0.9525
 0.9634
 0.9482
 0.9472
 0.9603
 0.9467
 0.9396
 0.9501
 0.9553
 0.9477
 0.9517
 0.9413
[torch.FloatTensor of size 16]

Variable containing:
 0.8974
 0.9568
 0.9772
 0.9383
 0.9521
 0.9432
 0.9407
 0.9353
 0.9553
 0.9406
 0.9404
 0.9656
 0.9476
 0.9641
 0.9164
 0.9398
[torch.FloatTensor of size 16]

Variable containing:
 0.9502
 0.9463
 0.9793
 0.9382
 0.9587
 0.9639
 0.9320
 0.9661
 0.9487
 0




In [28]:
print cnn_retrieved_scores[0:10,:].astype('str')

[['0.947493553162' '0.937081992626' '0.92705309391' '0.951512098312'
  '0.897426068783' '0.950182437897' '0.930031895638' '0.944299459457'
  '0.917933762074' '0.942549645901' '0.937607169151' '0.944585978985'
  '0.941568851471' '0.936048328876' '0.948451638222' '0.9079413414'
  '0.937844455242' '0.94773453474' '0.94703400135' '0.950279891491']
 ['0.959692239761' '0.956662356853' '0.93625587225' '0.959926307201'
  '0.956772506237' '0.94633603096' '0.957779526711' '0.958075761795'
  '0.958883345127' '0.963328659534' '0.947406113148' '0.963506400585'
  '0.95674341917' '0.966477632523' '0.948509156704' '0.946211159229'
  '0.95744228363' '0.96197271347' '0.942865312099' '0.941648602486']
 ['0.926836252213' '0.980607330799' '0.962563574314' '0.975600838661'
  '0.977192044258' '0.979271113873' '0.979290664196' '0.965135753155'
  '0.951937675476' '0.96437650919' '0.975035309792' '0.982052326202'
  '0.968396604061' '0.973123669624' '0.968552052975' '0.960977196693'
  '0.97052615881' '0.97241491

In [26]:
print test_idx_df.loc[6,'random_id']
print test_idx_df.loc[6,'bm25_score']

51865 48116 271591 509697 457104 456649 451242 486395 341375 450253 502203 343014 100874 448228 390059 499089 287958 94637 239160 496852
53.8664 53.633076 53.01136 52.28167 51.24043 49.793518 49.464836 49.392956 47.692867 47.28724 47.100048 46.77263 46.53052 46.177017 46.04682 45.246006 45.207558 44.60032 44.574905 44.140083


In [27]:
#save scored data frame
#test_idx_df.to_csv(SAVE_PATH + '/test_idx_df_scored_cnn.csv', header=True)

print "computing ranking metrics..."
cnn_mrr_test = compute_mrr(test_idx_df, score_name='cnn_score')
print "cnn MRR (test): ", np.mean(cnn_mrr_test)

cnn_pr1_test = precision_at_k(test_idx_df, K=1, score_name='cnn_score')
print "cnn P@1 (test): ", np.mean(cnn_pr1_test)

cnn_pr5_test = precision_at_k(test_idx_df, K=5, score_name='cnn_score')
print "cnn P@5 (test): ", np.mean(cnn_pr5_test)

cnn_map_test = compute_map(test_idx_df, score_name='cnn_score')
print "cnn map (test): ", np.mean(cnn_map_test)

computing ranking metrics...
cnn MRR (test):  0.646306818182
cnn P@1 (test):  0.5
cnn P@5 (test):  0.4625
cnn map (test):  0.532974881509


In [None]:
#generate plots
plt.figure()
plt.plot(training_loss, label='Adam')
plt.title("CNN Model Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.legend()
plt.savefig('../figures/cnn_training_loss.png')

plt.figure()
plt.plot(learning_rate_schedule, label='learning rate')
plt.title("CNN learning rate schedule")
plt.xlabel("Epoch")
plt.ylabel("Learning rate")
plt.legend()
plt.savefig('../figures/cnn_learning_rate_schedule.png')

"""
plt.figure()
plt.plot(validation_loss, label='Adam')
plt.title("CNN Model Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Validation Loss")
plt.legend()
plt.savefig('../figures/cnn_validation_loss.png')
"""