In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import ConfigParser
from tqdm import tqdm
from time import time
import cPickle as pickle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ranking_metrics import compute_mrr, precision_at_k, compute_map

np.random.seed(0)
#torch.manual_seed(0)

config = ConfigParser.ConfigParser()
config.readfp(open(r'../src/config.ini'))

SAVE_PATH = config.get('paths', 'save_path')
DATA_FILE_NAME = config.get('paths', 'extracted_data_file_name')
TRAIN_TEST_FILE_NAME = config.get('paths', 'train_test_file_name')
SAVE_NAME = config.get('rnn_params', 'save_name')
NUM_NEGATIVE = int(config.get('data_params', 'NUM_NEGATIVE')) 

MAX_TITLE_LEN = int(config.get('data_params', 'MAX_TITLE_LEN'))
MAX_BODY_LEN = int(config.get('data_params', 'MAX_BODY_LEN'))

data_filename = SAVE_PATH + DATA_FILE_NAME
train_test_filename = SAVE_PATH + TRAIN_TEST_FILE_NAME

print "loading pickled data..."
tic = time()
with open(data_filename) as f:  
    train_text_df, train_idx_df, dev_idx_df, test_idx_df, embeddings, word_to_idx = pickle.load(f)
f.close()
with open(train_test_filename) as f:
    train_data, val_data, test_data = pickle.load(f)
f.close()
toc = time()
print "elapsed time: %.2f sec" %(toc - tic)

tokenizer = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))

#visualize data
f, (ax1, ax2) = plt.subplots(1, 2)
sns.distplot(train_text_df['title_len'], hist=True, kde=True, color='b', label='title len', ax=ax1)
sns.distplot(train_text_df[train_text_df['body_len'] < 256]['body_len'], hist=True, kde=True, color='r', label='body len', ax=ax2)
ax1.axvline(x=MAX_TITLE_LEN, color='k', linestyle='--', label='max len')
ax2.axvline(x=MAX_BODY_LEN, color='k', linestyle='--', label='max len')
ax1.set_title('title length histogram'); ax1.legend(loc=1); 
ax2.set_title('body length histogram'); ax2.legend(loc=1);
plt.savefig('../figures/question_len_hist.png')

loading pickled data...
elapsed time: 53.29 sec


In [25]:

#model parameters
embed_dim = embeddings.shape[1] #200
hidden_size = 128 #hidden vector dim 
weight_decay = 1e-5 
learning_rate = 1e-3
class  CNN(nn.Module):
    def __init__(self, embed_num, embed_dim, kernel_num, kernel_sizes):
        super(CNN,self).__init__()
        V = embed_num
        D = embed_dim
        Ci = 1            #input channel
        Co = kernel_num   #depth
        Ks = kernel_sizes #height of each filter

        self.embed = nn.Embedding(V, D)
        self.embed.weight.data = torch.from_numpy(embeddings)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

    def forward(self, x):
        print "--\n input\n--\n"
        print x.data.shape
        x = self.embed(x) # (N,W,D)
        print "--\n after embed\n--\n"
        print x.data.shape
        x = x.unsqueeze(1) # (N,Ci,W,D)
        print "--\n after unsqueeze\n--\n"
        print x.data.shape
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
        print "--\n RELU\n--\n"
        for t in x:
            print t.data.shape
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        print "--\n after maxpool\n--\n"
        for t in x:
            print t.data.shapex = torch.cat(x, 1)
        print "--\n after concat\n--\n"
        print x.data.shape
        return x

batch_size = 32
embed_num = len(word_to_idx)
embed_dim = len(embeddings[0])
kernel_num = 100  
kernel_sizes = range(2,6)
model = CNN(embed_num, embed_dim, kernel_num, kernel_sizes)
print model

CNN (
  (embed): Embedding(100406, 200)
  (convs1): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(2, 200), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 200), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 200), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(5, 200), stride=(1, 1))
  )
)


In [23]:
#m = nn.AvgPool1d(3, stride=1)
x = torch.Tensor([[[20, 30]]])
m = [nn.AvgPool1d(i, i.size(2)).squeeze(2) for i in x]
m(Variable(torch.Tensor([[[1,2,3,4,5,6,7]]])))

RuntimeError: invalid argument 2: dimension 2 out of range of 2D tensor at /opt/conda/conda-bld/pytorch_1503961620703/work/torch/lib/TH/generic/THTensor.c:24

In [26]:

print "training..."
for epoch in range(2):
    
    running_train_loss = 0.0
    
    train_data_loader = torch.utils.data.DataLoader(
        train_data, 
        batch_size = batch_size,
        shuffle = True,
        num_workers = 4, 
        drop_last = True)
        
    model.train()
    scheduler.step()
        
    for batch in tqdm(train_data_loader):
    
        query_title = Variable(batch['query_title'])
        query_body = Variable(batch['query_body'])        
        optimizer.zero_grad()
        print query_title.data.shape
        cnn_query_title = model(query_title)
        sys.exit(0)




  0%|          | 0/1 [00:00<?, ?it/s]

training...


[A[A[A

torch.Size([32, 20])
--
 input
--

torch.Size([32, 20])
--
 after embed
--

torch.Size([32, 20, 200])
--
 after unsqueeze
--

torch.Size([32, 1, 20, 200])
--
 RELU
--

4
torch.Size([32, 100, 19])
--
 after maxpool
--

4
torch.Size([32, 100])
--
 after concat
--

torch.Size([32, 400])





[A[A[A

NameError: name 'sys' is not defined

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import ConfigParser
from tqdm import tqdm
from time import time
import cPickle as pickle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ranking_metrics import compute_mrr, precision_at_k, compute_map

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
np.random.seed(0)
from operator import itemgetter, attrgetter
from sklearn.metrics.pairwise import cosine_similarity
from meter import AUCMeter
#torch.manual_seed(0)

In [None]:
config = ConfigParser.ConfigParser()
config.readfp(open(r'../src/config.ini'))
SAVE_PATH = config.get('paths', 'save_path')
DATA_FILE_NAME = config.get('paths', 'extracted_data_file_name')
TRAIN_TEST_FILE_NAME = config.get('paths', 'train_test_file_name')
SAVE_NAME = config.get('cnn_params', 'save_name')
NUM_NEGATIVE = int(config.get('data_params', 'NUM_NEGATIVE')) 
DATA_PATH_TARGET = config.get('paths', 'data_path_target')
MAX_TITLE_LEN = int(config.get('data_params', 'MAX_TITLE_LEN'))
MAX_BODY_LEN = int(config.get('data_params', 'MAX_BODY_LEN'))

data_filename = SAVE_PATH + DATA_FILE_NAME
train_test_filename = SAVE_PATH + TRAIN_TEST_FILE_NAME

In [None]:
'''
class  CNN(nn.Module):
    def __init__(self, embed_num, embed_dim, kernel_num, kernel_sizes):
        super(CNN,self).__init__()
        V = embed_num
        D = embed_dim
        Ci = 1            #input channel
        Co = kernel_num   #depth
        Ks = kernel_sizes #height of each filter

        self.embed = nn.Embedding(V, D)
        self.embed.weight.data = torch.from_numpy(embeddings)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

    def forward(self, x):
        x = self.embed(x) # (N,W,D)
        x = x.unsqueeze(1) # (N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

#CNN parameters
batch_size = 32
embed_num = len(word_to_idx)
embed_dim = len(embeddings[0])
kernel_num = 100  
kernel_sizes = range(2,6)
model = CNN(embed_num, embed_dim, kernel_num, kernel_sizes)
'''

In [None]:
model1 = torch.load("../trained_models/cnn_baseline_full_40neg.pt")

In [None]:
train_text_df['title_body'] = train_text_df['title'] + " " + train_text_df['body']

In [None]:
vectorizer = TfidfVectorizer(max_df=0.8, max_features=None,
                                 min_df=2, stop_words='english', strip_accents = 'ascii',
                             )

In [None]:
vec_obj = vectorizer.fit(train_text_df['title_body'].tolist())

In [None]:
target_text_file = DATA_PATH_TARGET + 'corpus.txt'
target_text_df = pd.read_table(target_text_file, sep='\t', header=None)
target_text_df.columns = ['id', 'title', 'body']


In [None]:
def get_similarity(embed1, embed2):
    # embed1, embed2 could be tf-idf vectors, word embeddings, anything.
    return cosine_similarity(embed1, embed2)

def process_file(query_id_path, text_pd, vectorizer, ground_truth):
    similarity_vector = []
    ground_truth_arr = []
    
    data_frame = pd.read_table(query_id_path, sep=' ', header=None)
    data_frame.columns = ['query_id', 'candidate_id']
    
    #num_samples = min(100,data_frame.shape[0])
    num_samples = data_frame.shape[0]
    for idx in tqdm(range(num_samples)):
        #try:
            ind1 = np.where(text_pd['idz'] == data_frame.loc[idx,'query_id'])
            ind2 = np.where(text_pd['idz'] == data_frame.loc[idx,'candidate_id'])
            ind1 = int(ind1[0])
            ind2 = int(ind2[0])
            q1 = text_pd.loc[ind1,'body']
            q2 = text_pd.loc[ind2,'body']
            s = get_similarity(vectorizer.transform([q1]),vectorizer.transform([q2]))
            similarity_vector.append(float(s[0][0]))
            ground_truth_arr.append(ground_truth)
        #except:
         #    print "oopsie1" 
            
        
    return similarity_vector, ground_truth_arr


In [None]:
text_pd = pd.read_table(DATA_PATH_TARGET + 'corpus.txt', sep='\t', header=None)
text_pd.columns = ['idz', 'text','body']
text_pd['body'] = text_pd['text'] + " " + text_pd['body']
text_pd = text_pd.dropna()
text_pd = text_pd.reset_index()

In [None]:
auc_obj = AUCMeter()
target_dev_neg = DATA_PATH_TARGET + 'test.neg.txt'
sim_dev_neg, ground_truth_neg = process_file(target_dev_neg, text_pd, vectorizer, 0)

target_dev_pos = DATA_PATH_TARGET + 'test.pos.txt'
sim_dev_pos, ground_truth_pos = process_file(target_dev_pos, text_pd, vectorizer, 1)

In [None]:
print len(sim_dev_pos)
auc_meter = AUCMeter()
auc_meter.add(np.array(sim_dev_pos), np.array(ground_truth_pos))
auc_meter.add(np.array(sim_dev_neg), np.array(ground_truth_neg))
print auc_meter.value(0.05)

In [None]:
auc_meter = AUCMeter()
auc_meter.add(np.array([0.4,0.2,0.4,0.2]),np.array([1,1,1,0]))


In [None]:
idf = vectorizer.idf_
xy = dict(zip(vectorizer.get_feature_names(), idf))
sorted_x = sorted(xy.items(), key= itemgetter(1))
xxx = pd.DataFrame(sorted_x)
writer = pd.ExcelWriter('output.xlsx')
xxx.to_excel(writer,'Sheet1')
writer.save()