In [12]:
from __future__ import division
import numpy as np
import sys
import csv
import scipy
import time
import math
import cPickle
import random

from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import *
from sklearn.metrics import *
from sklearn.preprocessing import *

from sklearn.svm import *

import os
os.sys.path.insert(0,'../TwitterData/BPE/subword_nmt')

from apply_bpe import BPE


In [2]:
def preprocess_tweet(s):
    '''
    Normalises a raw tweet string into the token format used by the model.

    - '@user' becomes '<at>', and the HTML-escaped '&lt;heart&gt;' /
      '&lt;number&gt;' placeholders become '<heart>' / '<number>'.
    - A double space is treated as an utterance boundary and is replaced by
      ' </s> '; any double space left after that is collapsed to one space.
    - Trailing spaces are removed and the result always ends with '</s>'.

    An empty (or single-space) tweet yields just '</s>' instead of raising
    IndexError as the previous character-by-character strip did.
    '''
    s = (s.replace('@user', '<at>')
          .replace('&lt;heart&gt;', '<heart>')
          .replace('&lt;number&gt;', '<number>')
          .replace('  ', ' </s> ')
          .replace('  ', ' '))

    # Drop trailing spaces only (not other whitespace), as the original loop did.
    s = s.rstrip(' ')
    if not s:
        # Nothing left: the tweet consists solely of the end-of-utterance token.
        return '</s>'

    # Make sure we end with </s> token
    if not s.endswith(' </s>'):
        s = s + ' </s>'
    return s

def process_dialogues(dialogues):
    '''
    Splits each index-encoded dialogue into a (context, response) pair.

    The last three entries of every dialogue are discarded. The remainder is
    cut right after the final occurrence of token id 1: everything up to and
    including that token is the context, everything after it is the response,
    to which a closing token id 1 is appended.

    Returns two parallel lists: contexts and responses.
    Raises IndexError if a trimmed dialogue contains no token id 1.
    '''
    contexts, responses = [], []
    for dialogue in dialogues:
        trimmed = dialogue[:-3]
        boundary_positions = [pos for pos in range(len(trimmed)) if trimmed[pos] == 1]
        cut = boundary_positions[-1] + 1
        head, tail = trimmed[:cut], trimmed[cut:]
        contexts.append(head)
        responses.append(tail + [1])
    return contexts, responses

def strs_to_idxs(data, bpe, str_to_idx):
    '''
    Encodes each string as a list of vocabulary indices.

    Every row is stripped, segmented with bpe.segment(), split on whitespace,
    and each resulting token is looked up in str_to_idx. Tokens missing from
    str_to_idx are silently skipped (BPE is expected to leave no unknown
    tokens, but this keeps the lookup from raising KeyError).
    '''
    def _encode(text):
        tokens = bpe.segment(text.strip()).split()
        return [str_to_idx[tok] for tok in tokens if tok in str_to_idx]

    return [_encode(row) for row in data]

def idxs_to_strs(data, bpe, idx_to_str):
    '''
    Decodes rows of vocabulary indices into plain text.

    Indices are mapped through idx_to_str (unknown indices are skipped), the
    tokens are joined with single spaces, and the '@@ ' continuation markers
    are removed so that BPE sub-words are merged back into whole words.
    The bpe argument is accepted but not used.
    '''
    def _decode(row):
        pieces = [idx_to_str[idx] for idx in row if idx in idx_to_str]
        return ' '.join(pieces).replace('@@ ', '')

    return [_decode(row) for row in data]

def idxs_to_bpestrs(data, bpe, idx_to_str):
    '''
    Decodes rows of vocabulary indices into BPE-segmented text.

    Indices are mapped through idx_to_str (unknown indices are skipped) and
    the tokens are joined with single spaces. Unlike idxs_to_strs, the '@@'
    continuation markers are left in place, so sub-words stay separate.
    The bpe argument is accepted but not used.
    '''
    decoded = []
    for row in data:
        pieces = [idx_to_str[idx] for idx in row if idx in idx_to_str]
        decoded.append(' '.join(pieces))
    return decoded



In [3]:
def mat_vector_2norm_squared(mat):
    '''
    Takes as input a 2-D matrix, and returns a 1-D array whose i-th entry is
    the squared 2-norm (dot product with itself) of the i-th row vector.

    The previous implementation indexed row 0 on every iteration, so every
    entry was the squared norm of the first row.
    '''
    rows = np.asarray(mat)
    # Element-wise square summed along each row == row . row for every row.
    return (rows * rows).sum(axis=1)


def tfidf_retrieval(tfidf_vec, train_contexts_txt, train_responses_txt, output_file):
    '''
    For every context, retrieves the response paired with the most similar
    *other* context (cosine similarity) and writes the retrieved responses to
    output_file, one per line.

    Parameters:
        tfidf_vec           -- object exposing .toarray() that returns a 2-D
                               array with one row per context (e.g. a scipy
                               sparse matrix).
        train_contexts_txt  -- context strings, parallel to the rows of
                               tfidf_vec; only used for the progress print.
        train_responses_txt -- response strings, parallel to the rows of
                               tfidf_vec.
        output_file         -- path of the text file to (over)write.

    Raises ValueError when exactly one row is given, because there is no
    other context to retrieve from. With zero rows an empty file is written.

    NOTE: this builds a dense (n_rows x n_rows) similarity matrix, so memory
    grows quadratically with the number of contexts.
    '''
    print(type(tfidf_vec))
    dense = tfidf_vec.toarray()
    print(dense.shape)

    num_rows = dense.shape[0]
    if num_rows == 1:
        raise ValueError('tfidf_retrieval needs at least two contexts to find a neighbour')

    # Scale every row to unit length so the dot product below is the cosine
    # similarity. All-zero rows keep a divisor of 1 and therefore get a
    # similarity of 0 with everything instead of producing NaNs.
    norms = np.sqrt((dense * dense).sum(axis=1))
    norms[norms == 0] = 1.0
    unit_rows = dense / norms[:, np.newaxis]
    sim_mat = np.dot(unit_rows, unit_rows.T)
    print(sim_mat.shape)

    # A context must never retrieve itself: mask the diagonal so argmax picks
    # the best *other* row even when there are ties or duplicate contexts.
    if num_rows:
        np.fill_diagonal(sim_mat, -np.inf)

    response_list = []
    for i in range(num_rows):
        ind = int(np.argmax(sim_mat[i]))
        response_list.append(train_responses_txt[ind])
        print(train_contexts_txt[i])
        print(response_list[i])

    with open(output_file, 'w') as f1:
        for response in response_list:
            # One response per line; without the newline they run together.
            f1.write(response + '\n')


In [4]:
if __name__ == '__main__':
    # Locations of the BPE merge codes and of the model vocabulary.
    twitter_bpe_dictionary = '../TwitterData/BPE/Twitter_Codes_5000.txt'
    twitter_bpe_separator = '@@'
    twitter_model_dictionary = '../TwitterData/BPE/Dataset.dict.pkl'

    # Load in Twitter dictionaries. Files are opened in `with` blocks so the
    # handles are closed deterministically, and pickles are read in binary mode.
    # NOTE(review): unpickling executes arbitrary code -- only load trusted files.
    with open(twitter_bpe_dictionary, 'r') as codes_file:
        twitter_bpe = BPE(codes_file.readlines(), twitter_bpe_separator)
    with open(twitter_model_dictionary, 'rb') as dict_file:
        twitter_dict = cPickle.load(dict_file)
    # Each vocabulary entry is a 4-tuple whose first two fields are (token, id).
    twitter_str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in twitter_dict])
    twitter_idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in twitter_dict])

    # Get data, for Twitter
    # NOTE(review): absolute, user-specific paths -- adjust for your machine.
    train_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Train.dialogues.pkl'
    test_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Test.dialogues.pkl'
    output_file = './output.csv'

    with open(train_file, 'rb') as f1:
        train_data = cPickle.load(f1)
    with open(test_file, 'rb') as f1:
        test_data = cPickle.load(f1)

    # Split every dialogue into its context and its final response.
    train_contexts, train_responses = process_dialogues(train_data)
    test_contexts, test_responses = process_dialogues(test_data)

    # Back to BPE-segmented text (markers kept) for the TF-IDF vectorizer.
    train_contexts_txt = idxs_to_bpestrs(train_contexts, twitter_bpe, twitter_idx_to_str)
    train_responses_txt = idxs_to_bpestrs(train_responses, twitter_bpe, twitter_idx_to_str)


In [7]:
print('Fitting vectorizer...')
vectorizer = TfidfVectorizer()
# Fit exactly once, on contexts and responses together, so both are embedded
# with the same vocabulary and IDF weights.
vectorizer.fit(train_contexts_txt + train_responses_txt)
# fit() returns the vectorizer itself; transform() is what produces the sparse
# TF-IDF matrices. Re-fitting here would also overwrite the shared vocabulary.
c_vec = vectorizer.transform(train_contexts_txt)
r_vec = vectorizer.transform(train_responses_txt)


Fitting vectorizer...


In [21]:
# Display the terms in the fitted vectorizer's vocabulary (sanity check).
vectorizer.get_feature_names()

[u'00',
 u'10',
 u'11',
 u'20',
 u'aa',
 u'aaa',
 u'aaaa',
 u'aaaaaaaa',
 u'aaah',
 u'aaay',
 u'aah',
 u'ab',
 u'abi',
 u'able',
 u'abo',
 u'about',
 u'above',
 u'abs',
 u'absolu',
 u'absolutely',
 u'abt',
 u'abu',
 u'ac',
 u'acce',
 u'accent',
 u'accep',
 u'accept',
 u'accid',
 u'account',
 u'ace',
 u'ach',
 u'ache',
 u'ack',
 u'acked',
 u'acking',
 u'acks',
 u'across',
 u'act',
 u'acting',
 u'action',
 u'acts',
 u'actu',
 u'actual',
 u'actually',
 u'acy',
 u'ad',
 u'ada',
 u'adam',
 u'add',
 u'added',
 u'addic',
 u'addicted',
 u'address',
 u'ade',
 u'adi',
 u'admit',
 u'ador',
 u'adorable',
 u'adv',
 u'adver',
 u'advice',
 u'af',
 u'aff',
 u'afraid',
 u'afric',
 u'after',
 u'afternoon',
 u'ag',
 u'aga',
 u'again',
 u'against',
 u'age',
 u'ages',
 u'ago',
 u'agre',
 u'agree',
 u'agreed',
 u'ah',
 u'aha',
 u'ahaa',
 u'ahah',
 u'ahaha',
 u'ahahah',
 u'ahahaha',
 u'ahead',
 u'ahh',
 u'ahhh',
 u'ahhhh',
 u'ai',
 u'aight',
 u'ain',
 u'aint',
 u'air',
 u'airport',
 u'aj',
 u'ak',
 u'aka',
 