In [None]:
# File names of the .mat datasets (presumably the files from the WMD-paper
# download referenced in the data_path cell below -- TODO confirm).
# These are bare string expressions: they have no effect, and only the last
# one is echoed as the cell output. `loader` reads the bbcsport file.
'bbcsport-emd_tr_te_split.mat'
'twitter-emd_tr_te_split.mat'
'r8-emd_tr_te3.mat'
'amazon-emd_tr_te_split.mat'
'classic-emd_tr_te_split.mat'
'ohsumed-emd_tr_te_ix.mat'

In [None]:
import numpy as np
import lda
import ot

from sklearn.metrics.pairwise import euclidean_distances
import scipy.io as sio
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


# Load data from the WMD paper (Kusner et al.).
# Each data file contains the words, the embedding vector for each word,
# the BOW count for each word, and the document labels.


# Reduce vocabulary size by stemming and removing stop words.
def reduce_vocab(bow_data, vocab, embed_vocab, embed_aggregate='mean'):
    """Reduce vocabulary size by stemming and removing stop words.

    Processing steps:
      1. Drop words of at most two characters and English stop words.
      2. Merge the remaining words that share a Snowball stem, summing
         their bag-of-words counts.
      3. Drop stems whose total count over all documents is <= 2.

    Args:
        bow_data: 2-D count array, one row per document and one column per
            entry of ``vocab``.
        vocab: sequence of words aligned with the columns of ``bow_data``.
        embed_vocab: mapping word -> embedding vector; must contain every
            word that survives step 1.
        embed_aggregate: how to build a stem's embedding from the words
            merged into it: 'mean' (element-wise average) or 'first'
            (embedding of the first merged word).

    Returns:
        Tuple ``(stems, stem_embeddings, stem_bow)``: the list of kept
        stems, a dict stem -> embedding, and the count array with one
        column per kept stem.

    Raises:
        ValueError: if ``embed_aggregate`` is not 'mean' or 'first'.
            (Previously a message was printed and an inconsistent,
            partially filled embedding dict was returned.)
    """
    if embed_aggregate not in ('mean', 'first'):
        raise ValueError(
            "Unknown embedding aggregation: %r (expected 'mean' or 'first')"
            % (embed_aggregate,))

    vocab = np.array(vocab)
    stop_words = set(stopwords.words('english'))
    keep = np.array([len(w) > 2 and w not in stop_words for w in vocab],
                    dtype=bool)
    reduced_vocab = vocab[keep]
    reduced_bow_data = bow_data[:, keep]

    stemmer = SnowballStemmer("english")
    # A dict gives O(1) stem lookup; the former `in list` / `list.index`
    # combination was quadratic in the vocabulary size.
    stem_to_col = {}
    stemmed_vocab = []   # stems in first-seen order
    stem_members = []    # per stem: the original words merged into it
    stem_columns = []    # per stem: column indices into reduced_bow_data
    for i, w in enumerate(reduced_vocab):
        stem_w = stemmer.stem(w)
        col = stem_to_col.get(stem_w)
        if col is None:
            stem_to_col[stem_w] = len(stemmed_vocab)
            stemmed_vocab.append(stem_w)
            stem_members.append([w])
            stem_columns.append([i])
        else:
            stem_members[col].append(w)
            stem_columns[col].append(i)

    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy, so use `int` directly.
    stemmed_bow_data = np.zeros((bow_data.shape[0], len(stemmed_vocab)),
                                dtype=int)
    for col, members in enumerate(stem_columns):
        stemmed_bow_data[:, col] = np.asarray(
            reduced_bow_data[:, members].sum(axis=1)).ravel()

    # Keep only stems that occur more than twice in the whole corpus.
    frequent = stemmed_bow_data.sum(axis=0) > 2
    stemmed_reduced_vocab = np.array(stemmed_vocab)[frequent].tolist()
    stemmed_reduced_bow_data = stemmed_bow_data[:, frequent]

    stemmed_reduced_embed_vocab = {}
    for col, w in zip(np.flatnonzero(frequent), stemmed_reduced_vocab):
        old_w_embed = [embed_vocab[w_old] for w_old in stem_members[col]]
        if embed_aggregate == 'mean':
            stemmed_reduced_embed_vocab[w] = np.mean(old_w_embed, axis=0)
        else:  # 'first'
            stemmed_reduced_embed_vocab[w] = old_w_embed[0]

    return (stemmed_reduced_vocab,
            stemmed_reduced_embed_vocab,
            stemmed_reduced_bow_data)


def change_embeddings(vocab, bow_data, embed_path):
    """Replace the word embeddings with vectors read from a text file.

    The file is expected to hold one word per line followed by its
    space-separated vector components (the GloVe text format). Words of
    ``vocab`` that do not appear in the file are dropped, together with
    their columns of ``bow_data``.

    Args:
        vocab: sequence of words aligned with the columns of ``bow_data``.
        bow_data: 2-D array indexed as ``bow_data[:, column_indices]``.
        embed_path: path of the embedding text file (read as UTF-8).

    Returns:
        Tuple ``(new_vocab, data_embed_vocab, bow_data)``: the words found
        in the file (in ``vocab`` order), a dict word -> list of floats,
        and ``bow_data`` restricted to the columns of the kept words.
    """
    wanted = set(vocab)
    found = {}
    # Stream the file instead of `readlines()`, and only parse vectors of
    # words that are actually needed: the full embedding file is far larger
    # than the vocabulary of one dataset.
    with open(embed_path, 'r', encoding='utf-8') as file:
        for line in file:
            word, _, rest = line.partition(' ')
            if word in wanted:
                # A word listed twice keeps its last vector, as before.
                found[word] = [float(x) for x in rest.split()]

    data_embed_vocab = {}
    new_vocab_idx = []
    new_vocab = []
    for i, w in enumerate(vocab):
        if w in found:
            data_embed_vocab[w] = found[w]
            new_vocab_idx.append(i)
            new_vocab.append(w)
    bow_data = bow_data[:, new_vocab_idx]
    return new_vocab, data_embed_vocab, bow_data
  
# loader
def loader(data_path,
           embeddings_path,
           p=1,
           K_lda=70,
           glove_embeddings=True,
           stemming=True):
    """Load a WMD-style .mat dataset and build its bag-of-words matrix.

    Args:
        data_path: path of the .mat file, or a directory containing
            'bbcsport-emd_tr_te_split.mat' (the file this function used to
            load unconditionally). ``None`` falls back to './data/'.
        embeddings_path: embedding text file passed to
            ``change_embeddings`` when ``glove_embeddings`` is true.
        p: unused; kept for interface compatibility.
        K_lda: unused; kept for interface compatibility.
        glove_embeddings: if true, keep only words found in the embedding
            file at ``embeddings_path``.
        stemming: if true, shrink the vocabulary with ``reduce_vocab``.

    Returns:
        Tuple ``(vocab_BOW, labels)``: the document-by-word count matrix
        and the labels from the file minus one. When the file stores a
        train/test split, documents are ordered test first, then train.
    """
    import os  # local import: only needed to resolve the data path

    default_file = 'bbcsport-emd_tr_te_split.mat'
    if data_path is None:
        data_path = './data/'
    if isinstance(data_path, (str, os.PathLike)) and os.path.isdir(data_path):
        mat_path = os.path.join(data_path, default_file)
    else:
        mat_path = data_path

    data_all = sio.loadmat(mat_path, squeeze_me=True, chars_as_strings=True)  # dict

    def _field(whole_key, test_key, train_key):
        """Return the un-split field, or the test and train parts joined."""
        if whole_key in data_all:
            return data_all[whole_key]
        # axis=-1 equals the former axis=1 for 2-D arrays and also works on
        # 1-D arrays. NOTE(review): squeeze_me=True presumably yields 1-D
        # arrays here, for which axis=1 would raise -- confirm on the data.
        return np.concatenate((data_all[test_key], data_all[train_key]),
                              axis=-1)

    # `np.int` has been removed from NumPy; it was an alias of `int`.
    y_all = _field('Y', 'yte', 'ytr').astype(int)
    embed_all = _field('X', 'xte', 'xtr')
    BOW_all = _field('BOW_X', 'BOW_xte', 'BOW_xtr')
    # Words were previously joined as (train, test) while every other field
    # used (test, train), misaligning words with embeddings and counts.
    words_all = _field('words', 'words_te', 'words_tr')

    n_docs = len(words_all)
    vocab = []
    vocab_embed = {}
    word_to_col = {}     # O(1) lookup instead of list membership / .index
    doc_columns = []     # per document: vocabulary column of each word

    for i in range(n_docs):
        word_i = words_all[i]
        embed_i = embed_all[i]
        columns = []
        for j in range(len(word_i)):
            word = word_i[j]
            if not isinstance(word, str):
                # First non-string entry ends the document's word list.
                break
            col = word_to_col.get(word)
            if col is None:
                col = len(vocab)
                word_to_col[word] = col
                vocab.append(word)
                # Embedding of the first occurrence of the word is kept.
                vocab_embed[word] = embed_i[:, j]
            columns.append(col)
        doc_columns.append(columns)

    vocab_BOW = np.zeros((n_docs, len(vocab)), dtype=int)
    for i, columns in enumerate(doc_columns):
        vocab_BOW[i, columns] = BOW_all[i].astype(int)

    ####################################################
    # Use GLOVE word embeddings
    if glove_embeddings:
        vocab, vocab_embed, vocab_BOW = change_embeddings(
            vocab, vocab_BOW, embeddings_path)
    # Reduce vocabulary by removing short words, stop words, and stemming
    if stemming:
        vocab, vocab_embed, vocab_BOW = reduce_vocab(
            vocab_BOW, vocab, vocab_embed, embed_aggregate='mean')

    ####################################################
    return vocab_BOW, y_all-1




In [None]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

# Download the datasets used by Kusner et al. from
# https://www.dropbox.com/sh/nf532hddgdt68ix/AABGLUiPRyXv6UL2YAcHmAFqa?dl=0
# and put them into this directory:
data_path = './data/'

# Download the GloVe 6B-token, 300-dimensional word embeddings from
# https://nlp.stanford.edu/projects/glove/
# and put the text file at this path:
embeddings_path = './glove/glove.6B.300d.txt'
# `loader` returns a tuple: (bag-of-words count matrix, labels minus one).
data = loader(data_path, embeddings_path)

In [None]:
from scipy.stats import wasserstein_distance
def wmd(u, v):
    """Return SciPy's 1-D Wasserstein distance between ``u`` and ``v``.

    NOTE(review): ``scipy.stats.wasserstein_distance`` interprets its first
    two arguments as sample *values*, not as probability weights. Passing
    two nBOW vectors therefore compares the empirical distributions of
    their entries; it does not use word embeddings. Confirm this is the
    intended distance.
    """
    return wasserstein_distance(u_values=u, v_values=v)
def wmdT20(u, v, top_k=20):
    """Wasserstein distance between the ``top_k`` largest entries of u and v.

    The previous implementation sorted in ascending order and took the
    first 20 indices, i.e. the 20 *smallest* entries. For sparse nBOW
    vectors those are all zeros, so nearly every pair had distance 0.

    Args:
        u, v: 1-D array-likes of non-negative weights (e.g. nBOW vectors).
        top_k: number of largest entries kept from each vector (default 20,
            matching the function name). Shorter inputs are used in full.

    Returns:
        ``scipy.stats.wasserstein_distance`` of the two truncated vectors,
        treated as sample values.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1, got %r" % (top_k,))
    # The distance ignores sample order, so the sorted tail is sufficient.
    u_top = np.sort(np.asarray(u).ravel())[-top_k:]
    v_top = np.sort(np.asarray(v).ravel())[-top_k:]
    return wasserstein_distance(u_top, v_top)

In [None]:
# `data` holds the tuple returned by `loader`: (bag-of-words matrix, labels).
# Unpack it here so `vocab_BOW` and `y` exist on a fresh kernel.
vocab_BOW, y = data
# Fixed seed so the split, and the test error computed from it, is reproducible.
bow_train, bow_test, y_train, y_test = train_test_split(vocab_BOW, y, random_state=0)
# L1-normalise each document's counts to obtain normalised BOW (nBOW) vectors.
nbow_train, nbow_test = normalize(bow_train, norm='l1'), normalize(bow_test, norm='l1')

In [None]:
import operator
import numpy as np
from collections import Counter
import operator


In [69]:
# Build the k-NN classifier
# -*- coding: utf-8 -*-

# -*- coding: utf-8 -*-

def knnclassify(test,x_train,y_train,D,k,method):
    """Predict the label of one sample by k-nearest-neighbour majority vote.

    Args:
        test: the sample to classify.
        x_train: indexable collection of training samples.
        y_train: labels aligned with ``x_train``.
        D: extra argument passed to ``method`` only when ``method`` is the
            global function ``HOFTT`` (if such a function is defined).
        k: number of nearest neighbours that vote.
        method: distance function called as ``method(test, sample)`` (or
            ``method(test, sample, D)`` for ``HOFTT``).

    Returns:
        The most frequent label among the ``k`` nearest training samples.
        Ties between labels go to the label whose first vote came from the
        nearer neighbour.
    """
    # `HOFTT` is not defined in this notebook, so naming it directly raised
    # NameError on every call. Look it up defensively: behaviour is the same
    # when it exists, and every other distance works when it does not.
    hoftt = globals().get('HOFTT')
    needs_cost = hoftt is not None and method == hoftt

    if needs_cost:
        distances = [method(test, x_train[i], D) for i in range(len(x_train))]
    else:
        distances = [method(test, x_train[i]) for i in range(len(x_train))]

    # Stable sort: equally distant samples keep their training-set order,
    # which makes the prediction deterministic.
    nearest = np.argsort(distances, kind='stable')[:k]
    neighbors = [y_train[i] for i in nearest]

    return Counter(neighbors).most_common(1)[0][0]

def myknn(k,x_train,y_train,x_test,y_test,D,method,verbose=True):
    """Classify every test sample with ``knnclassify`` and return the error.

    Args:
        k: number of neighbours.
        x_train, y_train: training samples and their labels.
        x_test, y_test: test samples and their true labels.
        D: extra argument forwarded to ``knnclassify``.
        method: distance function forwarded to ``knnclassify``.
        verbose: if true (default), print a running count after each
            classified test sample, as before.

    Returns:
        The test error, i.e. ``1 - (correct predictions / len(y_test))``.

    Raises:
        ValueError: if ``y_test`` is empty or its length differs from
            ``x_test``.
    """
    n_test = len(y_test)
    if n_test == 0:
        raise ValueError("y_test is empty; the test error is undefined")
    if len(x_test) != n_test:
        raise ValueError(
            "x_test and y_test differ in length: %d vs %d"
            % (len(x_test), n_test))

    y_pred = []
    for done, sample in enumerate(x_test, start=1):
        y_pred.append(knnclassify(sample, x_train, y_train, D, k, method))
        if verbose:
            print("count: ", done)

    # The former `input("PRESS ENTER TO CONTINUE.")` debugging pause was
    # removed: it blocked Run-All and non-interactive execution.
    count = sum(1 for i in range(n_test) if y_pred[i] == y_test[i])

    test_error=1-count/n_test

    return test_error
        






# Custom k-NN classifier that uses our own distance function.
# k is the number of neighbors.

In [None]:
# Evaluate the 7-nearest-neighbour classifier with the truncated WMD distance
# on the nBOW vectors; D is unused by this distance, so 0 is passed.
test_error = myknn(
    k=7,
    x_train=nbow_train,
    y_train=y_train,
    x_test=nbow_test,
    y_test=y_test,
    D=0,
    method=wmdT20,
)
print(test_error)