In [7]:
%load_ext autoreload
%autoreload 2

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# uncomment these lines to download the NLTK packages
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('omw-1.4')

import argparse
import glob
import gensim
import numpy as np
import pandas as pd
from os.path import exists
from typing import List, Dict
from collections import defaultdict
import matplotlib.pyplot as plt

from utils.data import load_data, get_word_tokenized_corpus, get_data_property, get_data_chunks, ABSTRACT, N_CITATION
from utils.embeddings import train_fasttext_embedding, pretrain_fasttext_embedding, get_chunk_embeddings, save_fasttext, load_fasttext
from utils.features import get_features
from utils.controls import get_controls


from main import setup_chunk_embeddings, setup

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
%reload_ext autoreload

In [5]:
# SET THIS FLAG IF IMPORTING EMBS FROM MATLAB
from_MATLAB = True
# SET THIS FLAG FOR THE MODE (train, load, or test)
mode = 'test'

if mode == 'train':
    # Train Model
    class Args:
    #     model_name = 'fasttext_model/ft_model.model'
        model_name = 'fasttext_model/cc.en.300.bin'
        data_file = 'data/dblp-ref-0.json'
        chunk_embs_file = 'none'
        proj_dir = './saved/'
        limit = 30000
        T = 20
        train_model = True

    args=Args()

    ft_model, abstracts, citation_counts = setup(args)
    chunk_embs = setup_chunk_embeddings(args, ft_model, abstracts)
elif mode == 'load':
    # Load Chunks
    class Args:
        model_name = 'fasttext_model/cc.en.300.bin'
        data_file = 'data/dblp-ref-0.json'
        chunk_embs_file = 'data/chunk_embs.txt'
        proj_dir = './saved/'
        limit = 30000
        T = 20
        train_model = False

    args=Args()
elif mode == 'test':
    # Test sentences
    # Load Chunks
    class Args:
        model_name = 'fasttext_model/wiki-news-300d-1M.vec'
        proj_dir = './saved/'
        T = 20
        train_model = False

    args=Args()
    ft_model = load_fasttext(args.proj_dir + args.model_name)

if from_MATLAB and mode != 'test':
    df = pd.read_table('./saved/data/toubia_embs.txt', dtype=float, header=None, sep=',').fillna(0).values
    chunk_embs = [np.trim_zeros(df[i], 'b').reshape(300,-1).transpose() for i in range(len(df))]
elif mode != 'test':
    chunk_embs = setup_chunk_embeddings(args, None, None)

In [19]:

args.data_file = './yelp_saved/yelpdata_10k.txt'
args.proj_dir = './yelp_saved/'
args.limit = -1
data = load_data(args)
documents = get_data_property(data, ABSTRACT)

print('Chunking...')
chunks = [get_data_chunks(document, chunk_len=3, mode='chunk_len') for document in documents[:args.limit]]
print('Embedding...')
chunk_embs = np.array([get_chunk_embeddings(ft_model, chunk) for chunk in chunks])
print('Computing Features...')
features = [get_features(np.stack(chunk_emb)) if len(chunk_emb) > 1 else {} for chunk_emb in chunk_embs]
    
#     for key in features[0].keys():
#         print('\n' + key)
        
#         if key == 'distances':
#             for i, feature in enumerate(features):
#                 plt.plot(range(len(feature['distances'])), feature['distances'], label=f'Sentence{i}')
#             plt.legend()

#         else:
#             for feature in features:
#                 print(feature[key])

Loading TXT...
Chunking...
Embedding...


  chunk_embs = np.array([get_chunk_embeddings(ft_model, chunk) for chunk in chunks])


Computing Features...


  step_size = (maximum - d - 1.0) / ((d + 1.0) * (maximum - 1.0))
  return np.exp(log_a.mean(axis=axis))
  ret = ret.dtype.type(ret / rcount)
  return np.log(distance_covered / tsp)


Writing output...


TypeError: list indices must be integers or slices, not str

In [21]:
print('Writing output...')
with open('yelpdata_speeds.txt', 'w') as f:
    for i, feature in enumerate(features):
        if not len(feature):
            continue
        speed = feature['speed']
        f.write(data[i]['abstract'].strip() + '\t' + str(speed) + '\n')

Writing output...


In [33]:
features = [get_features(chunk_emb, feature_list=['circuitousness']) for chunk_emb in chunk_embs]

[[-9.78689951e-03  3.30480348e-03 -1.81314411e-02 ...  1.36876420e-01
   8.49039270e-03 -1.56296948e-02]
 [-1.29661071e-04 -3.06440671e-03 -1.05334746e-02 ...  1.30048305e-01
   2.51627121e-02 -1.86733055e-02]
 [ 1.48067164e-02 -1.42041043e-02 -2.28097024e-03 ...  1.06595523e-01
   5.02089567e-03 -6.82910485e-03]
 ...
 [ 1.50881856e-02  7.93417710e-03 -2.96333337e-02 ...  8.45185663e-02
   1.61151902e-02 -1.60881855e-02]
 [-2.57022904e-03  1.13381681e-02 -1.46324432e-02 ...  8.37041993e-02
   8.73816792e-03 -2.95305355e-03]
 [ 1.25537672e-02 -1.10582191e-02 -3.59301370e-02 ...  1.59184590e-01
   2.17383562e-02 -2.25869866e-02]]
[ 0  1  2 25 27 26 28 29 18 31 30 15 24 16 17 13 14 23  8 22  6  7  3  4
  5 21 19 32  9 11 20 12 10]
[[ 0.00944241  0.01239286 -0.00127589 ...  0.102075    0.01228438
  -0.00957589]
 [ 0.0086363   0.0076163  -0.00077889 ...  0.13046593  0.02446111
  -0.01562926]
 [ 0.00920984 -0.00067254  0.01364262 ...  0.1131623   0.01469754
  -0.01412582]
 ...
 [ 0.03650433 

KeyboardInterrupt: 

In [None]:
features