In [1]:
from window_based_tagger_config import get_config
from load_data import load_process_essays

from featureextractortransformer import FeatureExtractorTransformer
from sent_feats_for_stacking import *
from load_data import load_process_essays, extract_features

from featurevectorizer import FeatureVectorizer
from featureextractionfunctions import *
from wordtagginghelper import *
from IterableFP import flatten

import Settings
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# Create persister (mongo client) - fail fast if mongo service not initialized
# not hashed as don't affect persistence of feature processing
SPARSE_WD_FEATS     = True
SPARSE_SENT_FEATS   = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

folder = settings.data_directory + "CoralBleaching/BrattData/EBA_Pre_Post_Merged/"
config = get_config(folder)

tagged_essays = load_process_essays( **config )
logger.info("Essays loaded")

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
213 files found
213 essays processed


In [5]:
def get_tags(essays):
    tags = set()
    for essay_ix, essay in enumerate(essays):
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            for word_ix, (wd, wd_tags) in enumerate(taggged_sentence):
                tags.update(wd_tags)
    return tags

In [6]:
tags = get_tags(tagged_essays)
concept_codes = sorted((tag for tag in tags if tag[0].isdigit()))
concept_codes

['1', '11', '12', '13', '14', '2', '3', '4', '5', '50', '5b', '6', '7']

In [7]:
from collections import defaultdict
from IdGenerator import IdGenerator as idGen

def get_xs_ys(essay_feats, tags):
    xs, ysByTag = [], defaultdict(list)
    id_gen = idGen()
    for essay_ix, essay in enumerate(essay_feats):
        for sent_ix, taggged_sentence in enumerate(essay.sentences):
            sent = []
            xs.append(sent)
            for word_ix, (wd, wd_tags) in enumerate(taggged_sentence):
                id_ = id_gen.get_id(wd)
                sent.append(id_)
                for tag in tags:
                    if word_ix == 0:
                        lbls = []
                        ysByTag[tag].append(lbls)
                    else:
                        lbls = ysByTag[tag][-1]
                    if tag in wd_tags:
                        lbls.append(1)
                    else:
                        lbls.append(0)
    return xs, ysByTag

In [10]:
xs, ysByTag = get_xs_ys(tagged_essays, concept_codes)

In [11]:
assert len(xs) == len(ysByTag['50'])
for i in range(len(xs)):
    x = xs[i]
    y = ysByTag["50"][i]
    assert len(x) == len(y)
len(xs), len(ysByTag["50"])

(2084, 2084)

In [12]:
from IterableFP import flatten
max_features = max(flatten(xs))
max_features

881

In [13]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Embedding
from keras.layers.recurrent import LSTM

model = Sequential()
model.add(Embedding(max_features, 256))
model.add(LSTM(256, 128, activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(128, 1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='rmsprop')

  import cuda_ndarray.cuda_ndarray
1 #define _CUDA_NDARRAY_C
2 
3 #include <Python.h>
4 #include <structmember.h>
5 
6 #include <numpy/arrayobject.h>
7 #include <iostream>
8 
9 #include "cuda_ndarray.cuh"
10 
11 //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
12 #define COMPUTE_GPU_MEM_USED 0
13 
14 //If true, we fill with NAN allocated device memory.
15 #define ALLOC_MEMSET 0
16 
17 //If true, we print out when we free a device pointer, uninitialize a
18 //CudaNdarray, or allocate a device pointer
19 #define PRINT_FREE_MALLOC 0
20 
21 //If true, we do error checking at the start of functions, to make sure there
22 //is not a pre-existing error when the function is called.
23 //You probably need to set the environment variable
24 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
25 //preprocessor macro in cuda_ndarray.cuh
26 //if you want this to work.
27 #define PRECHECK_ERROR 0
28 
29 /////////////////////////
30 //


['nvcc', '-shared', '-g', '-O3', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=d67f7c8a21306c67152a70a88a837011,-fPIC', '-Xlinker', '-rpath,/Users/simon.hughes/.theano/compiledir_Darwin-14.1.0-x86_64-i386-64bit-i386-2.7.3-64/cuda_ndarray', '-I/Users/simon.hughes/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/sandbox/cuda', '-I/Users/simon.hughes/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/numpy/core/include', '-I/Applications/Canopy.app/appdata/canopy-1.2.0.1610.macosx-x86_64/Canopy.app/Contents/include/python2.7', '-o', '/Users/simon.hughes/.theano/compiledir_Darwin-14.1.0-x86_64-i386-64bit-i386-2.7.3-64/cuda_ndarray/cuda_ndarray.so', 'mod.cu', '-LNone/lib', '-lcublas', '-lcudart', '-L/Applications/Canopy.app/appdata/canopy-1.2.0.1610.macosx-x86_64/Canopy.app/Contents/lib/python2.7/config', '-ldl', '-lpython2.7', '/Library/Frameworks/Python.framework/Versions/2.0.0.dev1-1aeba78/Python', '-Xcompiler', '-framework,CoreFound

In [37]:
import keras.preprocessing.sequence
xpad = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=None, dtype='int32')
ypad = keras.preprocessing.sequence.pad_sequences(Y_train, maxlen=None, dtype='int32')

In [38]:
print xpad.shape, ypad.shape

(2084, 85) (2084, 85)


In [41]:
from keras.datasets import imdb
max_features=20000
maxlen = 100 # cut texts after this number of words (among top max_features most common words)
batch_size = 16

print "Loading data..."
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl


In [54]:
imdb_X_train, imdb_y_train = X_train, y_train
imdb_X_text, imdb_y_test   = X_test, y_test

In [57]:
ix = 1
y_train[ix]

1

In [64]:
#X_train = xs
#Y_train = ysByTag["50"]
y_dummy = np.arange(len(X_train)) % 5
model.fit(imdb_X_train, imdb_y_train, batch_size=100, nb_epoch=10)
#score = model.evaluate(X_test, Y_test, batch_size=16)

Epoch 0


TypeError: list indices must be integers, not list

In [15]:
import theano 
print theano.config

floatX (('float64', 'float32')) 
    Doc:  Default floating-point precision for python casts
    Value:  float32

cast_policy (('custom', 'numpy+floatX')) 
    Doc:  Rules for implicit type casting
    Value:  custom

int_division (('int', 'raise', 'floatX')) 
    Doc:  What to do when one computes x / y, where both x and y are of integer types
    Value:  int

device (cpu, gpu*, opencl*, cuda*) 
    Doc:  Default device for computations. If gpu*, change the default to try to move computation to it and to put shared variable of float32 on it. Do not use upper case letters, only lower case even if NVIDIA use capital letters.
    Value:  gpu

gpuarray.init_device (<type 'str'>) 
    Doc:  
             Device to initialize for gpuarray use without moving
             computations automatically.
             
    Value:  

init_gpu_device (('', 'gpu', 'gpu0', 'gpu1', 'gpu2', 'gpu3', 'gpu4', 'gpu5', 'gpu6', 'gpu7', 'gpu8', 'gpu9', 'gpu10', 'gpu11', 'gpu12', 'gpu13', 'gpu14', 'gpu15')) 
   

In [31]:
import sys
sys.path.insert(0, "/Users/simon.hughes/GitHub")
sys.path.insert(0, "/Users/simon.hughes/GitHub/keras")
sys.path.insert(0, "/Users/simon.hughes/GitHub/keras/utils")

In [28]:
import keras.utils as utils
from utils.generic_utils import get_from_module

ImportError: No module named generic_utils

In [23]:
sorted(dir(utils))

['FakeDict',
 'HAS_PATTERN',
 'InputQueue',
 'NoCM',
 'PAT_ALPHABETIC',
 'RE_HTML_ENTITY',
 'RepeatCorpus',
 'SaveLoad',
 '__builtins__',
 '__doc__',
 '__file__',
 '__name__',
 '__package__',
 'any2unicode',
 'any2utf8',
 'cPickle',
 'chunkize',
 'chunkize_serial',
 'copytree_hardlink',
 'deaccent',
 'decode_htmlentities',
 'dict_from_corpus',
 'getNS',
 'get_max_id',
 'get_my_ip',
 'grouper',
 'identity',
 'is_corpus',
 'itertools',
 'lemmatize',
 'logger',
 'logging',
 'make_closing',
 'multiprocessing',
 'n2cp',
 'nocm',
 'os',
 'parse',
 'pickle',
 'pyro_daemon',
 'randfname',
 'random',
 're',
 'revdict',
 'shutil',
 'simple_preprocess',
 'smart_open',
 'synchronous',
 'tempfile',
 'to_unicode',
 'to_utf8',
 'tokenize',
 'toptexts',
 'traceback',
 'unicodedata',
 'unpickle',
 'upload_chunked',
 'with_statement',
 'wraps']

In [7]:
l = [[[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]], [[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]]]
import numpy as np
a = np.array(l)

In [8]:
a.shape

(2, 3, 5)

In [9]:
a.tolist()

[[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]],
 [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]]