In [32]:
import csv
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

# One hot
# from keras.utils.np_utils import to_categorical

# Layers/model
from keras.models import Sequential
from keras.layers import (Dense,
                          Embedding,
                          CuDNNLSTM,
                          CuDNNGRU,
                          GRU,
                          Bidirectional,
                          Dropout,
                          MaxPooling1D,
                          Conv1D,
                          GlobalAveragePooling1D,
                          MaxPooling1D,
                          Flatten,
                         )

from clr_callback import CyclicLR

In [2]:
def read_csv(path):
    samples = []
    with open(path, encoding='utf8') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            samples.append(row[0])

    return samples

In [3]:
pos_samples = read_csv('C:/Users/ranet/Documents/DATA/Datasets/farm_not_farm_text_tagging/farmstuff.csv')
neg_samples = read_csv('C:/Users/ranet/Documents/DATA/Datasets/farm_not_farm_text_tagging/notfarmstuff.csv')

In [4]:
print(len(pos_samples), len(neg_samples))

79774 100001


In [5]:
print(pos_samples[1][0:100])

"Eesti Loomakaitse Selts soovib veidi selgitada rahvusvahelise organisatsiooni Born Free Foundation 


In [6]:
print(neg_samples[1][0:100])

Mul ei ole nüüd juba tükk aega väikest beebit olnud.Märkasin seda, kui mulle saabus spontaanselt kül


In [7]:
# Average word counts
POS_AVG = np.mean([len(x.split(' ')) for x in pos_samples])
NEG_AVG = np.mean([len(x.split(' ')) for x in neg_samples])
print(POS_AVG, NEG_AVG)

511.35455160829343 235.23286767132328


In [8]:
# Combine both, add classes
pos_y = [1 for x in range(len(pos_samples))]
neg_y = [0 for x in range(len(neg_samples))]
print(len(pos_y), len(neg_y))

X = pos_samples + neg_samples
y = pos_y + neg_y
print(len(X), len(y))

79774 100001
179775 179775


In [9]:
MAX_VOCAB_SIZE = 50000
MAX_SEQ_LEN = 150

# Declare Keras Tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE,
                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n12345687890',
                       lower=True,
                       split=" ",
                       char_level=False,
                       oov_token="X")

# Build Tokenizer on training vocab
tokenizer.fit_on_texts(X)
# Tokenize sequences from words to integers
X_tokenized = tokenizer.texts_to_sequences(X)
# Pad sequence to match MAX_SEQ_LEN
X_tokenized = pad_sequences(X_tokenized, maxlen=MAX_SEQ_LEN)
# Index to word
index_to_word = dict(map(reversed, tokenizer.word_index.items()))

print(len(tokenizer.word_index))
print(X[0])
print(X_tokenized[0])

2615606
body
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0 27596]


In [10]:
tokenizer.word_index['tartus']

1057

In [11]:
X[1]

'"Eesti Loomakaitse Selts soovib veidi selgitada rahvusvahelise organisatsiooni Born Free Foundation ja ENDCAP võrgustiku üle-euroopalise loomaaedade olukorra uuringu “EU Zoo Inquiry 2011” eesmärke," teatas seltsi juhatuse liige Evelyn Valtin.\\n\\n"Tallinna loomaaia reaktsioon avaldatud raportile on mõneti arusaadav, sest keegi ei soovi, et neid kritiseeritakse. Samas on raportis mitmes kohas Tallinna loomaaeda toodud välja positiivsest küljest, näiteks on Tallinna loomaaed Eestis tegutevatest loomaaedadest ainus, kes tegeleb liigikaitsega ning annab suurima panuse elustiku mitmekesisuse säilitamisse."\\n\\nLoomakaitse selts juhib tähelepanu nendele punktidele, mis on aruandes loomaaedadega seoses Eesti kohta välja toodud ning millega tuleks Eestis tegelda. \\n\\n"Mis puutub loomaaedade alasesse regulatsiooni, siis tuleks loomakaitseseaduses sätestada kohustuslikuna nõue, et loomaaiad peavad andma panuse liigikaitsesse. See käib loomaaedade kohta, kus eksponeeritakse looduses elavaid 

In [12]:
index_to_word[5545]

'nõuete'

In [13]:
X_tokenized[1]

array([    1,  4353,     1,     3,   577,     1,  2792,     2,     1,
          75,    11,  1839,  3084, 36894, 16908, 49958,  4244,     3,
         113,     1,     1,     1, 13089,  2700,  1390,  1686,    76,
           4,  9230,  5950,   113,   305,     1,  5277,    10,    76,
           3,    15,  1150,  9442,     1,  2970,  2703,  5277,  8867,
          11,   300,    41,    53,   126, 14788, 10831,     2,     2,
          10,  6566, 24989,   367, 14533,  7971,    76,     4,  9230,
          40,   641,   113,   305,     1,  4031, 22862,  2341,    76,
           4,   344, 21889,    57,  5545,  7329,    38, 38858,   504,
           1, 35458,  6500,  1416,     9,     1, 14064,   363,     9,
          76,  8867,     1,  2033,  5277,     1,     2,     2,  9206,
          21, 11048,     1, 11937, 47673, 35079,  3768,     8,   589,
           3,  1086,    93,   589, 35080,     4,  2457,  4413,   589,
       30745,  9005,     5,   564,    10,     1, 30450,  2990,     4,
           8,    21,

In [14]:
## One-hot encode labels?
# print(y[0:3])
# y_oh = to_categorical(y, 2)
# print(y_oh[0:3])


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_tokenized, y, test_size=0.33, random_state=42)

## MODELS

In [16]:
# Crop training data for faster prototyping
TEST_CROP = 10000

### Embedding to Dense

In [62]:
embed_dim = 64
lstm_out = 32

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 150, 64)           3200000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dense_20 (Dense)             (None, 32)                307232    
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 33        
Total params: 3,507,265
Trainable params: 3,507,265
Non-trainable params: 0
_________________________________________________________________
None


In [63]:
num_epochs = 5
batch_size = 128

# CLR https://arxiv.org/pdf/1506.01186.pdf
# From https://github.com/bckenstler/CLR/blob/master/clr_callback_tests.ipynb
clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

### CuDNNLSTM

In [24]:
embed_dim = 64
lstm_out = 32

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN, dropout=0.1))
model.add(CuDNNLSTM(lstm_out))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 64)           3200000   
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 32)                12544     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 3,212,577
Trainable params: 3,212,577
Non-trainable params: 0
_________________________________________________________________
None


  """


In [25]:
num_epochs = 5
batch_size = 128

clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20135aa3da0>

### CuDNNGRU

In [45]:
embed_dim = 64
lstm_out = 32

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model.add(CuDNNGRU(lstm_out,))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 150, 64)           3200000   
_________________________________________________________________
cu_dnngru_9 (CuDNNGRU)       (None, 32)                9408      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 33        
Total params: 3,209,441
Trainable params: 3,209,441
Non-trainable params: 0
_________________________________________________________________
None


In [46]:
num_epochs = 5
batch_size = 128

clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
  9216/120449 [=>............................] - ETA: 3:08 - loss: 0.6346 - acc: 0.5983

KeyboardInterrupt: 

### Stacked CuDNNGRU

In [37]:
embed_dim = 64

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))

model.add(CuDNNGRU(32, return_sequences=True))
model.add(CuDNNGRU(64, activation="relu")

model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 150, 64)           3200000   
_________________________________________________________________
cu_dnngru_4 (CuDNNGRU)       (None, 150, 32)           9408      
_________________________________________________________________
cu_dnngru_5 (CuDNNGRU)       (None, 64)                18816     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 3,228,289
Trainable params: 3,228,289
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
num_epochs = 5
batch_size = 128

clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

KeyboardInterrupt: 

### CuDNNGRU Bidirectional

In [51]:
embed_dim = 64
lstm_out = 64

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model.add(Bidirectional(CuDNNGRU(lstm_out,)))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 150, 64)           3200000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               49920     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 129       
Total params: 3,250,049
Trainable params: 3,250,049
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
num_epochs = 5
batch_size = 128

clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

### Conv1D

In [56]:
embed_dim = 128

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model.add(Conv1D(32, 7, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(32, 7, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [57]:
num_epochs = 5
batch_size = 128

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val))

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

### Conv1D+CuDNNGRU

In [60]:
embed_dim = 128

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model.add(Conv1D(32, 7, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(32, 7, activation='relu'))
model.add(CuDNNGRU(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [61]:
num_epochs = 5
batch_size = 128

clr_triangular = CyclicLR(mode='triangular')

model.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val),
          callbacks=[clr_triangular])

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

### Conv1D+CuDNNLSTM

In [45]:
embed_dim = 300
lstm_out = 32

model_3 = Sequential()
model_3.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model_3.add(Dropout(0.25))
model_3.add(Conv1D(64, 5, padding='valid', activation='relu', strides=1))
model_3.add(MaxPooling1D(pool_size=4))
model_3.add(CuDNNLSTM(70))
model_3.add(Dropout(0.3))
model_3.add(Dense(20))
model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
num_epochs = 5
batch_size = 64

model_3.fit(X_train, y_train, 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val))

Train on 120449 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
   256/120449 [..............................] - ETA: 34:13 - loss: 0.1998 - acc: 0.8984

KeyboardInterrupt: 

### Big LSTM Stacked

In [95]:
# expected input data shape: (batch_size, timesteps, data_dim)
model_4 = Sequential()
model_4.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN, dropout=0.1))
model_4.add(CuDNNLSTM(32, return_sequences=True))
model_4.add(CuDNNLSTM(32, return_sequences=True))
model_4.add(CuDNNLSTM(32))
model_4.add(Dense(1, activation='sigmoid'))

model_4.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [96]:
num_epochs = 5
batch_size = 128

model_4.fit(X_train[:TEST_CROP], y_train[:TEST_CROP], 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val))

Train on 10000 samples, validate on 59326 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x242d3c8c278>

### FNN

In [110]:
embed_dim = 64

model_5 = Sequential()
model_5.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
model_5.add(Flatten())
model_5.add(Dense(256)),
model_5.add(Dropout(0.3))
model_5.add(Dense(128)),
model_5.add(Dropout(0.3))
model_5.add(Dense(25)),
model_5.add(Dense(1))

model_5.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model_5.fit(X_train[:TEST_CROP], y_train[:TEST_CROP], 
          epochs=num_epochs,
          batch_size=batch_size,
          verbose=1,
          validation_data=(X_val, y_val))

### Sklearn models

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-5, random_state=42,
                                           max_iter=25, tol=None)),])

text_clf3 = Pipeline([('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

In [18]:
text_clf.fit(X_train, y_train)  
predicted = text_clf.predict(X_val)
np.mean(predicted == np.asarray(y_val))

0.6071536931530863

In [19]:
text_clf3.fit(X_train, y_train)  
predicted = text_clf.predict(X_val)
np.mean(predicted == np.asarray(y_val))

0.6071536931530863

In [20]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-5, random_state=42,
                                           max_iter=25, tol=None)),])


text_clf3 = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])

In [22]:
X_t_train, X_t_val, y_t_train, y_t_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
text_clf.fit(X_t_train[:10000], y_t_train[:10000])  
predicted = text_clf.predict(X_t_val)
print(np.mean(predicted == np.asarray(y_t_val)))


text_clf3.fit(X_t_train[:10000], y_t_train[:10000])  
predicted = text_clf.predict(X_t_val)
print(np.mean(predicted == np.asarray(y_t_val)))

0.8688096281562889
0.8688096281562889


### Hyperas test

In [65]:
# For automatic hyperparameter tuning
# http://maxpumperla.com/hyperas/

In [74]:
!pip install hyperas

Collecting hyperas
  Downloading https://files.pythonhosted.org/packages/54/72/5533b6bf9b47dc33685c3e62c391d6eab5785a648a5ffa841e240a3db3fe/hyperas-0.4.tar.gz
Collecting hyperopt (from hyperas)
  Downloading https://files.pythonhosted.org/packages/63/12/704382c3081df3ae3f9d96fe6afb62efa2fa9749be20c301cd2797fb0b52/hyperopt-0.1.2-py3-none-any.whl (115kB)
Collecting pymongo (from hyperopt->hyperas)
  Downloading https://files.pythonhosted.org/packages/d8/25/44b0fc81668a883739b108d9bd0c95b24f0b0204cb2dc93e0f259e173670/pymongo-3.7.2-cp36-cp36m-win_amd64.whl (315kB)
Collecting future (from hyperopt->hyperas)
  Downloading https://files.pythonhosted.org/packages/90/52/e20466b85000a181e1e144fd8305caf2cf475e2f9674e797b222f8105f5f/future-0.17.1.tar.gz (829kB)
Collecting tqdm (from hyperopt->hyperas)
  Downloading https://files.pythonhosted.org/packages/6c/4b/c38b5144cf167c4f52288517436ccafefe9dc01b8d1c190e18a6b154cd4a/tqdm-4.31.1-py2.py3-none-any.whl (48kB)
Building wheels for collected packages

pexpect 4.5.0 requires ptyprocess>=0.5, which is not installed.
distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [75]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional

In [80]:
def data():
    ## IMPORTS
    import csv
    import numpy as np
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    from keras.preprocessing.sequence import pad_sequences
    
    ## DATA LOADING
    # Crop data for faster prototyping
    CROP = 1000
    def read_csv(path):
        samples = []
        with open(path, encoding='utf8') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                samples.append(row[0])
    return samples
    pos_samples = read_csv('C:/Users/ranet/Documents/DATA/Datasets/farm_not_farm_text_tagging/farmstuff.csv')[:CROP]
    neg_samples = read_csv('C:/Users/ranet/Documents/DATA/Datasets/farm_not_farm_text_tagging/notfarmstuff.csv')[:CROP]
    
    ## MAKE CLASSES
    # Combine both, add classes
    pos_y = [1 for x in range(len(pos_samples))]
    neg_y = [0 for x in range(len(neg_samples))]
    X = pos_samples + neg_samples
    y = pos_y + neg_y


    ## TOKENIZER
    MAX_VOCAB_SIZE = 50000
    MAX_SEQ_LEN = 150
    # Declare Keras Tokenizer
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n12345687890',
                           lower=True,
                           split=" ",
                           char_level=False,
                           oov_token="X")

    # Build Tokenizer on training vocab
    tokenizer.fit_on_texts(X)
    # Tokenize sequences from words to integers
    X_tokenized = tokenizer.texts_to_sequences(X)
    # Pad sequence to match MAX_SEQ_LEN
    X_tokenized = pad_sequences(X_tokenized, maxlen=MAX_SEQ_LEN)
    
    ## SPLIT DATA
    X_train, X_val, y_train, y_val = train_test_split(X_tokenized, y, test_size=0.33, random_state=42)
    
    
    return X_train[:hyper_crop], y_train[:hyper_crop], X_val[:hyper_crop], y_val[:hyper_crop]

In [81]:
def model(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, CuDNNLSTM, CuDNNGRU, Dropout
    
    embed_dim = 64
    lstm_out = 32

    model = Sequential()
    model.add(Embedding(MAX_VOCAB_SIZE, embed_dim, input_length=MAX_SEQ_LEN))
    model.add({{choice([CuDNNGRU( {{ choice([8, 32, 64]) }} ), CuDNNLSTM( {{ choice([8, 32, 64]) }} )])}})
    model.add()
    
    if conditional({{choice('one', 'two', 'three')}}) == 'two':
        model.add(Dense({{choice([5, 20, 50, 100])}}), activation="relu")
    elif conditional({{choice('one', 'two', 'three')}}) == 'three':
        model.add(Dense({{choice([20, 50, 100])}}), activation="relu")
        model.add(Dropout({{ uniform(0, 1) }}))
        model.add(Dense({{choice([5, 20])}}), activation="relu")

    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer={{ choice(['rmsprop', 'adam', 'sgd']) }})
    model.fit(X_train, Y_train,
          batch_size={{choice([32, 64, 128])}},
          nb_epoch=2,
          show_accuracy=True,
          verbose=2,
          validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, show_accuracy=True, verbose=0)
    print('Test accuracy:', acc)

    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [82]:
best_run, best_model = optim.minimize(model=model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),
                                          notebook_name='data_processing')

X_train, Y_train, X_test, Y_test = data()
print("Evalutation of best performing model:")
print(best_model.evaluate(X_test, Y_test))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 6850: character maps to <undefined>