In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lingualTF as la

import tensorflow as tf
#import keras

from keras.models import Sequential
from keras.layers import Convolution1D, AveragePooling1D, AveragePooling1D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization

[nltk_data] Downloading package wordnet to /Users/Seth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /Users/Seth/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


Using TensorFlow backend.


In [8]:
####################################
# REPLACE THESE WITH CORRECT PATHS #
####################################
docRanks = pd.read_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/refData/docRanks.csv')
#docRanks.head()
docRanks = docRanks.ix[~((docRanks['groupName']=='YV03')|(docRanks['groupName']=='YV04'))]
docRanks.reset_index(inplace=True, drop=True)
docRanks.tail()

Unnamed: 0,groupName,rank
273,WBC347,2
274,WBC410,1
275,WBC418,1
276,WBC421,1
277,WBC422,1


In [9]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

## get data from file (processed in earlier scripts)

In [10]:
# OR load X matrix from disk (already processed)
X = np.load('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k-278.npy')
X.shape

(278, 500, 1002)

In [11]:
# construct Y matrix
Y = np.array([np.array([y]) for y in docRanks['rank']])
Y.shape

(278, 1)

In [12]:
# make a Y vector for classification
Y_cat = np.zeros((len(Y),9))

for i in range(0,len(Y)):
    Y_cat[i][Y[i]-1] = Y[i]

Y_cat.shape

(278, 9)

In [13]:
X[0]

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## shuffle the data into training and testing

In [20]:
# set the seed if you want to
np.random.seed(123)
#
shuf = docRanks.sample(frac=1).index
shuf

Int64Index([ 72,   4,  79, 171, 112,  71,  90,  93, 182, 147,
            ...
             96,  57, 123, 106,  83,  17,  98,  66, 126, 109],
           dtype='int64', length=193)

In [21]:
splitPoint = int(len(X) * .8)
#train = docRanks.iloc[shuf[:splitPoint]]
#test = docRanks.iloc[shuf[(splitPoint+1):]]
splitPoint

154

In [22]:
X_train = X[shuf[:splitPoint]]
Y_train = Y[shuf[:splitPoint]]
Y_cat_train = Y_cat[shuf[:splitPoint]]

X_test = X[shuf[(splitPoint+1):]]
Y_test = Y[shuf[(splitPoint+1):]]
Y_cat_test = Y_cat[shuf[(splitPoint+1):]]

## regression model

In [23]:
model = Sequential()
model.add(Convolution1D(64, 3, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
#model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))


In [24]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [25]:
#
#model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
model.add(Dense(1)) # for mse, below (regression)

In [26]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

In [27]:
#model.compile(loss='categorical_cross_entropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

### Batch size mattered a lot on this one
with `batch_size=1` we got huge numbers and 0 accuracy, with `batch_size=154` we got tiny numbers and 23% accuracy (basically all the 1's and maybe a few 2's). With `batch_size=32` we did a little better, but still not convincing...

In [28]:
#model.fit(X_train, Y_train, nb_epoch=10, batch_size=1)
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 154)
model.fit(X_train, Y_train, nb_epoch=10, batch_size=32) # 32 did way better than 1 on regression (opposite of classification)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2e0ed6610>

### evaluate

In [29]:
model.evaluate(X_test, Y_test, batch_size=1)



[6.7362385999754464, 1.9637867494633323]

In [30]:
y_pred = model.predict(X_test)
y_pred[:10]

array([[ 1.50487661],
       [ 2.2623558 ],
       [ 1.64483905],
       [ 2.32016873],
       [ 2.46636415],
       [ 2.67332745],
       [ 1.48190165],
       [ 2.11560202],
       [ 4.07303762],
       [ 1.78270376]], dtype=float32)

In [31]:
Y_test[:10]

array([[4],
       [3],
       [1],
       [3],
       [3],
       [5],
       [2],
       [4],
       [2],
       [4]])

In [32]:
DSIacc(Y_test,y_pred)

0.39473684210526316

## classification model

In [33]:
model = Sequential()
model.add(Convolution1D(64, 3, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
#model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))


In [34]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [35]:
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [36]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [37]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=32)
#model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2e36a5f10>

### evaluate

In [38]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.9382272260753732, 0.52708534308170019, 0.0]

In [39]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[9], [9], [9], [9], [9], [9], [9], [9], [9], [9]]

In [40]:
Y_test[:10]

array([[4],
       [3],
       [1],
       [3],
       [3],
       [5],
       [2],
       [4],
       [2],
       [4]])

In [41]:
DSIacc(Y_test,y_pred_class)

0.05263157894736842

# DROPOUT

## regression model - WITH DROPOUT

In [42]:
model = Sequential()
model.add(Convolution1D(64, 3, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
#model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))


In [43]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.5))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.25))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [44]:
#
model.add(Dense(1)) # for mse, below (regression)

In [45]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [46]:
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 1) #just like without dropout, this produced crazy predictions
model.fit(X_train,Y_train, nb_epoch=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2f2e36990>

### evaluate

In [47]:
model.evaluate(X_test, Y_test, batch_size=1)



[10.75673301184648, 2.7896891173563505, 0.10526315789473684]

In [48]:
y_pred = model.predict(X_test)
y_pred[:10]

array([[ 0.7896136 ],
       [ 0.68153262],
       [ 1.1205163 ],
       [ 0.94476587],
       [ 1.22108889],
       [ 1.24291325],
       [ 1.58358204],
       [ 1.77549183],
       [ 2.11056042],
       [ 0.94681972]], dtype=float32)

In [49]:
Y_test[:10]

array([[4],
       [3],
       [1],
       [3],
       [3],
       [5],
       [2],
       [4],
       [2],
       [4]])

In [50]:
DSIacc(Y_test,y_pred)

0.13157894736842105

## classification model - WITH DROPOUT

In [51]:
model = Sequential()
model.add(Convolution1D(64, 3, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
#model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(AveragePooling1D(pool_length=3))


In [52]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.5))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.25))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [53]:
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [54]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [55]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
#model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=32)
model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3075dfad0>

### evaluate

In [56]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[2.0233917981386185, 0.5146198970706839, 0.15789473684210525]

In [57]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[3], [1], [9], [3], [3], [3], [3], [3], [3], [1]]

In [58]:
Y_test[:10]

array([[4],
       [3],
       [1],
       [3],
       [3],
       [5],
       [2],
       [4],
       [2],
       [4]])

In [59]:
DSIacc(Y_test,y_pred_class)

0.47368421052631576

## with a larger batch size (spoiler, it does really badly)

In [132]:
#THE FULL SETUP, compacted
model = Sequential(); model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2]))); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(128, 7, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(192, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(256, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3))
model.add(Flatten()); model.add(Dense(4096, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.5)); model.add(Dense(1000, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.25)); model.add(Dense(200, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu'))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)


In [133]:
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [134]:
model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=154)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2128cdad0>

### evaluate

In [135]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.4969771743605012, 0.4399002076763856, 0.18421052631578946]

In [136]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3]]

In [137]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [138]:
DSIacc(Y_test,y_pred_class)

0.5263157894736842

### NOTE: this looks a little encouraging, but it's not
It just guesses 3 for everything, and since about half the examples are 2,3,4, it counts those are correct. Also, if you run this identical thing a few times, it will come out differently, but always picking the same number for every testing observation. Last time it pick 7 for everything and got 10% accuracy.

I also tried batch size 32 and got about the same result.


## running for more epochs (back to batch size 1)

In [139]:
#THE FULL SETUP, compacted
model = Sequential(); model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2]))); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(128, 7, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(192, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3)); model.add(Convolution1D(256, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(AveragePooling1D(pool_length=3))
model.add(Flatten()); model.add(Dense(4096, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.5)); model.add(Dense(1000, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.25)); model.add(Dense(200, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu'))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)


In [140]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [141]:
model.fit(X_train, Y_cat_train, nb_epoch=20, batch_size= 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x218457950>

### evaluate

In [142]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.6345029257630046, 0.45906433737591695, 0.026315789473684209]

In [143]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[8], [8], [4], [8], [8], [7], [3], [8], [8], [8]]

In [144]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [145]:
DSIacc(Y_test,y_pred_class)

0.18421052631578946