In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lingualTF as la

import tensorflow as tf
#import keras

from keras.models import Sequential
from keras.layers import Convolution1D, MaxPooling1D, AveragePooling1D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization

In [10]:
####################################
# REPLACE THESE WITH CORRECT PATHS #
####################################
docRanks = pd.read_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/refData/docRanks.csv')
#docRanks.head()

In [47]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

## get data from file (processed in earlier scripts)

In [11]:
# OR load X matrix from disk (already processed)
X = np.load('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k.npy')
X.shape

(193, 500, 1002)

In [12]:
# construct Y matrix
Y = np.array([np.array([y]) for y in docRanks['rank']])
Y.shape

(193, 1)

In [13]:
# make a Y vector for classification
Y_cat = np.zeros((len(Y),9))

for i in range(0,len(Y)):
    Y_cat[i][Y[i]-1] = Y[i]

Y_cat.shape

(193, 9)

## shuffle the data into training and testing

In [14]:
# set the seed if you want to
np.random.seed(123)
#
shuf = docRanks.sample(frac=1).index
shuf

Int64Index([ 72,   4,  79, 171, 112,  71,  90,  93, 182, 147,
            ...
             96,  57, 123, 106,  83,  17,  98,  66, 126, 109],
           dtype='int64', length=193)

In [15]:
splitPoint = int(len(shuf) * .8)
#train = docRanks.iloc[shuf[:splitPoint]]
#test = docRanks.iloc[shuf[(splitPoint+1):]]

In [16]:
X_train = X[shuf[:splitPoint]]
Y_train = Y[shuf[:splitPoint]]
Y_cat_train = Y_cat[shuf[:splitPoint]]

X_test = X[shuf[(splitPoint+1):]]
Y_test = Y[shuf[(splitPoint+1):]]
Y_cat_test = Y_cat[shuf[(splitPoint+1):]]

## regression model

In [195]:
model = Sequential()
model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))


In [196]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [197]:
#
#model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
model.add(Dense(1)) # for mse, below (regression)

In [198]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

In [199]:
#model.compile(loss='categorical_cross_entropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

### Batch size mattered a lot on this one
with `batch_size=1` we got huge numbers and 0 accuracy, with `batch_size=154` we got tiny numbers and 23% accuracy (basically all the 1's and maybe a few 2's). With `batch_size=32` we did a little better, but still not convincing...

In [200]:
#model.fit(X_train, Y_train, nb_epoch=10, batch_size=1)
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 154)
model.fit(X_train, Y_train, nb_epoch=10, batch_size=32) # 32 did way better than 1 on regression (opposite of classification)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23e14f350>

### evaluate

In [201]:
model.evaluate(X_test, Y_test, batch_size=1)



[5.7734026156708991, 1.7655861816908185]

In [202]:
y_pred = model.predict(X_test)
y_pred[:10]

array([[ 1.79850388],
       [ 1.10067928],
       [ 2.12132692],
       [ 1.8552072 ],
       [ 2.17894769],
       [ 1.88807547],
       [ 2.08990836],
       [ 1.97027564],
       [ 1.90136445],
       [ 2.4523294 ]], dtype=float32)

In [203]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [204]:
DSIacc(Y_test,y_pred)

0.34210526315789475

## classification model

In [205]:
model = Sequential()
model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))


In [206]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [207]:
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [208]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [209]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=32)
#model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25253af50>

### evaluate

In [210]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.5172321210174184, 0.44397439062595367, 0.18421052631578946]

In [211]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3]]

In [212]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [213]:
DSIacc(Y_test,y_pred_class)

0.5263157894736842

# DROPOUT

## regression model - WITH DROPOUT

In [219]:
model = Sequential()
model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))


In [220]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.5))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.25))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [221]:
#
model.add(Dense(1)) # for mse, below (regression)

In [222]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [223]:
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 1) #just like without dropout, this produced crazy predictions
model.fit(X_train,Y_train, nb_epoch=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x255001e90>

### evaluate

In [224]:
model.evaluate(X_test, Y_test, batch_size=1)



[5.977332562992447, 1.8425006646859019, 0.15789473684210525]

In [225]:
y_pred = model.predict(X_test)
y_pred[:10]

array([[ 1.48422098],
       [ 1.85205781],
       [ 1.77789748],
       [ 1.37081945],
       [ 1.16880774],
       [ 1.69702923],
       [ 2.09696007],
       [ 1.50669611],
       [ 1.35655892],
       [ 1.50866592]], dtype=float32)

In [226]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [227]:
DSIacc(Y_test,y_pred)

0.4473684210526316

## classification model - WITH DROPOUT

In [155]:
model = Sequential()
model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2])))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(128, 7, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(192, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))

model.add(Convolution1D(256, 3, border_mode='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_length=3))


In [156]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(4096, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.5))

#model.add(Dense(4096, init='normal'))
model.add(Dense(1000, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dropout(.25))

#model.add(Dense(1000, init='normal'))
model.add(Dense(200, init='normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [157]:
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [158]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [159]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
#model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=32)
model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22e5745d0>

### evaluate

In [160]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.628654969758109, 0.45906433737591695, 0.026315789473684209]

In [161]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[8], [6], [6], [8], [6], [6], [8], [6], [6], [6]]

In [162]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [163]:
DSIacc(Y_test,y_pred_class)

0.18421052631578946

## with a larger batch size (spoiler, it does really badly)

In [132]:
#THE FULL SETUP, compacted
model = Sequential(); model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2]))); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(128, 7, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(192, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(256, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3))
model.add(Flatten()); model.add(Dense(4096, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.5)); model.add(Dense(1000, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.25)); model.add(Dense(200, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu'))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)


In [133]:
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [134]:
model.fit(X_train,Y_cat_train, nb_epoch=10, batch_size=154)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2128cdad0>

### evaluate

In [135]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.4969771743605012, 0.4399002076763856, 0.18421052631578946]

In [136]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3]]

In [137]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [138]:
DSIacc(Y_test,y_pred_class)

0.5263157894736842

### NOTE: this looks a little encouraging, but it's not
It just guesses 3 for everything, and since about half the examples are 2,3,4, it counts those are correct. Also, if you run this identical thing a few times, it will come out differently, but always picking the same number for every testing observation. Last time it pick 7 for everything and got 10% accuracy.

I also tried batch size 32 and got about the same result.


## running for more epochs (back to batch size 1)

In [139]:
#THE FULL SETUP, compacted
model = Sequential(); model.add(Convolution1D(64, 11, border_mode='same', input_shape=(X.shape[1],X.shape[2]))); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(128, 7, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(192, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3)); model.add(Convolution1D(256, 3, border_mode='same')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(MaxPooling1D(pool_length=3))
model.add(Flatten()); model.add(Dense(4096, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.5)); model.add(Dense(1000, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu')); model.add(Dropout(.25)); model.add(Dense(200, init='normal')); model.add(BatchNormalization()); model.add(Activation('relu'))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)


In [140]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error', 'accuracy'])

In [141]:
model.fit(X_train, Y_cat_train, nb_epoch=20, batch_size= 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x218457950>

### evaluate

In [142]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[1.6345029257630046, 0.45906433737591695, 0.026315789473684209]

In [143]:
y_pred = model.predict(X_test)
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]
y_pred_class[:10]

[[8], [8], [4], [8], [8], [7], [3], [8], [8], [8]]

In [144]:
Y_test[:10]

array([[2],
       [4],
       [5],
       [2],
       [2],
       [1],
       [5],
       [2],
       [3],
       [1]])

In [145]:
DSIacc(Y_test,y_pred_class)

0.18421052631578946