In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lingualTF as la

import tensorflow as tf
#import keras

from keras.models import Sequential
from keras.layers import Convolution1D, MaxPooling1D, AveragePooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [50]:
####################################
# REPLACE THESE WITH CORRECT PATHS #
####################################
docsFolder = '/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/data_dsicap_single/'
docRanks = pd.read_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/refData/docRanks.csv')
docRanks.shape

(280, 2)

In [51]:
docRanks = docRanks.ix[~((docRanks['groupName']=='YV03')|(docRanks['groupName']=='YV04'))]
docRanks.reset_index(inplace=True, drop=True)
docRanks.tail()

Unnamed: 0,groupName,rank
273,WBC347,2
274,WBC410,1
275,WBC418,1
276,WBC421,1
277,WBC422,1


In [52]:
#vocab from this list http://www.wordfrequency.info/5k_lemmas_download.asp
w5k = pd.read_table('refData/5000-words.txt')

## process words to match lingualObject processing
w5k['Word'] = la.cleanTokens(w5k['Word'])

vocab = list(w5k['Word'][:1000]) ##### USE ONLY THE TOP 1000 (we could try more later)

# add UNKNOWN and VALUE tokens
vocab += ['UNK', 'VAL']
len(vocab)

1002

In [53]:
# Parts of Speech ##### Thought about using only POS + UNK + VAL, could be an option...
pos = list(set(w5k['POS']))
pos

['a', 'c', 'e', 'd', 'i', 'j', 'm', 'n', 'p', 'r', 'u', 't', 'v', 'x']

## Input Documents

In [54]:
def docToNumpy(doc, vocab, keywordCount=20, lengthThreshold=500, warnOnPad=False):
    # tokenize, etc. document
    self = la.lingualObject(doc)
    fileName = self.fileList[0]
    # set the keywords
    self.setKeywords(wordCount=keywordCount)
    # create array of each word in doc with len(vocab)-dimensional sparse one-hot vectors
    sparse = [la.wordToInd(word, vocab, self.keywords) for word in self.tokens[fileName]]
    # if less words than the threshold, pad with UNK
    if len(sparse) < lengthThreshold:
        if warnOnPad==True:
            print('PADDING: ' + doc + ' has only ' + str(len(sparse)) + ' words. Adding ' + str((lengthThreshold - len(sparse))) + ' UNKs.')
        sparse += [(len(vocab) - 2)] * (lengthThreshold - len(sparse))
    # create dense array of zeros, then fill in the correct ones
    docArr = np.zeros((lengthThreshold,len(vocab)), dtype=np.int) # default is 500, 1002
    for i in range(0, lengthThreshold):
        docArr[i][sparse[i]] = 1
    # 
    return docArr

## Process data (set up X and Y as numpy arrays)

In [40]:
# construct X matrix FROM SCRATCH
X = [docToNumpy(docsFolder + adoc + '/raw/' + adoc + '.txt', vocab) for adoc in tqdm(docRanks['groupName'])]
X = np.array([docArr for docArr in X if docArr is not None])
np.save('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k-278.npy', X)
X.shape

100%|██████████| 278/278 [02:11<00:00,  1.08it/s]


(278, 500, 1002)

In [41]:
# OR load X matrix from disk (already processed)
#X = np.load('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k-193.npy')
X.shape

(278, 500, 1002)

In [55]:
# construct Y matrix
Y = np.array([np.array([y]) for y in docRanks['rank']])
Y.shape

(278, 1)

In [56]:
# make a Y vector for classification
Y_cat = np.zeros((len(Y),9))

for i in range(0,len(Y)):
    Y_cat[i][Y[i]-1] = Y[i]

Y_cat.shape

(278, 9)

## shuffle the data into training and testing

In [57]:
# set the seed if you want to
np.random.seed(123)
#
shuf = docRanks.sample(frac=1).index
shuf

Int64Index([223,  20, 163,  29, 254, 237,  53, 174, 151, 156,
            ...
             96, 225, 214,  57, 123, 106,  83,  17, 230,  98],
           dtype='int64', length=278)

In [58]:
splitPoint = int(len(shuf) * .8)
#train = docRanks.iloc[shuf[:splitPoint]]
#test = docRanks.iloc[shuf[(splitPoint+1):]]
splitPoint

222

In [59]:
X_train = X[shuf[:splitPoint]]
Y_train = Y[shuf[:splitPoint]]
Y_cat_train = Y_cat[shuf[:splitPoint]]

X_test = X[shuf[(splitPoint+1):]]
Y_test = Y[shuf[(splitPoint+1):]]
Y_cat_test = Y_cat[shuf[(splitPoint+1):]]

## train the regression model

In [78]:
model = Sequential()
model.add(Convolution1D(64, 5, border_mode='same', input_shape=(X.shape[1],X.shape[2]), activation='relu'))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(AveragePooling1D(pool_length=2))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))


In [79]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
#model.add(Dropout(0.25))
#
#model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
model.add(Dense(1)) # for mse, below (regression)

In [80]:
#model.compile(loss='categorical_cross_entropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

In [81]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
model.fit(X_train, Y_train, nb_epoch=10, batch_size=32)
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1572f1510>

### evaluate

In [82]:
model.evaluate(X_test, Y_test, batch_size=1)



[3.2785804767500268, 1.414863161607222]

In [83]:
y_pred = model.predict(X_test)
y_pred[:15]

array([[ 3.57582235],
       [ 3.57571745],
       [ 3.57571578],
       [ 3.57579255],
       [ 3.57573724],
       [ 3.57581782],
       [ 3.5758729 ],
       [ 3.57578802],
       [ 3.5757606 ],
       [ 3.57588077],
       [ 3.57573581],
       [ 3.57574964],
       [ 3.57576323],
       [ 3.57579875],
       [ 3.57584596]], dtype=float32)

In [84]:
Y_test[:15]

array([[6],
       [2],
       [6],
       [5],
       [1],
       [8],
       [5],
       [2],
       [2],
       [1],
       [4],
       [1],
       [6],
       [9],
       [5]])

In [85]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

In [86]:
DSIacc(Y_test, y_pred)


0.45454545454545453

## train the classification model

In [87]:
model = Sequential()
model.add(Convolution1D(64, 5, border_mode='same', input_shape=(X.shape[1],X.shape[2]), activation='relu'))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(AveragePooling1D(pool_length=2))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))


In [88]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.25))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [89]:
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
#model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

In [90]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
#model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size=32)
model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1658b18d0>

### evaluate

In [91]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[7.5096577687696975, 0.30909090909090908]

In [92]:
y_pred = model.predict(X_test)

In [93]:
# convert softmax vectors to class picks
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]

In [94]:
y_pred_class[:15]

[[4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4]]

In [95]:
DSIacc(Y_test, y_pred_class)

0.6

In [96]:
y_pred

array([[ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0.12958422,
         0.16597278,  0.08131333,  0.08702399,  0.00548415],
       [ 0.04131708,  0.05839049,  0.17507198,  0.25584197,  0