In [19]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lingualTF as la

import tensorflow as tf
#import keras

from keras.models import Sequential
from keras.layers import Convolution1D, MaxPooling1D, AveragePooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

In [20]:
####################################
# REPLACE THESE WITH CORRECT PATHS #
####################################
docsFolder = '/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/data_dsicap_single/'
docRanks = pd.read_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/refData/docRanks.csv')
#docRanks.head()

In [21]:
#vocab from this list http://www.wordfrequency.info/5k_lemmas_download.asp
w5k = pd.read_table('refData/5000-words.txt')

## process words to match lingualObject processing
w5k['Word'] = la.cleanTokens(w5k['Word'])

vocab = list(w5k['Word'][:1000]) ##### USE ONLY THE TOP 1000 (we could try more later)

# add UNKNOWN and VALUE tokens
vocab += ['UNK', 'VAL']
len(vocab)

1002

In [22]:
# Parts of Speech ##### Thought about using only POS + UNK + VAL, could be an option...
pos = list(set(w5k['POS']))
pos

['a', 'c', 'e', 'd', 'i', 'j', 'm', 'n', 'p', 'r', 'u', 't', 'v', 'x']

## Input Documents

In [23]:
def docToNumpy(doc, vocab, keywordCount=20, lengthThreshold=500, warnOnPad=False):
    # tokenize, etc. document
    self = la.lingualObject(doc)
    fileName = self.fileList[0]
    # set the keywords
    self.setKeywords(wordCount=keywordCount)
    # create array of each word in doc with len(vocab)-dimensional sparse one-hot vectors
    sparse = [la.wordToInd(word, vocab, self.keywords) for word in self.tokens[fileName]]
    # if less words than the threshold, pad with UNK
    if len(sparse) < lengthThreshold:
        if warnOnPad==True:
            print('PADDING: ' + doc + ' has only ' + str(len(sparse)) + ' words. Adding ' + str((lengthThreshold - len(sparse))) + ' UNKs.')
        sparse += [(len(vocab) - 2)] * (lengthThreshold - len(sparse))
    # create dense array of zeros, then fill in the correct ones
    docArr = np.zeros((lengthThreshold,len(vocab)), dtype=np.int) # default is 500, 1002
    for i in range(0, lengthThreshold):
        docArr[i][sparse[i]] = 1
    # 
    return docArr

## Process data (set up X and Y as numpy arrays)

In [26]:
# construct X matrix FROM SCRATCH
#X = np.array([docToNumpy(docsFolder + adoc + '/raw/' + adoc + '.txt', vocab) for adoc in tqdm(docRanks['groupName'])])
#np.save('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k.npy', X)
#X.shape

In [25]:
# OR load X matrix from disk (already processed)
X = np.load('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k.npy')
X.shape

(193, 500, 1002)

In [27]:
# construct Y matrix
Y = np.array([np.array([y]) for y in docRanks['rank']])
Y.shape

(193, 1)

In [28]:
# make a Y vector for classification
Y_cat = np.zeros((len(Y),9))

for i in range(0,len(Y)):
    Y_cat[i][Y[i]-1] = Y[i]

Y_cat.shape

(193, 9)

## shuffle the data into training and testing

In [29]:
# set the seed if you want to
np.random.seed(123)
#
shuf = docRanks.sample(frac=1).index
shuf

Int64Index([ 72,   4,  79, 171, 112,  71,  90,  93, 182, 147,
            ...
             96,  57, 123, 106,  83,  17,  98,  66, 126, 109],
           dtype='int64', length=193)

In [30]:
splitPoint = int(len(shuf) * .8)
#train = docRanks.iloc[shuf[:splitPoint]]
#test = docRanks.iloc[shuf[(splitPoint+1):]]

In [40]:
X_train = X[shuf[:splitPoint]]
Y_train = Y[shuf[:splitPoint]]
Y_cat_train = Y_cat[shuf[:splitPoint]]

X_test = X[shuf[(splitPoint+1):]]
Y_test = Y[shuf[(splitPoint+1):]]
Y_cat_test = Y_cat[shuf[(splitPoint+1):]]

## train the regression model

In [89]:
model = Sequential()
model.add(Convolution1D(64, 5, border_mode='same', input_shape=(X.shape[1],X.shape[2]), activation='relu'))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(AveragePooling1D(pool_length=2))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))


In [90]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.25))
#
#model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
model.add(Dense(1)) # for mse, below (regression)

In [91]:
#model.compile(loss='categorical_cross_entropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

In [92]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
#model.fit(X_train, Y_train, nb_epoch=10, batch_size=32)
model.fit(X_train, Y_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11e4cea50>

### evaluate

In [93]:
model.evaluate(X_test, Y_test, batch_size=1)



[3.9986638426780701, 1.6649818671377081]

In [94]:
y_pred = model.predict(X_test)

In [95]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

In [96]:
DSIacc(Y_test, y_pred)


0.3157894736842105

## train the classification model

In [97]:
model = Sequential()
model.add(Convolution1D(64, 5, border_mode='same', input_shape=(X.shape[1],X.shape[2]), activation='relu'))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(AveragePooling1D(pool_length=2))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))


In [98]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.25))
#
model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
#model.add(Dense(1)) # for mse, below (regression)

In [99]:
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
#model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

In [101]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
#model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size=32)
model.fit(X_train, Y_cat_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11b41f7d0>

### evaluate

In [102]:
model.evaluate(X_test, Y_cat_test, batch_size=1)



[6.9426870157844141, 0.18421052631578946]

In [103]:
y_pred = model.predict(X_test)

In [104]:
# convert softmax vectors to class picks
y_pred_class = [[(np.argmax(pred)+1)] for pred in y_pred]

In [105]:
DSIacc(Y_test, y_pred_class)

0.5263157894736842

In [107]:
y_pred

array([[ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0.20761286,
         0.15884048,  0.0356454 ,  0.12994955,  0.01948033],
       [ 0.05280384,  0.08525699,  0.2155129 ,  0.09489761,  0