In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import lingualTF as la

import tensorflow as tf
#import keras

from keras.models import Sequential
from keras.layers import Convolution1D, MaxPooling1D, AveragePooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [2]:
####################################
# REPLACE THESE WITH CORRECT PATHS #
####################################
docsFolder = '/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/data_dsicap_single/'
docRanks = pd.read_csv('/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/refData/docRanks.csv')
#docRanks.head()

In [3]:
#vocab from this list http://www.wordfrequency.info/5k_lemmas_download.asp
w5k = pd.read_table('refData/5000-words.txt')

## process words to match lingualObject processing
w5k['Word'] = la.cleanTokens(w5k['Word'])

vocab = list(w5k['Word'][:1000]) ##### USE ONLY THE TOP 1000 (we could try more later)

# add UNKNOWN and VALUE tokens
vocab += ['UNK', 'VAL']
len(vocab)

1002

In [4]:
# Parts of Speech ##### Thought about using only POS + UNK + VAL, could be an option...
pos = list(set(w5k['POS']))
pos

['a', 'c', 'e', 'd', 'i', 'j', 'm', 'n', 'p', 'r', 'u', 't', 'v', 'x']

## Input Documents

In [5]:
def docToNumpy(doc, vocab, keywordCount=20, lengthThreshold=500, warnOnPad=False):
    # tokenize, etc. document
    self = la.lingualObject(doc)
    fileName = self.fileList[0]
    # set the keywords
    self.setKeywords(wordCount=keywordCount)
    # create array of each word in doc with len(vocab)-dimensional sparse one-hot vectors
    sparse = [la.wordToInd(word, vocab, self.keywords) for word in self.tokens[fileName]]
    # if less words than the threshold, pad with UNK
    if len(sparse) < lengthThreshold:
        if warnOnPad==True:
            print('PADDING: ' + doc + ' has only ' + str(len(sparse)) + ' words. Adding ' + str((lengthThreshold - len(sparse))) + ' UNKs.')
        sparse += [(len(vocab) - 2)] * (lengthThreshold - len(sparse))
    # create dense array of zeros, then fill in the correct ones
    docArr = np.zeros((lengthThreshold,len(vocab)), dtype=np.int) # default is 500, 1002
    for i in range(0, lengthThreshold):
        docArr[i][sparse[i]] = 1
    # 
    return docArr

## Process data (set up X and Y as numpy arrays)

In [6]:
# construct X matrix FROM SCRATCH
X = np.array([docToNumpy(docsFolder + adoc + '/raw/' + adoc + '.txt', vocab) for adoc in tqdm(docRanks['groupName'])])
np.save('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k.npy', X)
X.shape

100%|██████████| 193/193 [01:16<00:00,  1.21it/s]


(193, 500, 1002)

In [7]:
# OR load X matrix from disk (already processed)
X = np.load('/Users/Seth/Documents/DSI/Capstone/big-data/X-single-1k.npy')
X.shape

(193, 500, 1002)

In [8]:
# construct Y matrix
Y = np.array([np.array([y]) for y in docRanks['rank']])
Y.shape

(193, 1)

## shuffle the data into training and testing

In [9]:
# set the seed if you want to
np.random.seed(123)
#
shuf = docRanks.sample(frac=1).index
shuf

Int64Index([ 72,   4,  79, 171, 112,  71,  90,  93, 182, 147,
            ...
             96,  57, 123, 106,  83,  17,  98,  66, 126, 109],
           dtype='int64', length=193)

In [10]:
splitPoint = int(len(shuf) * .8)
#train = docRanks.iloc[shuf[:splitPoint]]
#test = docRanks.iloc[shuf[(splitPoint+1):]]

In [11]:
X_train = X[shuf[:splitPoint]]
Y_train = Y[shuf[:splitPoint]]

X_test = X[shuf[(splitPoint+1):]]
Y_test = Y[shuf[(splitPoint+1):]]

## convolution1d first layer

In [12]:
model = Sequential()
model.add(Convolution1D(64, 5, border_mode='same', input_shape=(X.shape[1],X.shape[2]), activation='relu'))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(AveragePooling1D(pool_length=2))
model.add(Convolution1D(32, 3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))


In [13]:
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.25))
#
#model.add(Dense(9, activation='softmax')) # for categorical_cross_entropy, below (classification)
model.add(Dense(1)) # for mse, below (regression)

In [14]:
def DSIacc(y_true, y_pred):
    return float(len([i for i in range(len(y_pred)) if abs(y_true[i][0]-y_pred[i][0])<=1])/float(len(y_pred)))

In [15]:
#model.compile(loss='categorical_cross_entropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mean_absolute_error'])

In [16]:
#model.fit(data, labels, nb_epoch=10, batch_size=32) ### generic call from documentation
model.fit(X_train, Y_train, nb_epoch=10, batch_size=32)
#model.fit(X_train, Y_train, nb_epoch=10, batch_size= 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11762c610>

### evaluate

In [18]:
model.evaluate(X_test, Y_test, batch_size=1)



[4.6987999804979683, 1.8449475137810958]

In [19]:
y_pred = model.predict(X_test)

In [20]:
DSIacc(Y_test, y_pred)


0.2631578947368421