Preamble

In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib.pyplot as plt
import time
import pandas as pd

Import Data

In [2]:
def csv_to_numpy_array(filePath, delimiter):
    """Read a delimited text file into a NumPy array.

    Args:
        filePath: path (or any other source accepted by ``np.genfromtxt``)
            of the file to read.
        delimiter: string that separates the values on each line.

    Returns:
        The array produced by ``np.genfromtxt``. Because ``dtype=None`` is
        passed, NumPy infers the element type from the file contents rather
        than forcing floats.
    """
    parsed = np.genfromtxt(filePath, dtype=None, delimiter=delimiter)
    return parsed

In [3]:
# X is the TF matrix, Y is the binary response
# All four CSVs live in the same directory, so that location is kept in one
# place. Set the DSICAP_DATA_DIR environment variable to point the notebook
# at another copy of the data; when it is unset the original location is used.
dataDir = os.environ.get("DSICAP_DATA_DIR",
                         "/Users/leighharton/Documents/MSDS/data_dsicap")
trainX = csv_to_numpy_array(os.path.join(dataDir, "trainX.csv"), delimiter=",")
trainY = csv_to_numpy_array(os.path.join(dataDir, "trainY.csv"), delimiter=",")
testX = csv_to_numpy_array(os.path.join(dataDir, "testX.csv"), delimiter=",")
testY = csv_to_numpy_array(os.path.join(dataDir, "testY.csv"), delimiter=",")

In [4]:
# remove the index column from dataset
# Column 0 is dropped and every remaining column is kept. The slice is left
# open-ended so train and test always keep the same number of columns,
# whatever the width of the matrices.
# NOTE: each array is overwritten in place, so run this cell once per load;
# every extra run strips one more column.
trainX = trainX[:, 1:]
trainY = trainY[:, 1:]
testX = testX[:, 1:]
testY = testY[:, 1:]

In [6]:
# remove column headers
# Row 0 (the CSV header) is dropped and every data row after it is kept. The
# slice is open-ended so no rows are lost if the files hold more rows than a
# hard-coded count would expect.
# NOTE: each array is overwritten in place, so run this cell once per load;
# every extra run strips one more row.
trainX = trainX[1:]
trainY = trainY[1:]
testX = testX[1:]
testY = testY[1:]

Global Parameters

In [7]:
# DATA SET PARAMETERS
# Both sizes are read from the second axis (columns) of the training arrays,
# so the placeholders and variables are dimensioned from the data itself:
# numFeatures = the number of words extracted from each article
# numLabels = number of classes we're predicting (here it's just 2 - Dorothy Day and Rabbinic)
numFeatures, numLabels = trainX.shape[1], trainY.shape[1]

In [8]:
# TRAINING SESSION PARAMETERS
# numEpochs = number of passes over the training data
# (tensorboard shows that accuracy plateaus at ~25k epochs)
numEpochs = 27000

# learningRate = exponentially decaying rate handed to the gradient-descent
# optimizer: starts at 0.0008 and is multiplied by 0.95 once per
# `decay_steps` (= number of training rows) steps, in discrete jumps
# because staircase is True.
# NOTE(review): global_step is the constant 1, not a step counter that
# advances during training, so the schedule is always evaluated at the same
# step. With staircase=True and more than one training row the exponent
# floors to 0, i.e. the rate presumably stays at 0.0008 for the whole run.
# TODO confirm; a real decay needs a step variable that is also passed to
# minimize().
learningRate = tf.train.exponential_decay(
    learning_rate=0.0008,
    global_step=1,
    decay_steps=trainX.shape[0],
    decay_rate=0.95,
    staircase=True)

Placeholders

In [9]:
# X = X-matrix/feature-matrix/data-matrix: the tensor that holds the article
# data, one row per article and numFeatures columns. The 'None' first
# dimension means that any number of articles can be fed in.
X = tf.placeholder(dtype=tf.float32, shape=[None, numFeatures])
# yGold = Y-matrix/label-matrix/labels: the correct answers, one row per
# article. Every row has either [1,0] for Dorothy Day or [0,1] for Rabbinic.
# The 'None' first dimension again means any number of articles.
yGold = tf.placeholder(dtype=tf.float32, shape=[None, numLabels])

Variables

In [10]:
# Values are randomly sampled from a zero-mean Gaussian with a standard deviation of:
# sqrt(6/(numInputNodes + numOutputNodes + 1))
# The whole sum sits inside the parentheses so that it divides 6; without
# them only numFeatures would be the divisor and the stddev would come out
# larger than sqrt(numLabels + 1).
initStddev = np.sqrt(6 / (numFeatures + numLabels + 1))

# `name` is given to tf.Variable (not to the random_normal initializer) so
# that the variables themselves are labelled "weights" and "bias" in the graph.
# weights: one column of per-feature coefficients for each label.
weights = tf.Variable(tf.random_normal([numFeatures, numLabels],
                                       mean=0,
                                       stddev=initStddev),
                      name="weights")
# bias: a single row with one offset per label.
bias = tf.Variable(tf.random_normal([1, numLabels],
                                    mean=0,
                                    stddev=initStddev),
                   name="bias")

Ops

In [11]:
######################
### PREDICTION OPS ###
######################
# INITIALIZE our weights and biases: running this op assigns every variable
# created so far its initial value.
init_OP = tf.initialize_all_variables()

# PREDICTION ALGORITHM  i.e. FEEDFORWARD ALGORITHM
# Three chained ops: X * weights, then + bias, then an element-wise sigmoid
# that squashes each output into the range (0, 1).
apply_weights_OP = tf.matmul(X, weights, name="apply_weights")
add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias")
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")

In [13]:
#####################
### EVALUATION OP ###
#####################
# COST FUNCTION i.e. SQUARED ERROR
# tf.nn.l2_loss returns half the SUM of the squared differences between the
# predictions and the gold labels (sum(t ** 2) / 2); it does not divide by
# the number of examples, so this is a summed rather than a mean error.
cost_OP = tf.nn.l2_loss(activation_OP - yGold, name="squared_error_cost")

In [14]:
#######################
### OPTIMIZATION OP ###
#######################
# OPTIMIZATION ALGORITHM i.e. GRADIENT DESCENT
# Running training_OP performs one gradient-descent update of the trainable
# variables, using learningRate as the step size, to reduce cost_OP.
training_OP = (
    tf.train.GradientDescentOptimizer(learning_rate=learningRate)
    .minimize(loss=cost_OP)
)

Computational Graph