# Predicting NPS score based on user feedback

In [152]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [41]:
# load NPS data from SatisMeter
df=pd.read_csv('data/train.csv', sep=',')

# filter out columns other than feedback and rating
df = df.filter(items=["feedback", "rating"])

# drop scores without feedback
df = df.dropna()

In [154]:
df[:10]

Unnamed: 0,feedback,rating
2,everything seems to be fine,8
6,A dedicated iPhone app and the tests for revie...,10
7,Pricing not relevant to the content I've seen.,2
12,keep it up ！！ thanks you all!!,10
16,Add more languages,10
17,I can't unsubscribe from your website.,0
18,we could do test more,10
19,I don't think I would change anything!! :),10
22,ferdy,1
24,Nothing I jast need to get up and lern,1


## Data preprocessing

In [155]:
# removing punctuation
from string import punctuation
df.feedback = df.feedback.str.replace('[^\w\s]','')

In [156]:
# join all feedback and split into words
all_text = ' '.join(df.feedback)
words = all_text.split()

In [157]:
words[:10]

['everything',
 'seems',
 'to',
 'be',
 'fine',
 'A',
 'dedicated',
 'iPhone',
 'app',
 'and']

### Encoding the words

In [158]:
# making a dictionary mapping words to integers
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)} # starting at 1, as we'll later pad input vectors with 0s

# converting feedback to integers
df['feedback_ints'] = [list(map(lambda w: vocab_to_int[w], f)) for f in list(map(str.split, df.feedback))]

In [159]:
feedback_lens = Counter([len(x) for x in df.feedback_ints])
print("Zero-length feedback: {}".format(feedback_lens[0]))
print("Maximum feedback length: {}".format(max(feedback_lens)))

Zero-length feedback: 3
Maximum feedback length: 481


In [160]:
seq_len = 200

# remove zero-length feedback
df = df[df.feedback_ints.apply(lambda x: len(x) > 0)]

# truncate feedback to 200 words
df.feedback_ints = df.feedback_ints.apply(lambda x: x[:seq_len])

# pad feedback longer than 200 words with 0s, add to a features array
features = np.zeros((len(df.feedback_ints), seq_len), dtype=int)
df = df.reset_index(drop=True)
for i, row in df.iterrows():
    features[i, -len(row.feedback_ints):] = row.feedback_ints[:seq_len]

In [161]:
#reshape labels for sparse_softmax_cross_entropy
labels = df.rating.values.reshape([-1, 1])

In [162]:
features[:10, 150:200]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  216,  282,    2,   14,  231],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  122,  647, 1179,   61,    5,    1,  305,    9,  648,   72,
           3,  170,   56,   15,   13,    7,    4, 1180,    9,  525,    6,
         649,    5,   13, 1181,   62,  525,    6,   50,  162,  344, 1182,
         123,  171,   28,    1,  448,  252],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,  650,   16,
         449,    2,   

## Training, validation, test



In [163]:
split_frac = 0.8 # fraction of data to keep in the training set
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(806, 200) 
Validation set: 	(101, 200) 
Test set: 		(101, 200)


## Build the graph
* `lstm_size`: Number of units in the hidden layers in the LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* `lstm_layers`: Number of LSTM layers in the network. Start with 1, then add more if I'm underfitting.
* `batch_size`: The number of feedback messages to feed the network in one training pass. Typically this should be set as high as you can go without running out of memory.
* `learning_rate`: Learning rate

In [164]:
lstm_size = 512
lstm_layers = 1
batch_size = 100
learning_rate = 0.001

We'll be passing in our 200 element long feedback vectors. Each batch will be `batch_size` vectors. We'll also be using dropout on the LSTM layer with `keep_prob`.

In [165]:
n_words = len(vocab_to_int) + 1 # adding 1 because we use 0s for padding, dictionary started at 1

# create the graph object
graph = tf.Graph()
# add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int64, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int64, [None, 1], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### Embedding


In [166]:
# size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

### LSTM cell

In [167]:
with graph.as_default():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

### RNN forward pass

In [168]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

### Output

In [169]:
with graph.as_default():
    logits = tf.layers.dense(inputs=outputs[:, -1], units=11)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

### Validation accuracy

Here we add a few nodes to calculate the accuracy which we'll use in the validation pass.

In [170]:
with graph.as_default():
    correct_prediction = tf.equal(tf.argmax(logits, 1), labels_)
    correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)

### Batching

This is a simple function for returning batches from the data. First it removes data such that we only have full batches. Then it iterates through the `x` and `y` arrays and returns slices out of those arrays with size `[batch_size]`.

In [171]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

## Training

In [172]:
epochs = 15

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    initial_state: state}
            loss_val, state, _ = sess.run([loss, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss_val))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/nps.ckpt")

Epoch: 0/15 Iteration: 5 Train loss: 1.945
Epoch: 1/15 Iteration: 10 Train loss: 1.727
Epoch: 1/15 Iteration: 15 Train loss: 1.950
Epoch: 2/15 Iteration: 20 Train loss: 1.646
Epoch: 3/15 Iteration: 25 Train loss: 1.634
Val acc: 0.277
Epoch: 3/15 Iteration: 30 Train loss: 1.655
Epoch: 4/15 Iteration: 35 Train loss: 1.475
Epoch: 4/15 Iteration: 40 Train loss: 1.418
Epoch: 5/15 Iteration: 45 Train loss: 1.074
Epoch: 6/15 Iteration: 50 Train loss: 0.763
Val acc: 0.213
Epoch: 6/15 Iteration: 55 Train loss: 1.043
Epoch: 7/15 Iteration: 60 Train loss: 0.901
Epoch: 8/15 Iteration: 65 Train loss: 0.775
Epoch: 8/15 Iteration: 70 Train loss: 0.668
Epoch: 9/15 Iteration: 75 Train loss: 0.644
Val acc: 0.178
Epoch: 9/15 Iteration: 80 Train loss: 0.591
Epoch: 10/15 Iteration: 85 Train loss: 0.468
Epoch: 11/15 Iteration: 90 Train loss: 0.313
Epoch: 11/15 Iteration: 95 Train loss: 0.387
Epoch: 12/15 Iteration: 100 Train loss: 0.325
Val acc: 0.213
Epoch: 13/15 Iteration: 105 Train loss: 0.310
Epoch: 13/

## Testing

In [173]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/nps.ckpt
Test accuracy: 0.219
