In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are mounted under ../input.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In this notebook we're going to try to classify the sentiment of tweets using TensorFlow Hub (tf hub).

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Load the labelled tweets from the input directory.
sentiment_csv = '../input/Sentiment.csv'
df = pd.read_csv(sentiment_csv)

In [None]:
# Peek at the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

The nice thing about this dataset is that it includes the sentiment classification confidence.

In [None]:
# Distribution of the label-confidence scores.
df['sentiment_confidence'].plot(kind='hist')

I'm going to (pretty much) arbitrarily remove that lower interval chunk of data (confidence of 0.5 or below) because the confidence level is so low

Then take an even split of the 3 classifications

In [None]:
# Keep only rows whose label confidence is strictly above 0.5.
df = df[df['sentiment_confidence'] > 0.5]

# Shuffle each class with a fixed seed so the balanced subset (and everything
# trained on it) is the same on every "Restart & Run All".
df_pos = df[df['sentiment'] == 'Positive'].sample(frac=1, random_state=42)
df_neg = df[df['sentiment'] == 'Negative'].sample(frac=1, random_state=42)
df_neu = df[df['sentiment'] == 'Neutral'].sample(frac=1, random_state=42)

# The smallest class decides how many rows every class contributes.
sample_size = min(len(df_pos), len(df_neg), len(df_neu))

# Balanced dataset: the same number of rows per class, keeping only the raw
# text and its label.
df_ = pd.concat([df_pos.head(sample_size), df_neg.head(sample_size), df_neu.head(sample_size)])[['text', 'sentiment']]

In [None]:
# The balanced subset df_ is what the rest of the notebook uses; unbind the
# filtered full frame.
del df
# Echo how many rows were taken from each class.
sample_size

In [None]:
import re

def clean_str(string):
    """
    Tokenization/string cleaning for tweets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order: strip a leading retweet prefix ("RT @user:"), replace
    URLs with the token "url", blank out every character that is not a
    letter, digit or one of ( ) , ! ? ' `, split common English contraction
    suffixes from their stem, pad , ! ( ) ? with spaces so each becomes its
    own token, collapse whitespace runs, then strip and lower-case.

    Parameters
    ----------
    string : object
        Raw text; coerced with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned, lower-cased, space-separated text.
    """
    string = str(string)
    # Remove the retweet prefix. Only cut when a colon is actually present:
    # str.find returns -1 otherwise, and slicing from -1 + 2 would silently
    # drop the leading "R".
    if string[:4] == 'RT @':
        colon = string.find(':')
        if colon != -1:
            # Whitespace left after the colon is removed by the final strip().
            string = string[colon + 1:]
    string = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach contraction suffixes so they become separate tokens.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Pad punctuation with spaces. The replacement strings are plain text:
    # writing " \( " here would leave a literal backslash in the output,
    # because re.sub keeps unknown non-letter escapes untouched.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

As you can see from above, this data needs a bit of cleaning - I'm going to borrow some code and adapt it to clean the data

In [None]:
# Run every raw tweet through clean_str and keep the result next to the original.
df_['clean'] = df_['text'].map(clean_str)

So this is what our final dataset looks like

In [None]:
# First three rows of the balanced, cleaned dataset.
df_.head(n=3)

tf hub has this great set of text embeddings, I'm going to use the universal sentence encoder here - this step can take a while to run because it has to download the embeddings from tf hub (note if you're in a kaggle notebook you have to auth internet access)

In [None]:
# Feature column that feeds the "clean" text through the Universal Sentence
# Encoder module from TF Hub.
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
text_embeddings = hub.text_embedding_column(
    key="clean",
    module_spec=use_module_url,
    trainable=True,
)

Split the data into a training and testing set

In [None]:
# Hold out 30% of the balanced data for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_['clean'],
    df_['sentiment'],
    test_size=0.3,
    random_state=42,
)

Then we need to create a `head`. This is what the estimator will train against.

Here, our data is one of 3 classes: Positive, Negative or Neutral. If a tweet could be a mix of more than one class we would use a `multi_label_head`, but since each tweet can only belong to one class we use the `multi_class_head`

Note the `label_vocabulary` here - this tells the head that these are the classes used. If you don't specify this then you get all sorts of cryptic failures when you try to train the estimator

In [None]:
# The three sentiment labels the head should recognise.
sentiment_classes = ['Positive', 'Neutral', 'Negative']

multi_class_head = tf.contrib.estimator.multi_class_head(
    n_classes=len(sentiment_classes),
    label_vocabulary=sentiment_classes,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE,
)

At this point we get to pick the architecture of our NN.

Along with passing our head, we also get to specify how many hidden units we have and the optimizer (among other things if you are so inclined)

The general rule of thumb for fully connected networks is you should go down in size as the layers progress - but play about, you could get better results trying something else

In [None]:
# Three fully connected layers that shrink as they go, sitting on top of the
# sentence-embedding feature column.
hidden_layer_sizes = [256, 128, 64]

estimator = tf.contrib.estimator.DNNEstimator(
    head=multi_class_head,
    hidden_units=hidden_layer_sizes,
    feature_columns=[text_embeddings],
    optimizer=tf.train.AdamOptimizer(),
)

Then we have to specify our dictionary for tf

In [None]:
# The feature dict is keyed by the same name ("clean") that the embedding
# column was created with; labels are a parallel array.
labels = np.array(y_train)
features = {"clean": np.array(X_train)}

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=features,
    y=labels,
    batch_size=32,
    num_epochs=20,
    shuffle=True,
)

Then let's train the thing!

In [None]:
# Fit the estimator on the batches produced by train_input_fn.
estimator.train(input_fn=train_input_fn)

We can then evaluate how good this model is with `estimator.evaluate`

In [None]:
# Evaluation input over the held-out split, unshuffled.
# The builtin `str` replaces `np.str`, which was only ever an alias for it and
# has been removed from recent NumPy releases; the resulting dtype is the same.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    {"clean": np.array(X_test).astype(str)},
    np.array(y_test),
    shuffle=False
)

# Returns the evaluation metrics computed on the held-out data.
estimator.evaluate(input_fn=eval_input_fn)