In [1]:
import tensorflow as tf
import keras

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell #Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Suppresses ALL Python warnings from this point on (not only the unnecessary
# ones), so deprecation notices from the libraries used below are hidden too.
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

# ColorBrewer2 "Dark2" qualitative colour table, as RGB tuples in [0, 1].
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Global matplotlib defaults applied to every figure drawn after this cell.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and later removed, so
# assigning it raises KeyError on current releases. 'axes.prop_cycle' is the
# supported key for the default line colours and takes a Cycler object.
rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

In [3]:
import re
# Ordered (pattern, replacement) rewrite rules applied by clean_str.
# Replacements deliberately contain no backslashes: re.sub keeps unknown
# escapes such as "\(" verbatim, which would leak a literal backslash into
# the resulting tokens.
_CLEAN_STR_RULES = (
    (r"[^A-Za-z0-9(),!?\'\`]", " "),  # blank out every character outside the kept set
    (r"\'s", " 's"),                  # split common English clitics off their host word
    (r"\'ve", " 've"),
    (r"n\'t", " n't"),
    (r"\'re", " 're"),
    (r"\'d", " 'd"),
    (r"\'ll", " 'll"),
    (r",", " , "),                    # surround kept punctuation with spaces
    (r"!", " ! "),
    (r"\(", " ( "),
    (r"\)", " ) "),
    (r"\?", " ? "),
    (r"\s{2,}", " "),                 # collapse the whitespace runs created above
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Unlike the referenced original, parentheses and question marks are
    emitted as the bare tokens "(", ")" and "?" rather than with a leading
    backslash ("\\(", "\\)", "\\?").

    Args:
        string: Raw sentence text.

    Returns:
        The lower-cased, stripped sentence in which clitics ('s, 've, n't,
        're, 'd, 'll) and the punctuation , ! ( ) ? are separated from
        neighbouring words by single spaces, and every other character that
        is not a letter, digit, apostrophe or backtick is replaced by a space.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()


In [4]:
# Labelled sentences; later cells read the 'Sentence' and 'sentiment' columns.
sentence_df = pd.read_csv("sentence_df.csv")

In [5]:
def load_data_and_labels(sentence_df):
    """Tokenize the sentences of a labelled frame and build one-hot labels.

    Rows whose 'sentiment' equals 1 are treated as positive and rows whose
    'sentiment' equals 0 as negative; rows with any other value are ignored.
    All positive sentences come first in the output, followed by all
    negative ones.

    Args:
        sentence_df: DataFrame with a 'Sentence' text column and a
            'sentiment' column.

    Returns:
        A two-element list ``[x_text, y]`` where ``x_text`` is a list of
        token lists (each sentence stripped, passed through ``clean_str``
        and split on single spaces) and ``y`` is an array with one row per
        sentence: ``[0, 1]`` for positive, ``[1, 0]`` for negative.
    """
    is_positive = sentence_df['sentiment'] == 1
    is_negative = sentence_df['sentiment'] == 0

    # Strip surrounding whitespace before cleaning.
    positive_examples = [text.strip() for text in sentence_df[is_positive]['Sentence']]
    negative_examples = [text.strip() for text in sentence_df[is_negative]['Sentence']]

    # Clean and tokenize, positives first.
    x_text = [clean_str(text).split(" ") for text in positive_examples + negative_examples]

    # One-hot rows in the same order as x_text.
    positive_labels = [[0, 1]] * len(positive_examples)
    negative_labels = [[1, 0]] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [6]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """Right-pad every token list to the length of the longest one.

    Args:
        sentences: Sequence of token lists.
        padding_word: Token appended to shorter sentences.

    Returns:
        A new list of new token lists, all of equal length. The input lists
        are not modified. An empty ``sentences`` yields an empty list
        instead of raising ``ValueError`` from ``max()``.
    """
    # max() has no defined result for an empty input, so handle it up front.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        list(sentence) + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]

In [7]:
from collections import Counter
import itertools
def build_vocab(sentences):
    """Build word<->index mappings ordered by descending word frequency.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A two-element list ``[vocabulary, vocabulary_inv]`` where
        ``vocabulary_inv`` is the list of distinct words in the order given
        by ``Counter.most_common()`` and ``vocabulary`` maps each word to
        its position in that list.
    """
    frequencies = Counter(itertools.chain.from_iterable(sentences))
    # Index -> word
    vocabulary_inv = [token for token, _ in frequencies.most_common()]
    # Word -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

In [8]:
def build_input_data(sentences, labels, vocabulary):
    """Convert token lists and labels into numpy arrays.

    Args:
        sentences: Iterable of token lists; every token must be a key of
            ``vocabulary`` (a missing token raises ``KeyError``).
        labels: Array-like of labels, one entry per sentence.
        vocabulary: Mapping from word to integer index.

    Returns:
        A two-element list ``[x, y]``: ``x`` holds the vocabulary index of
        each token, row by row, and ``y`` is ``labels`` as an array.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[word] for word in sentence])
    return [np.array(index_rows), np.array(labels)]

In [9]:
def load_data(sentence_df):
    """Run the full preprocessing pipeline on a labelled sentence frame.

    Chains ``load_data_and_labels`` -> ``pad_sentences`` -> ``build_vocab``
    -> ``build_input_data``.

    Args:
        sentence_df: DataFrame passed straight to ``load_data_and_labels``.

    Returns:
        A four-element list ``[x, y, vocabulary, vocabulary_inv]``: the
        index matrix, the label array, the word -> index dict and the
        index -> word list.
    """
    tokenized, labels = load_data_and_labels(sentence_df)
    padded = pad_sentences(tokenized)
    # The vocabulary is built after padding, so the padding token gets an index too.
    vocabulary, vocabulary_inv = build_vocab(padded)
    x, y = build_input_data(padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


In [10]:
# Index matrix, one-hot labels, word -> index dict and index -> word list.
x, y, vocabulary, vocabulary_inv_list = load_data(sentence_df)

In [11]:
SEED = 42             # fixed so the shuffle, and hence the split, is reproducible
TRAIN_FRACTION = 0.9  # share of rows used for training; the remainder is the test split

# Index -> word lookup as a dict (vocabulary_inv_list holds the words in index order).
vocabulary_inv = dict(enumerate(vocabulary_inv_list))

# Collapse the two-column one-hot labels into class ids (0 or 1).
# The ndim guard lets this cell be re-run: argmax(axis=1) on labels that were
# already collapsed to 1-D would raise instead of being a no-op.
if y.ndim > 1:
    y = y.argmax(axis=1)

# Shuffle data with a dedicated, seeded generator so the global numpy RNG
# state is left untouched and a fresh kernel always yields the same split.
split_rng = np.random.RandomState(SEED)
shuffle_indices = split_rng.permutation(len(y))
x = x[shuffle_indices]
y = y[shuffle_indices]

train_len = int(len(x) * TRAIN_FRACTION)
x_train = x[:train_len]
y_train = y[:train_len]
x_test = x[train_len:]
y_test = y[train_len:]

In [12]:
# Report the split sizes and the vocabulary size.
for label, value in (
    ("x_train shape", x_train.shape),
    ("x_test shape", x_test.shape),
    ("vocab size", len(vocabulary_inv)),
):
    print(label, value)

('x_train shape', (2700, 81))
('x_test shape', (300, 81))
('vocab size', 5646)


In [13]:
# Number of columns in the padded index matrix; used as the model's input length.
sequence_length = x_test.shape[1]
# Placeholder only: the model cell below does not reference it.
embedding_weights = None

In [14]:
# Display the shape of the full index matrix (train and test rows together).
x.shape

(3000, 81)

In [15]:
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

In [16]:
# Model hyperparameters
embedding_dim = 50
dropout_prob = (0.5, 0.8)  # (rate after the embedding, rate after the merged conv features)
num_filters = 50
hidden_dims = 50
filter_sizes = (3, 4, 5)
# Training parameters
batch_size = 64
num_epochs = 20

# Input: one row of `sequence_length` word indices per sample.
input_shape = (sequence_length,)
model_input = Input(shape=input_shape)
z = Embedding(
    len(vocabulary_inv),
    embedding_dim,
    input_length=sequence_length,
    name="embedding",
)(model_input)
z = Dropout(dropout_prob[0])(z)

# Convolutional part: one conv -> max-pool -> flatten branch per kernel size,
# every branch reading the same embedded sequence.
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(
        filters=num_filters,
        kernel_size=sz,
        padding="valid",
        activation="relu",
        strides=1,
    )(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

# Concatenate needs at least two inputs; a single branch is used as-is.
if len(conv_blocks) > 1:
    z = Concatenate()(conv_blocks)
else:
    z = conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
# Single sigmoid unit to match the 0/1 class ids used as targets.
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
# NOTE: the held-out test split doubles as the validation set here.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    verbose=2,
)

Train on 2700 samples, validate on 300 samples
Epoch 1/20
3s - loss: 0.6939 - acc: 0.4889 - val_loss: 0.6927 - val_acc: 0.5000
Epoch 2/20
4s - loss: 0.6931 - acc: 0.5041 - val_loss: 0.6877 - val_acc: 0.5700
Epoch 3/20
3s - loss: 0.6353 - acc: 0.6678 - val_loss: 0.5798 - val_acc: 0.7267
Epoch 4/20
3s - loss: 0.3933 - acc: 0.8481 - val_loss: 0.5466 - val_acc: 0.7300
Epoch 5/20
3s - loss: 0.2600 - acc: 0.8978 - val_loss: 0.5310 - val_acc: 0.7733
Epoch 6/20
3s - loss: 0.1512 - acc: 0.9459 - val_loss: 0.5833 - val_acc: 0.7667
Epoch 7/20
3s - loss: 0.1195 - acc: 0.9578 - val_loss: 0.5819 - val_acc: 0.7800
Epoch 8/20
3s - loss: 0.0837 - acc: 0.9715 - val_loss: 0.6180 - val_acc: 0.7800
Epoch 9/20
3s - loss: 0.0630 - acc: 0.9822 - val_loss: 0.6395 - val_acc: 0.8033
Epoch 10/20
3s - loss: 0.0509 - acc: 0.9837 - val_loss: 0.6819 - val_acc: 0.7933
Epoch 11/20
3s - loss: 0.0405 - acc: 0.9863 - val_loss: 0.7191 - val_acc: 0.7667
Epoch 12/20
3s - loss: 0.0382 - acc: 0.9837 - val_loss: 0.7715 - val_ac

<keras.callbacks.History at 0x7f45411c5fd0>