In [1]:
import tensorflow as tf
import keras

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell #Show all consecutive outputs
InteractiveShell.ast_node_interactivity = "all"

#Removes all unnecessary warnings by Python
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

In [31]:
import re
def clean_str(string):
    string = re.sub(r"[^A-Za-z']", " ", string)
    string = re.sub(r"\'s", "\'s", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", "\'d", string)
    string = re.sub(r"\'ll", "will", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", "", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [40]:
sentence_df=pd.read_csv("sentence_df.csv")

In [45]:
sentence_df.head()

Unnamed: 0,Sentence,sentiment,source
0,"A very, very, very slow-moving, aimless movie ...",0,imdb
1,Not sure who was more lost - the flat characte...,0,imdb
2,Attempting artiness with black & white and cle...,0,imdb
3,Very little music or anything to speak of.,0,imdb
4,The best scene in the movie was when Gerardo i...,1,imdb


In [46]:
sentence_df['Sentence']=sentence_df['Sentence'].apply(lambda x: clean_str(x))

In [47]:
sentence_df.head()

Unnamed: 0,Sentence,sentiment,source
0,a very very very slow moving aimless movie abo...,0,imdb
1,not sure who was more lost the flat characters...,0,imdb
2,attempting artiness with black white and cleve...,0,imdb
3,very little music or anything to speak of,0,imdb
4,the best scene in the movie was when gerardo i...,1,imdb


In [56]:
max_sequence_length=max([len(x.split(" ")) for x in sentence_df['Sentence']])

In [58]:
print max_sequence_length

73


In [74]:
def load_data_and_labels(sentence_df):
    # Load data from files
    positive_examples = list(sentence_df[sentence_df['sentiment']==1]['Sentence'])
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(sentence_df[sentence_df['sentiment']==0]['Sentence'])
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [75]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences

In [76]:
from collections import Counter
import itertools
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

In [77]:
def build_input_data(sentences, labels, vocabulary):
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [78]:
def load_data(sentence_df):
    # Load and preprocess data
    sentences, labels = load_data_and_labels(sentence_df)
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


In [79]:
x, y, vocabulary, vocabulary_inv_list =load_data(sentence_df)

In [84]:
vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
y = y.argmax(axis=1)
# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x = x[shuffle_indices]
y = y[shuffle_indices]
train_len = int(len(x) * 0.9)
x_train = x[:train_len]
y_train = y[:train_len]
x_test = x[train_len:]
y_test = y[train_len:]

In [85]:
print("x_train shape",x_train.shape)
print("x_test shape",x_test.shape)
print("vocab size",len(vocabulary_inv))

('x_train shape', (2700, 73))
('x_test shape', (300, 73))
('vocab size', 5608)


In [86]:
sequence_length=x_test.shape[1]
embedding_weights=None

In [90]:
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding

In [93]:
embedding_dim=300
input_shape = (sequence_length,)
model_input = Input(shape=input_shape)
z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
z=Convolution1D(64, 3, border_mode='same')(z)
z=max_po
z=Convolution1D(32, 3, border_mode='same')(z)
z=Convolution1D(16, 3, border_mode='same')(z)
z=Flatten()(z)
z=Dropout(0.2)(z)
z=Dense(180,activation='sigmoid')(z)
z=Dropout(0.2)(z)
model_output=Dense(1,activation='sigmoid')(z)
model = Model(model_input, model_output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=64, epochs=5,validation_data=(x_test, y_test), verbose=2)

Train on 2700 samples, validate on 300 samples
Epoch 1/20
5s - loss: 0.7092 - acc: 0.4922 - val_loss: 0.6188 - val_acc: 0.6933
Epoch 2/20
5s - loss: 0.4177 - acc: 0.8141 - val_loss: 0.4555 - val_acc: 0.7800
Epoch 3/20
5s - loss: 0.1363 - acc: 0.9570 - val_loss: 0.5939 - val_acc: 0.7867
Epoch 4/20
5s - loss: 0.0708 - acc: 0.9774 - val_loss: 0.7119 - val_acc: 0.7867
Epoch 5/20
5s - loss: 0.0471 - acc: 0.9867 - val_loss: 0.7522 - val_acc: 0.7733
Epoch 6/20
5s - loss: 0.0252 - acc: 0.9941 - val_loss: 0.9133 - val_acc: 0.7733
Epoch 7/20
5s - loss: 0.0141 - acc: 0.9967 - val_loss: 1.0294 - val_acc: 0.7867
Epoch 8/20
5s - loss: 0.0101 - acc: 0.9981 - val_loss: 1.0682 - val_acc: 0.7800
Epoch 9/20
6s - loss: 0.0073 - acc: 0.9993 - val_loss: 1.0983 - val_acc: 0.7867
Epoch 10/20
5s - loss: 0.0061 - acc: 0.9993 - val_loss: 1.1288 - val_acc: 0.7900
Epoch 11/20
5s - loss: 0.0058 - acc: 0.9993 - val_loss: 1.1470 - val_acc: 0.7833
Epoch 12/20
5s - loss: 0.0062 - acc: 0.9993 - val_loss: 1.1650 - val_ac

<keras.callbacks.History at 0x7f146c5abd90>