In [35]:
import pandas as pd
import numpy as np
import re
import itertools
from collections import Counter

In [12]:
# Load the tab-separated training file; it has no header row, so pandas
# assigns integer column labels.
data = pd.read_csv(
    "Train_v1.tsv",
    sep="\t",
    header=None,
)

In [13]:
# Give the three positional columns readable names: a source tag, the 0/1
# label, and the statement text.
column_names = ["ff", "label", "stmt"]
data.columns = column_names

In [14]:
# Preview the first five rows to sanity-check the column assignment.
data.head(n=5)

Unnamed: 0,ff,label,stmt
0,TrainSen,0,@0430yes i hope youre lurking rn. i want to li...
1,TrainSen,0,05 really taught me a valuable lesson I'm neve...
2,TrainSen,0,"@098BERRY Never had a voice to protest, so you..."
3,TrainSen,0,@0hMySt4rs Rest in peace & love to you and you...
4,TrainSen,0,100 days until Christmas! 🌲 #too soon #not rea...


In [17]:
# Statements whose label is 1 (the positive class).
is_positive = data["label"] == 1
pos = data.loc[is_positive, "stmt"]

In [19]:
# Statements whose label is 0 (the negative class).
is_negative = data["label"] == 0
neg = data.loc[is_negative, "stmt"]

In [23]:
# Convert both Series to plain Python lists for the text pipeline below.
posl, negl = pos.tolist(), neg.tolist()

In [24]:
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. Replace every character that is not a letter, digit, parenthesis,
         comma, '!', '?', apostrophe or backtick with a space.
      2. Split common English contraction suffixes off the preceding word
         (e.g. "it's" -> "it 's", "can't" -> "ca n't").
      3. Surround ',', '!', '(', ')' and '?' with spaces so each becomes its
         own token when the result is split on whitespace.
      4. Collapse runs of whitespace, strip both ends and lower-case.

    Unlike the upstream version, '(', ')' and '?' are emitted as plain
    characters. The upstream replacement strings carried a backslash escape
    that ``re.sub`` copied into the output verbatim, so every such token
    came out prefixed with a literal backslash.

    Args:
        string: the raw text to clean (must be a ``str``).

    Returns:
        The cleaned, lower-cased string with tokens separated by single spaces.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contraction suffixes; applied sequentially in the original order.
    contraction_rules = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
    )
    for pattern, replacement in contraction_rules:
        string = re.sub(pattern, replacement, string)

    # Pad punctuation with spaces. A single pass is equivalent to one pass per
    # character because no replacement introduces another punctuation mark.
    string = re.sub(r"([,!()?])", r" \1 ", string)

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [32]:
# Positive statements first, then negative ones; each statement is cleaned
# and split on single spaces into a list of tokens.
x_text = [clean_str(sentence).split(" ") for sentence in posl + negl]

In [28]:
# Labels follow the same order as x_text: all positives (1), then all negatives (0).
positive_labels = [1] * len(posl)
negative_labels = [0] * len(negl)
y = np.concatenate([positive_labels, negative_labels])

In [33]:
# Sanity check: there must be exactly one label per tokenised statement.
(len(x_text), len(y))

(39780, 39780)

In [30]:
# Token count of the longest statement; every statement is padded to this length.
sequence_length = max(map(len, x_text))

In [34]:
# Right-pad every tokenised statement with "<PAD/>" up to sequence_length tokens.
padded_sentences = [
    tokens + ["<PAD/>"] * (sequence_length - len(tokens))
    for tokens in x_text
]

In [36]:
# Count every token (padding token included) over the padded corpus.
word_counts = Counter(itertools.chain.from_iterable(padded_sentences))
# Index -> word: the distinct tokens in sorted order.
vocabulary_inv = sorted(word_counts)
# Word -> index: the inverse lookup of vocabulary_inv.
vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))

In [37]:
# Map each token to its vocabulary index: one row of ids per padded statement.
x = np.array([
    [vocabulary[token] for token in tokens]
    for tokens in padded_sentences
])
# Keep the labels as a NumPy array alongside the features.
y = np.array(y)

In [56]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential, model_from_json
from keras.layers.core import Dropout, Dense, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
# from data_helpers import load_data

In [59]:
# One-hot encode the labels here (0 -> [1, 0], anything else -> [0, 1]) instead
# of reading `yy`: that name is only assigned in a later cell, so using it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
y_onehot = np.eye(2, dtype=int)[(np.asarray(y) != 0).astype(int)]

# Hold out 15% of the samples for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y_onehot, test_size=0.15, random_state=42
)

In [57]:
# --- Dimensions derived from the prepared data ---
sequence_length = x.shape[1]            # tokens per padded statement
vocabulary_size = len(vocabulary_inv)   # number of distinct tokens

# --- Embedding / convolution settings ---
embedding_dim = 256                     # size of each token embedding
filter_sizes = [3, 4, 5]                # heights of the Conv2D kernels
num_filters = 512                       # filters per convolution
drop = 0.5                              # rate passed to the Dropout layer

# --- Layer width used by the Sequential Conv1D/LSTM/Dense model ---
hidden_units = 3

# --- Training settings passed to model.fit ---
epochs = 10
batch_size = 16

In [58]:
# One-hot encode the labels: 0 -> [1, 0], anything else -> [0, 1].
yy = [[1, 0] if label == 0 else [0, 1] for label in y]

In [None]:
# Embedding -> 2x Conv1D -> 2x LSTM -> Dense classifier built with the
# Sequential API. NOTE: the following cell rebinds `model` to a different
# (functional-API) network, so this one is only used if that cell is skipped.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=sequence_length, embeddings_initializer='glorot_normal'))
# The Conv1D layers previously carried input_shape=(sequence_length, 1) and
# input_shape=(sequence_length - 2, 1). Those did not describe the real input
# (the embedding output has embedding_dim channels, not 1) and are not needed:
# in a Sequential model every layer after the first takes its input shape from
# the layer before it.
model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='sigmoid'))
# model.add(MaxPooling1D(pool_size=3))
model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='sigmoid'))
# model.add(MaxPooling1D(pool_size=3))
# model.add(Dropout(0.25))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5, return_sequences=True))
model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
# Two-way softmax output, matching the [1, 0] / [0, 1] one-hot labels.
model.add(Dense(2))
model.add(Activation('softmax'))

In [50]:
# Multi-width CNN text classifier (functional API).
#
# The output layer used to be Dense(units=1, activation='softmax'). Softmax
# normalises across the units of the layer, so a single unit always outputs
# 1.0 regardless of the input -- the likely cause of the "very less accuracy"
# originally noted on this cell. The output is now a 2-way softmax, which
# matches the one-hot labels ([1, 0] / [0, 1]) fed to the train/test split.
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
# Add a trailing channel axis so Conv2D sees (sequence_length, embedding_dim, 1).
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

# One convolution branch per entry in filter_sizes; each kernel spans the
# full embedding width, so it slides along the token axis only.
conv_branches = [
    Conv2D(num_filters, kernel_size=(size, embedding_dim), padding='valid',
           kernel_initializer='normal', activation='relu')(reshape)
    for size in filter_sizes
]

# With 'valid' padding a branch has sequence_length - size + 1 positions; the
# pool window covers all of them, leaving one value per filter.
pooled_branches = [
    MaxPool2D(pool_size=(sequence_length - size + 1, 1), strides=(1, 1), padding='valid')(conv)
    for size, conv in zip(filter_sizes, conv_branches)
]

concatenated_tensor = Concatenate(axis=1)(pooled_branches)
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

# Build the model that maps the token-id input to the class probabilities.
model = Model(inputs=inputs, outputs=output)

# Save weights whenever the monitored validation accuracy improves.
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

Creating Model...


In [51]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) of whichever network `model` currently refers to.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 47)            0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 47, 256)       9091584     input_2[0][0]                    
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 47, 256, 1)    0           embedding_2[0][0]                
____________________________________________________________________________________________________
conv2d_4 (Conv2D)                (None, 45, 1, 512)    393728      reshape_2[0][0]                  
___________________________________________________________________________________________

In [52]:
# Configure training: the Adam optimizer created with the model, binary
# cross-entropy as the loss, and accuracy as the reported metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [55]:
print("Traning Model...")
# Start training: fit on the training split, evaluate on the held-out split,
# and pass the checkpoint callback to Keras.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[checkpoint],
    validation_data=(X_test, y_test),
)

Traning Model...
Train on 33813 samples, validate on 5967 samples
Epoch 1/10

KeyboardInterrupt: 