In [14]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import pickle

from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, LSTM, Dropout, Dense, Reshape, Concatenate
from keras.optimizers import Adam

##### Load data and select only tweets with positive and negative sentiment: 

In [15]:
# Load the raw tweets and keep only the three columns used below:
# the tweet text, the airline (categorical feature) and the sentiment label.
data = pd.read_csv('twitter-airline-sentiment.csv')
data = data[['text','airline','airline_sentiment']]

# Keep rows whose label is exactly 'positive' or 'negative'.
# `isin` is an exact match (no regex / substring semantics) and treats a
# missing label as "not selected" instead of producing a NaN in the mask.
# The index is reset so that labels equal positions 0..n-1 afterwards.
data = data[data['airline_sentiment'].isin(['positive', 'negative'])].reset_index(drop=True)

##### Train / validation split:

In [16]:
p_train = 0.8 # proportion in train.
split_seed = 42 # fixed seed so the train / valid partition is identical on every run.

# Use a dedicated, seeded generator so the split does not depend on (or
# disturb) the global NumPy random state. Note: this only fixes the split;
# the Keras weight initialisation further down is not seeded here.
split_rng = np.random.RandomState(split_seed)

# Draw row positions for the training set without replacement.
inTrain = split_rng.choice(len(data), int(p_train*len(data)), replace=False)
train = data.iloc[inTrain].reset_index(drop=True)
# `drop` works on index labels; this matches the positions drawn above
# because `data` carries a freshly reset 0..n-1 index.
valid = data.drop(inTrain).reset_index(drop=True)

# The two parts must form a partition of `data`.
assert len(train) + len(valid) == len(data)

##### Tokenizer for text feature:
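Determine the `vocab_size` most frequent words in the training tweets and replace each occurrence with an integer index. When the tokenizer is applied later on, each tweet is cut or zero-padded to exactly `seq_length` tokens, keeping the first `seq_length` in-vocabulary words. An n-dimensional embedding of these indices is estimated later as part of the model.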

In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text-branch hyper-parameters:
#   vocab_size - passed to the tokenizer as `num_words` and to the word
#                Embedding as `input_dim` (per the Keras docs the tokenizer
#                then keeps only the most frequent words).
#   seq_length - fixed number of tokens per tweet after padding / truncation.
vocab_size, seq_length = 5000, 12

# Learn the word -> index mapping from the training tweets only, so that no
# vocabulary information leaks in from the validation set.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train['text'].values)

##### Encoder for categorical feature:
Replace each categorical level with a number. Again, an n-dimensional embedding is estimated later as part of the model.

In [18]:
from sklearn.preprocessing import LabelEncoder

# Learn the airline -> integer code mapping from the training rows
# (`fit` returns the fitted encoder itself, so construction and fitting chain).
encoder = LabelEncoder().fit(train['airline'])

# Number of distinct airlines seen in training; used as the `input_dim`
# of the categorical Embedding layer.
num_classes = len(encoder.classes_)

##### Apply pre-processing steps (i.e. tokenizer and encoder) to the train and validation sets:

In [19]:
# --- Text input (LSTM branch) ------------------------------------------------
# Map every training tweet to its list of word indices, then force each list
# to exactly `seq_length` entries: longer tweets lose their tail
# (truncating='post'), shorter ones are padded at the front (padding='pre').
pad_options = dict(maxlen=seq_length, padding='pre', truncating='post')
sequences = tokenizer.texts_to_sequences(train['text'].values)
X_train_LSTM = np.array(pad_sequences(sequences, **pad_options))

# --- Categorical input (FC branch) -------------------------------------------
# Integer code of the airline for every training row.
X_train_FC = encoder.transform(train['airline'])

# --- Target ------------------------------------------------------------------
# 1 where the sentiment label contains 'negative', 0 otherwise.
is_negative = train['airline_sentiment'].str.contains('negative')
y_train = is_negative.astype(int).values

In [20]:
# Sanity check: count the training rows whose token indices sum to zero,
# i.e. rows that contain nothing but zeros after padding.
n_empty_rows = (X_train_LSTM.sum(axis=1) == 0).sum()
print('{} of {} rows do not contain at least one of the words in the vocabulary.'.format(
    n_empty_rows, len(X_train_LSTM)))

0 of 9232 rows do not contain at least one of the words in the vocabulary.


In [21]:
# Apply exactly the same pre-processing to the validation rows, reusing the
# tokenizer and encoder that were fitted on the training data.

# --- Text input (LSTM branch) ------------------------------------------------
pad_options = dict(maxlen=seq_length, padding='pre', truncating='post')
sequences = tokenizer.texts_to_sequences(valid['text'].values)
X_valid_LSTM = np.array(pad_sequences(sequences, **pad_options))

# --- Categorical input (FC branch) -------------------------------------------
X_valid_FC = encoder.transform(valid['airline'])

# --- Target ------------------------------------------------------------------
# 1 where the sentiment label contains 'negative', 0 otherwise.
is_negative = valid['airline_sentiment'].str.contains('negative')
y_valid = is_negative.astype(int).values

##### Specify model:
The model contains two components that are concatenated. The first component consists of an Embedding layer, an LSTM layer and a Dropout layer, and receives the tokenized text as its input. The second component consists of an Embedding layer (flattened by a Reshape layer) and a Dropout layer, and receives the encoded categorical data as its input. After concatenation, a Dense layer maps the combined output of both components to the final target (i.e. sentiment: positive / negative).

In [22]:
# Text branch: word indices -> word embeddings -> LSTM -> dropout.
LSTM_embed_dim = 32  # size of each learned word vector

# One row of `seq_length` word indices per tweet.
LSTM_input = Input(name='LSTM_input', shape=(seq_length,))

# Look up a trainable `LSTM_embed_dim`-dimensional vector for every index.
LSTM_embed = Embedding(
    name='LSTM_embed',
    input_dim=vocab_size,
    output_dim=LSTM_embed_dim,
    input_length=seq_length,
)(LSTM_input)

# Summarise the embedded sequence with a 64-unit LSTM, then regularise
# by dropping half of the units during training.
LSTM_layer = LSTM(units=64, name='LSTM_output')(LSTM_embed)
LSTM_regul = Dropout(0.5, name='LSTM_dropout')(LSTM_layer)

In [23]:
# Categorical branch: airline code -> learned embedding -> flatten -> dropout.
FC_embed_dim = 3  # size of the learned vector per airline

# A single integer (the encoded airline) per sample.
FC_input = Input(shape=(1,), name='FC_input')

# The embedding width is taken from `FC_embed_dim` (it used to be a
# hard-coded 3 here and in the Reshape), so changing the constant above
# now actually changes the model, as it does for the LSTM branch.
FC_embed = Embedding(input_dim=num_classes, input_length=1,
                     output_dim=FC_embed_dim, name='FC_embed')(FC_input)

# Drop the length-1 sequence axis produced by the Embedding layer so the
# result can be concatenated with the LSTM branch output.
FC_layer = Reshape(target_shape=(FC_embed_dim,), name='FC_output')(FC_embed)
FC_regul = Dropout(rate=0.5, name='FC_dropout')(FC_layer)

In [24]:
# Join the regularised outputs of the text and categorical branches.
branch_outputs = [LSTM_regul, FC_regul]
full_merge = Concatenate()(branch_outputs)

# A single sigmoid unit turns the merged features into one value per sample,
# matching the 0/1 sentiment target.
full_dense = Dense(units=1, activation='sigmoid', name='full_dense')(full_merge)

# Two-input model: the input order here fixes the order in which the
# arrays must be passed when fitting.
full_model = Model(inputs=[LSTM_input, FC_input], outputs=full_dense)
full_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
LSTM_input (InputLayer)         (None, 12)           0                                            
__________________________________________________________________________________________________
FC_input (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
LSTM_embed (Embedding)          (None, 12, 32)       160000      LSTM_input[0][0]                 
__________________________________________________________________________________________________
FC_embed (Embedding)            (None, 1, 3)         18          FC_input[0][0]                   
__________________________________________________________________________________________________
LSTM_outpu

##### Model compilation:

In [25]:
# Binary target -> binary cross-entropy loss; Adam with its default settings;
# accuracy is reported next to the loss during training.
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
full_model.compile(**compile_options)

##### Model fitting:

In [26]:
# Inputs are passed in the same order as the model's inputs: text first,
# then the encoded airline.
train_inputs = [X_train_LSTM, X_train_FC]
valid_inputs = [X_valid_LSTM, X_valid_FC]

# Train for 10 epochs in mini-batches of 64 and evaluate on the validation
# set after every epoch. The call is left as the last expression so the
# returned History object is shown as the cell output.
full_model.fit(
    train_inputs,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(valid_inputs, y_valid),
)

Train on 9232 samples, validate on 2309 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f52244f7a90>