In [64]:
#prepare data and class variables
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import one_hot, text_to_word_sequence
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
%matplotlib inline

tweets_df = pd.read_csv('../tweets.csv')

print('positive', len(tweets_df[tweets_df['airline_sentiment'] == 'positive'])/len(tweets_df))
print('neutral', len(tweets_df[tweets_df['airline_sentiment'] == 'neutral'])/len(tweets_df))
print('negative', len(tweets_df[tweets_df['airline_sentiment'] == 'negative'])/len(tweets_df))

tweets_df[tweets_df['airline_sentiment'] == 'positive']['airline_sentiment'] = 0
tweets_df[tweets_df['airline_sentiment'] == 'neutral']['airline_sentiment'] = 1
tweets_df[tweets_df['airline_sentiment'] == 'positive']['airline_sentiment'] = 2

X = tweets_df['text'].tolist()
y = tweets_df['airline_sentiment'].tolist()

encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)
dummy_y = np_utils.to_categorical(encoded_Y)
y = dummy_y

vocab_size = 0
for entry in X:
    words = set(text_to_word_sequence(entry))
    vocab_size += len(words)
    
X = [one_hot(entry, round(vocab_size*1.3)) for entry in X]

X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, random_state=42)

X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


positive 0.16140710382513662
neutral 0.21168032786885246
negative 0.6269125683060109


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [65]:
#modeling

from keras.models import Sequential, Input, Model
from keras.layers import Dense
from keras.layers import SimpleRNN, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBED_SIZE = 50
rnns = []
input_holder = Input(shape=(X_train.shape[1], ))
input_embed = Embedding(vocab_size,
                       50,
                       input_length=500)(input_holder)
x = SimpleRNN(25, dropout=0.2, recurrent_dropout=0.2)(input_embed)
x = Dense(3, activation='sigmoid')(x)
rnn=Model(inputs=input_holder,outputs=x)
rnn.compile(loss='binary_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
rnn.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 500)               0         
_________________________________________________________________
embedding_17 (Embedding)     (None, 500, 50)           12416200  
_________________________________________________________________
simple_rnn_16 (SimpleRNN)    (None, 25)                1900      
_________________________________________________________________
dense_27 (Dense)             (None, 3)                 78        
Total params: 12,418,178
Trainable params: 12,418,178
Non-trainable params: 0
_________________________________________________________________


In [66]:
input_holder = Input(shape=(X_train.shape[1], ))
input_embed = Embedding(vocab_size,
                       50,
                       input_length=500)(input_holder)
for func in [SimpleRNN, LSTM, GRU]:
    
    x = func(50, dropout=0.2, recurrent_dropout=0.2)(input_embed)
    x = Dense(3, activation='sigmoid')(x)
    rnn=Model(inputs=input_holder, outputs=x)
    rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(rnn.summary())
    rnns.append(rnn)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        (None, 500)               0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 500, 50)           12416200  
_________________________________________________________________
simple_rnn_17 (SimpleRNN)    (None, 50)                5050      
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 153       
Total params: 12,421,403
Trainable params: 12,421,403
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        (None, 500)               0         
___________________________________________________________

In [None]:
for rnn, name in zip(rnns,['simple','lstm','gru']):
    print('=======',name,'========')
    rnn.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

Train on 10980 samples, validate on 3660 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20