In [73]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, GRU, Dropout
from keras.initializers import Constant
import os
import sys

Using TensorFlow backend.


In [74]:
# Load the competition CSVs. The training frame is indexed by tweet id, while
# the test frame keeps `id` as an ordinary column.
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv", index_col="id")

In [75]:
# keyword/location are not fed to the model, so strip them from both frames.
unused_columns = ['keyword', 'location']
train = train.drop(columns=unused_columns)
X_test = test.drop(columns=unused_columns)

In [76]:
# Pull the tweet text (features) and the label column out as NumPy arrays.
# No hold-out split is made in this cell: X_test is the competition test text.
X_train, y_train = train['text'].values, train['target'].values
X_test = X_test['text'].values

In [10]:
# Build the word -> integer id vocabulary from the training tweets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [11]:
# Turn each tweet into its list of vocabulary ids; both splits go through the
# same fitted tokenizer.
X_train, X_test = (tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test))

In [13]:
#maximum length of a tweet
# NOTE(review): 280 is presumably the tweet *character* limit, but this value is
# passed to pad_sequences as maxlen, which counts tokens per sequence -- confirm
# the intended sequence length.
sentence_len = 280

In [14]:
# Bring every id sequence to exactly sentence_len entries so that both arrays
# are rectangular.
X_train = pad_sequences(sequences=X_train, maxlen=sentence_len)
X_test = pad_sequences(sequences=X_test, maxlen=sentence_len)

In [27]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line is whitespace-separated: the first field is the token and the
# remaining fields are its coefficients.
embeddings_index = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('glove.42B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1917494 word vectors.


In [28]:
# Vocabulary learned by the tokenizer (word -> integer id) and its size.
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print('Found %s unique tokens.' % n_unique_tokens)

Found 21038 unique tokens.


In [29]:
# Width of one word vector; used for the embedding matrix columns and the
# Embedding layer output size. Presumably matches the "300d" GloVe file loaded
# above -- confirm if a different embedding file is used.
EMBEDDING_DIM = 300

In [30]:
# One row per vocabulary id (plus row 0, which no word_index entry fills here),
# one column per embedding dimension. Rows start as zeros and are overwritten
# only for words that have a pre-trained vector.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # no pre-trained vector for this word: leave its row all-zero
        continue
    embedding_matrix[row] = vector

In [72]:
# Create a model with embeddings:
#   frozen pre-trained Embedding -> GRU(128) -> single sigmoid unit.
model = Sequential(name="emb_model")
# The embedding weights come from embedding_matrix and are not updated during
# training (trainable=False).
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=sentence_len,
                            trainable=False))
model.add(GRU(128))
# binary_crossentropy (with its default settings) expects probabilities in
# [0, 1], and the predictions are later thresholded at 0.5, so the output unit
# needs a sigmoid; a bare Dense(1) would emit unbounded raw scores.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the summaries of the model with embeddings
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 300)           6311700   
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               164736    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 6,476,565
Trainable params: 164,865
Non-trainable params: 6,311,700
_________________________________________________________________


In [73]:
# Hold out a stratified 10% of the labelled training data for validation.
# y_test is never defined in this notebook and X_test is the competition test
# text, so (X_test, y_test) cannot serve as validation data. X_train/y_train are
# left untouched so this cell can be re-run safely.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=21, stratify=y_train)

# Train the model, iterating on the data in batches of 32 samples
model.fit(X_fit, y_fit, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Train on 6851 samples, validate on 762 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ade0258308>

In [74]:
#print("Loss: {0}\nAccuracy: {1}".format(*model.evaluate(X_test, y_test, verbose=0)))

Loss: 1.445254040515329
Accuracy: 0.6692913390520051


In [75]:
 # Run the fitted model over the padded test sequences and keep its raw outputs.
 pred_probabilities = model.predict(X_test)

In [76]:
# Convert the model outputs to hard 0/1 labels with a 0.5 cut-off.
# One vectorised comparison replaces np.matrix (no longer recommended by NumPy)
# and the two in-place masked assignments, and yields integer labels rather
# than 0.0/1.0 floats.
y_pred = (np.asarray(pred_probabilities) > 0.5).astype(int)

In [None]:
#save predictions
# Build the submission frame with an explicitly named integer `target` column.
# Wrapping the predictions in a bare DataFrame would name the column `0`, so the
# CSV header would read "id,0".
# np.asarray(...).ravel() flattens an (n, 1) array or matrix to one label per row.
labels = np.asarray(y_pred).ravel().astype(int)
df = pd.DataFrame({'id': test['id'].values, 'target': labels}).set_index('id')

In [None]:
# Write the submission; the `id` index is emitted as the first CSV column.
df.to_csv("submission_two.csv", index=True)

In [None]:
#submit
#kaggle competitions submit -c nlp-getting-started -f submission_two.csv -m "2ndsubmission"