In [30]:
#import the relevant packages
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, GRU, Dropout
from keras.initializers import Constant

In [41]:
# Load the training and test sets from the working directory.
# The training frame is indexed by its 'id' column; the test frame keeps 'id'
# as a regular column.
train = pd.read_csv(filepath_or_buffer="train.csv", index_col="id")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [42]:
# Drop the metadata columns that are not used as model input.
# errors='ignore' keeps this cell re-runnable: `train` is reassigned from
# itself, so a second run would otherwise raise KeyError for columns that
# have already been removed.
unused_columns = ['keyword', 'location']
train = train.drop(columns=unused_columns, errors='ignore')
X_test = test.drop(columns=unused_columns, errors='ignore')

In [43]:
# Create the feature and target arrays.
# X_test is taken straight from `test` instead of from the previous value of
# X_test, so re-running this cell does not try to index an ndarray with
# 'text'.
X_train = train['text'].values
y_train = train['target'].values
X_test = test['text'].values

In [44]:
# Build the vocabulary from the training text only; the same tokenizer is
# later used to encode both the training and the test text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [45]:
# Convert each text into its sequence of integer word indices.
# Encode from the raw text columns rather than from X_train / X_test
# themselves: those names are overwritten here, so re-running the cell would
# otherwise feed already-encoded sequences back into the tokenizer.
X_train = tokenizer.texts_to_sequences(train['text'].values)
X_test = tokenizer.texts_to_sequences(test['text'].values)

In [46]:
# Bring every sequence to the same fixed length so the inputs form a
# rectangular array.
# 280 is the maximum length of a tweet.
# NOTE(review): that limit is presumably counted in characters, while these
# sequences hold one index per token, so real sequences are likely much
# shorter than 280 - confirm whether a smaller length was intended.
sentence_len = 280
X_train, X_test = (
    pad_sequences(sequences, maxlen=sentence_len)
    for sequences in (X_train, X_test)
)

In [47]:
# Dimensionality of each word vector; passed to the Embedding layer as its
# output size when the model is built.
wordvec_dim = 300

In [19]:
# Vocabulary learned by the tokenizer: a mapping from each word to its
# integer index. Its size determines the number of embedding rows.
word_index = tokenizer.word_index
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

Found 22700 unique tokens.


In [20]:
# Baseline model trained from scratch (no transfer learning / pretrained
# word vectors): Embedding -> 2 x LSTM -> Dense -> Dropout -> sigmoid output.
model = Sequential()
# One embedding row per vocabulary word, plus one extra row for index 0,
# which pad_sequences uses as the padding value.
model.add(Embedding(len(word_index) + 1, wordvec_dim, trainable=True, input_length=sentence_len))
# The first LSTM returns the whole sequence so the second LSTM can consume
# it; the second returns only its final output.
model.add(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.15))
model.add(LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.15))
# The hidden layer needs a non-linearity: with Dense's default (linear)
# activation, Dense(16) followed by Dense(1) is equivalent at inference time
# to a single linear layer and adds no modelling capacity.
model.add(Dense(16, activation='relu'))
model.add(Dropout(rate=0.25))
# Single sigmoid unit: probability of the positive class (binary target).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [21]:
# Fit on the whole training set (no validation data is held out here):
# 3 epochs with mini-batches of 32 samples.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=3)




Epoch 1/3





Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1646dcaf6c8>

In [49]:
# Predict one probability per test row.
pred_probabilities = model.predict(X_test)
# Convert probabilities to class labels: 1 when the probability is above 0.5,
# otherwise 0. A single vectorised comparison replaces the np.matrix copy and
# the two in-place masked assignments (NumPy no longer recommends np.matrix),
# and the integer cast yields 0/1 labels instead of 0.0/1.0.
y_pred = (np.asarray(pred_probabilities) > 0.5).astype(int)

In [62]:
# Assemble the submission table: the test ids side by side with the
# predicted labels.
pred = pd.DataFrame(data=y_pred)
ids = pd.DataFrame(data=test[['id']])
# Column-wise concatenation; rows are matched on their index labels.
df = pd.concat(objs=[ids, pred], axis=1)

In [73]:
# Same concat as in the previous cell: rebuilds df from ids and pred.
# NOTE(review): redundant on a straight top-to-bottom run - presumably kept
# to reset df before re-running the cells below, which change df; confirm
# and remove if not needed.
df = pd.concat([ids, pred], axis = 1)

In [74]:
# Name the two columns for the submission file; the prediction column was
# built without an explicit column name, so it only has a default label.
df.columns = ['id', 'target']

In [76]:
# Make 'id' the index so the CSV export writes the ids as its first column
# instead of an extra row-number column.
# Reassign rather than use inplace=True: this keeps the step a plain
# expression and avoids mutating the frame built in the earlier cell.
df = df.set_index('id')

In [77]:
# Write the submission file to the working directory; the index (set to
# 'id' in the previous cell) is written as the first column.
df.to_csv("submission_three.csv")

In [None]:
#submit with the Kaggle CLI (shell command, run outside Python):
#kaggle competitions submit -c nlp-getting-started -f submission_three.csv -m "2ndsubmission"