In [1]:
import numpy as np 
import pandas as pd 

In [2]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, GRU
from keras.layers import Dense, Embedding, Bidirectional, Dropout, Flatten
from keras.optimizers import Adam, SGD
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
train = pd.read_csv('Rtrain.csv')
test = pd.read_csv('Rtest.csv')

In [4]:
train = train.drop(['id', 'keyword', 'location'], axis=1)
test = test.drop(['id', 'keyword', 'location'], axis=1)

In [5]:
y_train =  train['target'].values
X_train = train.drop(['target'], axis=1).values.reshape(len(train),)
X_test = test['text'].values.reshape(len(test),)

In [6]:
total_tweets = np.concatenate((X_train, X_test))
print('Total tweets : ', len(total_tweets))

Total tweets :  10876


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_tweets)

vocab_size = len(tokenizer.word_index) + 1

In [8]:
maxlen = max(len(x.split()) for x in total_tweets)

In [12]:
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [11]:
X_train_pad = pad_sequences(X_train_token, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_token, maxlen=maxlen, padding='post')

In [14]:
hidden_units = 128
embed_units = 100

model = Sequential()
model.add(Embedding(vocab_size, embed_units, input_length = maxlen))
model.add(Bidirectional(LSTM(hidden_units)))
model.add(Dropout(0.2))
#model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 100)           2932000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 3,232,545
Trainable params: 3,232,545
Non-trainable params: 0
______________________________________________

In [15]:
learning_rate = 0.0001

model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [16]:
batch_size = 512
num_itr = 5

model_history = model.fit(X_train_pad, y_train, 
                          batch_size=batch_size, 
                          epochs=num_itr, 
                          validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
pred = model.predict(X_test_pad)
print (pred)

[[0.30806148]
 [0.52412975]
 [0.9922689 ]
 ...
 [0.9779618 ]
 [0.99191785]
 [0.98191667]]
