In [18]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer, Input, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# convert train and test datasets to vectors
path = '/content/drive/MyDrive/Colab Notebooks/Twitter Disaster/nlp-getting-started/'
train = pd.read_csv(path + 'train.csv')
trainX, trainY = train['text'], train['target']
test = pd.read_csv(path + 'test.csv')
testX, id = test['text'], test['id']

In [20]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(trainX)
tokens = tokenizer.word_index

In [21]:
inp_len = len(max(trainX, key=lambda x: len(x))) + 1
glovelen = 100
print(inp_len)

158


In [22]:
emb_layer = tf.keras.layers.Embedding(input_dim=len(tokens)+1, output_dim=glovelen, input_length=inp_len)

In [23]:
def model(inps):
  inp = Input(inps)
  X = emb_layer(inp)
  X = LSTM(128, return_sequences=True)(X)
  X = Dropout(0.5)(X)
  X = GlobalAveragePooling1D()(X)
  X = BatchNormalization()(X)
  X = Dense(100, activation='relu', kernel_regularizer='l2')(X)
  X = Dense(128, activation='relu', kernel_regularizer='l2')(X)
  out = Dense(1, activation='sigmoid')(X)
  return Model(inputs=inp, outputs=out)

def SimpleModel(inp_len):
  model = Sequential([
                      Dense(150, input_shape=inp_len, activation='relu'),
                      Dense(150, activation='relu'),
                      Dense(1, activation='sigmoid'),
  ])
  return model

In [24]:
test = tokenizer.texts_to_sequences(testX)
test = tf.keras.preprocessing.sequence.pad_sequences(test, maxlen=inp_len, padding='post')
train = tokenizer.texts_to_sequences(trainX)
train = tf.keras.preprocessing.sequence.pad_sequences(train, maxlen=inp_len, padding='post')
mod = model((inp_len,))
mod.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics = ['accuracy'])


In [25]:
trainY = np.array(trainY).astype(np.float64)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=2, restore_best_weights=True)
mod.fit(train, trainY, batch_size=32, validation_split = 0.1, epochs = 10, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7f90b171c510>

In [26]:
pred = mod.predict(test)

In [29]:
print(test.shape)
print(pred.shape)
print(pred)
pred = np.around(pred)
pred = pred.astype(np.int64)
pred = pd.DataFrame(pred, columns=['target'])

(3263, 158)
(3263, 1)
      target
0        0.0
1        1.0
2        1.0
3        1.0
4        1.0
...      ...
3258     0.0
3259     1.0
3260     1.0
3261     1.0
3262     1.0

[3263 rows x 1 columns]


In [30]:
df = pd.concat([id, pred], axis=1)
df.to_csv('/content/submission.csv', index=False)
print(testX[:5])
pred.head()

0                   Just happened a terrible car crash
1    Heard about #earthquake is different cities, s...
2    there is a forest fire at spot pond, geese are...
3             Apocalypse lighting. #Spokane #wildfires
4        Typhoon Soudelor kills 28 in China and Taiwan
Name: text, dtype: object


Unnamed: 0,target
0,0
1,1
2,1
3,1
4,1
