In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a bare `!pip` may resolve to a different interpreter).
%pip install -q -U tensorflow-text
%pip install -q tf-models-official


[K     |████████████████████████████████| 4.4 MB 5.5 MB/s 
[K     |████████████████████████████████| 1.8 MB 4.2 MB/s 
[K     |████████████████████████████████| 90 kB 3.8 MB/s 
[K     |████████████████████████████████| 1.2 MB 21.4 MB/s 
[K     |████████████████████████████████| 636 kB 12.5 MB/s 
[K     |████████████████████████████████| 99 kB 7.5 MB/s 
[K     |████████████████████████████████| 211 kB 10.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 30.4 MB/s 
[K     |████████████████████████████████| 352 kB 37.8 MB/s 
[K     |████████████████████████████████| 37.1 MB 46 kB/s 
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled training data; 'train.csv' is resolved relative to the working directory.
df_IN = pd.read_csv('train.csv')

In [None]:
df_IN.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_IN['text'], df_IN['target'], test_size=0.2, random_state=42)

# Pair each tweet text with its target label.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

BATCH_SIZE = 32
# A buffer smaller than the dataset only shuffles locally (elements can move at
# most ~buffer_size positions), so every epoch would see almost the same batches.
# Size the buffer to the whole training split to get a full reshuffle per epoch.
SHUFFLE_BUFFER_SIZE = len(X_train)

# Prefetch lets input preparation overlap with the training step.
train_ds = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
# Validation data is batched in its original order (no shuffling).
val_ds = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
def build_classifier_model(
    preprocess_handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    encoder_handle='https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    dropout_rate=0.1):
  """Build a BERT-based binary text classifier that outputs raw logits.

  The model takes a batch of raw strings, runs them through a TF Hub
  preprocessing layer and a trainable TF Hub encoder, and feeds the encoder's
  `pooled_output` through dropout into a single-unit Dense layer.

  Args:
    preprocess_handle: TF Hub handle of the text preprocessing model.
    encoder_handle: TF Hub handle of the encoder; it is loaded with
      `trainable=True` so its weights are fine-tuned.
    dropout_rate: Dropout rate applied to the pooled encoder output.

  Returns:
    A `tf.keras.Model` mapping a string input named 'text' to one logit per
    example. The final layer has no activation, so pair it with a loss created
    with `from_logits=True` and apply a sigmoid to get probabilities.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(encoder_handle, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  # Use the pooled output as the feature vector for the classification head.
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(dropout_rate)(net)
  # activation=None: the head returns logits, not probabilities.
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
from tensorflow import keras


In [None]:
# Create the model in this cell: on a fresh kernel nothing above has defined
# `classifier_model`, so calling summary() first would raise NameError.
classifier_model = build_classifier_model()
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'pooled_output': (N 28763649    preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
______________________________________________________________________________________________

In [None]:
# Instantiate a fresh, uncompiled classifier from build_classifier_model().
classifier_model = build_classifier_model()

In [None]:
# The classifier head has no activation, so the loss must apply the sigmoid itself.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds the raw predictions. For logits the 50% decision
# boundary is 0.0; the default 0.5 would correspond to a probability of ~0.62
# and under-count positive predictions in the reported accuracy.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [None]:
# Number of passes over the training data.
epochs = 5
# Batches per epoch, read from the batched training dataset.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
# Total optimizer steps across all epochs; this drives the learning-rate schedule.
num_train_steps = steps_per_epoch * epochs
# The first 10% of the steps are used for warm-up.
num_warmup_steps = int(0.1*num_train_steps)

# NOTE: despite its name, `epsi` is passed as `init_lr` below, i.e. it is the
# initial learning rate handed to the optimizer factory, not an epsilon.
epsi = 3e-5
optimizer = optimization.create_optimizer(init_lr=epsi,
                                          num_warmup_steps=num_warmup_steps,
                                          num_train_steps=num_train_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, loss and metric configured in the cells above.
classifier_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Fine-tune the classifier on train_ds and evaluate on val_ds after every epoch;
# the object returned by fit() is kept in `history`.
history = classifier_model.fit(train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# make_csv_dataset shuffles rows and repeats the file indefinitely by default.
# Ask for a single, ordered pass so that iterating (or predicting on) this
# dataset terminates and rows stay aligned with test.csv.
# NOTE(review): `df_test` is not used by any later cell in this notebook — confirm
# whether this cell is still needed.
df_test = tf.data.experimental.make_csv_dataset(
      'test.csv',
      batch_size=32,
      select_columns=['text'],
      num_epochs=1,
      shuffle=False,
     )

In [None]:
# Load the unlabelled test data; 'test.csv' is resolved relative to the working directory.
test = pd.read_csv('test.csv')

In [None]:
# Plain Python list of the test texts, in the same row order as `test`.
test_l = test['text'].tolist()

In [None]:
test_l[:10]

In [None]:
# Run the classifier over the test texts. The model's final Dense layer has no
# activation, so `pr` holds raw logits rather than probabilities.
pr = classifier_model.predict(test_l)

In [None]:
len(pr)

3263

In [None]:
# Squash the logits into the (0, 1) range.
pr_sigm = tf.math.sigmoid(pr)

In [None]:
def pred(x, threshold=0.5):
  """Convert probabilities into hard 0/1 labels.

  Args:
    x: Array-like of probabilities (list, NumPy array, or eager tensor). It is
      flattened first, so both shape (N,) and shape (N, 1) are accepted.
    threshold: A value strictly greater than this is labelled 1, otherwise 0.

  Returns:
    A list of Python ints (0 or 1), one per element of `x`.
  """
  # Vectorised comparison: avoids a Python-level loop over individual tensors
  # and does not rely on the truth value of one-element arrays.
  return (np.asarray(x).reshape(-1) > threshold).astype(int).tolist()
pr_sigm

<tf.Tensor: shape=(3263, 1), dtype=float32, numpy=
array([[0.89383745],
       [0.9963296 ],
       [0.9852319 ],
       ...,
       [0.98236084],
       [0.8022641 ],
       [0.9185126 ]], dtype=float32)>

In [None]:
# Threshold the sigmoid outputs into 0/1 class labels.
pred_f = pred(pr_sigm)

In [None]:
pred_f[:10]

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

In [None]:
# Start from an empty frame; the submission columns are added in the following cells.
sub = pd.DataFrame()

In [None]:
test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [None]:
# Carry the row identifiers over from the test frame.
sub['id'] = test['id']

In [None]:
# Attach the predicted labels; assumes `pred_f` is in the same row order as `test`.
sub['target'] = pred_f

In [None]:
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv',index=False)