In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Extracting data and preprocessing:

In [None]:
# Load the competition's training CSV into a DataFrame and display it.
train_ds = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train_ds

In [None]:
# Load the competition's test CSV into a DataFrame and display it.
test_ds = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_ds

In [None]:
# Count how many training rows carry each distinct value of the 'target' column.
train_ds['target'].value_counts()

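The classes are reasonably balanced, so no class-imbalance handling is needed here. If a significant imbalance were found, it would have to be handled accordingly (for example with resampling or class weights).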

In [None]:
# Number of null entries in each column of the training data.
train_ds.isnull().sum()

Here we observe that the `keyword` and `location` columns contain null values. Since these columns are not important for this approach, we drop them along with `id`:

In [None]:
# Drop the id, keyword and location columns from the training data.
train_ds = train_ds.drop(columns=['id', 'keyword', 'location'])
train_ds

In [None]:
# Drop the same three columns (id, keyword, location) from the test data.
test_ds = test_ds.drop(columns=['id', 'keyword', 'location'])
test_ds

Generally, we employ the following steps while preprocessing texts:
<ol>
    <li>Tokenising the string</li>
    <li>Converting characters to lowercase</li>
    <li>Removing stop words and punctuations</li>
    <li>Stemming or lemmatization</li>
</ol>

In [None]:
import re                                  
import string  
import nltk
# Fetch the NLTK resources used below: the stop-word lists and the WordNet data for the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer  

In [None]:
# Strip tweet-specific noise from every training tweet before tokenising.
# NOTE(review): in the hyperlink pattern `.*` runs to the end of the line, so any
# words that follow a link on the same line are removed along with it. If that is
# tightened, the identical change must be made where the test tweets are cleaned.
noise_patterns = (
    r'https?:\/\/.*[\r\n]*',  # hyperlinks
    r'#',                     # the hash sign of hashtags (the tag word itself is kept)
    r'[0-9]',                 # digits
    r'@[A-Za-z]*',            # @ tags made of ASCII letters
)

sent = []
for tweet in train_ds['text']:
    for pattern in noise_patterns:
        tweet = re.sub(pattern, '', tweet)
    sent.append(tweet)

In [None]:
# Spot-check one cleaned tweet.
print(sent[100])

In [None]:
# Tokenise each cleaned tweet: lower-case it, strip any remaining handles and
# shorten exaggerated character repetitions.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

tokenized_sent = [tokenizer.tokenize(cleaned) for cleaned in sent]

In [None]:
# Spot-check the tokens of the same tweet.
print(tokenized_sent[100])

In [None]:
# English stop-word list from NLTK; it is used by the filtering step below.
stopwords_english = stopwords.words('english')

# Show both filter vocabularies: the stop words and the punctuation characters.
print('Stop words in english : \n', stopwords_english, sep='\n')
print('\nPunctuations : \n', string.punctuation, sep='\n')

In [None]:
# Keep a token only if it is longer than two characters, is not an English stop
# word and does not occur in string.punctuation.
# NOTE(review): `in string.punctuation` is a substring test on a str, so a
# multi-character token such as '...' is not rejected by it.
formatted_sent = [
    [
        token
        for token in tokens
        if len(token) > 2
        and token not in string.punctuation
        and token not in stopwords_english
    ]
    for tokens in tokenized_sent
]

In [None]:
# Spot-check the filtered tokens of the same tweet.
print(formatted_sent[100])

In [None]:
# Lemmatise every remaining token with WordNet (lemmatize() is called with its default arguments).
lemma = WordNetLemmatizer()

lemma_sent = [
    [lemma.lemmatize(token) for token in tokens]
    for tokens in formatted_sent
]

In [None]:
# Spot-check the lemmatised tokens of the same tweet.
print(lemma_sent[100])

In [None]:
# Join each tweet's lemmatised tokens back into one space-separated string.
# A comprehension is used so that no loop variable rebinds `sent`: that name holds
# the list of cleaned tweets, and rebinding it to a single string would make a
# re-run of the tokenising cell iterate over characters instead of tweets.
final_sentence_list = [
    ' '.join(str(word) for word in words)
    for words in lemma_sent
]

In [None]:
# Spot-check the final, re-joined text of the same tweet.
print(final_sentence_list[100])

In [None]:
# Attach the preprocessed text to the training frame as a new 'FormattedText' column.
train_ds = train_ds.assign(FormattedText=final_sentence_list)

In [None]:
# Display the training frame, which now includes the 'FormattedText' column.
train_ds

In [None]:
# Replace the raw 'text' column with the preprocessed one: drop the original,
# then give 'FormattedText' the name 'text'.
train_ds = (
    train_ds
    .drop(columns=['text'])
    .rename(columns={'FormattedText': 'text'})
)
train_ds

# 2. Converting the text to a numerical vector format using the TensorFlow `TextVectorization` layer:

In [None]:
# Features are the preprocessed text; labels are the 'target' column.
X_train, y_train = train_ds['text'], train_ds['target']


In [None]:
# Convert both pandas Series to NumPy arrays for tf.data.
X_train_array, y_train_array = X_train.to_numpy(), y_train.to_numpy()


In [None]:
# Display the array of preprocessed training texts.
X_train_array

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
# NOTE(review): this switches tf.function-wrapped code to eager execution for the whole
# session, which is normally a debugging aid and can slow training — confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
# Build a tf.data dataset of (text, label) pairs from the training arrays.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_array, y_train_array))

# Print the first (text, label) pair as a sanity check.
for text,label in train_dataset.take(1):
    print('Text: ', text.numpy())
    print('Label: ', label.numpy())

In [None]:
# test_dataset = tf.data.Dataset.from_tensor_slices((X_test_array))

# for test_text in test_dataset.take(1):
#     print('Text: ', test_text.numpy())

In [None]:
# Shuffle-buffer size and mini-batch size used when building the tf.data pipelines.
BUFFER_SIZE, BATCH_SIZE = 4000, 64

In [None]:
# Shuffle, batch and prefetch the training pairs.
# The pipeline is rebuilt from the source arrays rather than from `train_dataset`
# itself, so re-running this cell cannot shuffle and batch an already-batched dataset.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train_array, y_train_array))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# 3. Model Building:

In [None]:
# Upper bound on the vocabulary size of the text vectorizer.
VOCAB_SIZE = 12000

# This layer is used only by the LSTM and GRU architectures, to turn words into
# integer token ids. BERT uses its own BERT-specific vectorization instead.
encoder = layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Fit the vocabulary on the text part of the (text, label) training batches.
encoder.adapt(train_dataset.map(lambda tweet, _label: tweet))

In [None]:
# Pull the learned vocabulary out of the vectorizer and show entries 10 through 19.
vocabulary = np.array(encoder.get_vocabulary())
vocabulary[10:20]

In [None]:
# Encode one known training example to sanity-check the vectorizer.
# The example is taken straight from the training array instead of relying on a
# loop variable named `text` left over from an earlier cell; the BERT section
# later rebinds that name to the tensorflow_text module, which would break a re-run.
sample_text = tf.constant(X_train_array[0])
print("Original Text :" +str(sample_text))
encoded_text = encoder(sample_text).numpy()
print("Numeric Represenation :" +str(encoded_text))


# 3.1 Text classification with LSTM:

In [None]:
# Single bidirectional-LSTM classifier:
# vectorize -> embed -> BiLSTM -> dropout -> max-pool over time -> dense head.
model = tf.keras.Sequential()
model.add(encoder)
model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=16,
    mask_zero=True,  # masking handles the variable sequence lengths
))
model.add(layers.Bidirectional(layers.LSTM(16, return_sequences=True)))
model.add(layers.Dropout(0.5))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(1))  # single output unit with no activation

model.summary()

In [None]:
# Lower the learning rate by 4x when the training loss stops improving for 2 epochs.
# The floor must sit below the starting rate: the model is compiled with
# optimizer='adam', whose default learning rate is 1e-3, so the previous
# min_lr=0.001 meant the rate could never actually be reduced.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.25, patience=2, min_lr=1e-5)

In [None]:
# The output layer has no activation, so the loss is computed from logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM model on the batched training set.
epochs = 5
history = model.fit(
    train_dataset,
    callbacks=[reduce_lr],
    epochs=epochs,
)

# 3.2 Text classification with stacked LSTMs :

In [None]:
# Stacked variant: two bidirectional LSTM layers (16 then 8 units) ahead of the
# same dropout / max-pool / dense head.
stacked_model = tf.keras.Sequential()
stacked_model.add(encoder)
stacked_model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=16,
    mask_zero=True,  # masking handles the variable sequence lengths
))
stacked_model.add(layers.Bidirectional(layers.LSTM(16, return_sequences=True)))
stacked_model.add(layers.Bidirectional(layers.LSTM(8, return_sequences=True)))
stacked_model.add(layers.Dropout(0.5))
stacked_model.add(layers.GlobalMaxPool1D())
stacked_model.add(layers.Dropout(0.5))
stacked_model.add(layers.Dense(8, activation='relu'))
stacked_model.add(layers.Dropout(0.25))
stacked_model.add(layers.Dense(1))  # single output unit with no activation

stacked_model.summary()

In [None]:
# Lower the learning rate by 4x when the training loss stops improving for 2 epochs.
# The floor must sit below the starting rate: the stacked model is compiled with
# optimizer='adam', whose default learning rate is 1e-3, so the previous
# min_lr=0.001 meant the rate could never actually be reduced.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.25, patience=2, min_lr=1e-5)

In [None]:
# The output layer has no activation, so the loss is computed from logits.
stacked_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the stacked-LSTM model on the batched training set.
epochs = 5
stacked_history = stacked_model.fit(
    train_dataset,
    callbacks=[reduce_lr],
    epochs=epochs,
)

# 3.3 Text classification with GRUs :

In [None]:
# GRU variant: one bidirectional GRU layer (16 units) ahead of the same
# dropout / max-pool / dense head.
gru_model = tf.keras.Sequential()
gru_model.add(encoder)
gru_model.add(layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=16,
    mask_zero=True,  # masking handles the variable sequence lengths
))
gru_model.add(layers.Bidirectional(layers.GRU(16, return_sequences=True)))
gru_model.add(layers.Dropout(0.5))
gru_model.add(layers.GlobalMaxPool1D())
gru_model.add(layers.Dropout(0.5))
gru_model.add(layers.Dense(8, activation='relu'))
gru_model.add(layers.Dropout(0.25))
gru_model.add(layers.Dense(1))  # single output unit with no activation

gru_model.summary()

In [None]:
# Lower the learning rate by 4x when the training loss stops improving for 2 epochs.
# The floor must sit below the starting rate: the GRU model is compiled with
# optimizer='adam', whose default learning rate is 1e-3, so the previous
# min_lr=0.001 meant the rate could never actually be reduced.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.25, patience=2, min_lr=1e-5)

In [None]:
# The output layer has no activation, so the loss is computed from logits.
gru_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the GRU model on the batched training set.
epochs = 5
gru_history = gru_model.fit(
    train_dataset,
    callbacks=[reduce_lr],
    epochs=epochs,
)

# 3.4 Text Classification with BERT (Transformer Model):

In [None]:
#pip install tensorflow-text

In [None]:
#pip install tf-models-official

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

In [None]:
# Name of the small BERT variant to fine-tune.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

# Note: the available BERT models and their TF Hub handles are listed on the
# TensorFlow "Classify text with BERT" tutorial page.

# The encoder handle is derived from the model name so the two cannot drift apart.
tfhub_handle_encoder = f'https://tfhub.dev/tensorflow/{bert_model_name}/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
def classifier_model():
    """Build a binary text classifier on top of a pretrained BERT encoder from TF Hub.

    Raw strings are passed through the TF Hub preprocessing layer
    (`tfhub_handle_preprocess`) and the trainable BERT encoder
    (`tfhub_handle_encoder`). Dropout and a single-unit dense head are then
    applied to the encoder's `pooled_output`.

    Returns:
        tf.keras.Model: maps string inputs to one output value per example.
        The dense head has `activation=None`, so the output is not squashed.
    """
    # Pretrained part: raw text -> BERT inputs -> BERT outputs
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='input')
    to_bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='vectorizing')
    bert_inputs = to_bert_inputs(raw_text)
    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT')
    bert_outputs = bert_encoder(bert_inputs)

    # Custom classification head on the pooled sentence representation
    pooled = tf.keras.layers.Dropout(0.1)(bert_outputs['pooled_output'])
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(pooled)

    return tf.keras.Model(inputs=raw_text, outputs=logits)

In [None]:
# Instantiate the BERT-based classifier defined above.
bert_model = classifier_model()

In [None]:
# Print the layer-by-layer summary of the BERT classifier.
bert_model.summary()

In [None]:
# Training schedule: total steps = batches per epoch * epochs, with the first 10% used for warm-up.
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

# AdamW optimizer built by the tf-models-official helper.
init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

# Plain Adam would also work, but transformer-based models are best fine-tuned
# with the same optimizer settings that were used for their pretraining.

In [None]:
# The classifier head has no activation, so the loss is computed from logits.
bert_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Fine-tune the BERT classifier on the batched training set for `epochs` epochs.
bert_history = bert_model.fit(train_dataset,epochs=epochs)

# 4. Preparing test data for submission:

In [None]:
# Load the sample submission file; it is used as the template for the final submission.
submission_ds = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
submission_ds

In [None]:
# Apply the same regex clean-up to the test tweets as was applied to the training tweets.
# NOTE(review): in the hyperlink pattern `.*` runs to the end of the line, so any
# words that follow a link on the same line are removed along with it. If that is
# tightened, the identical change must be made where the training tweets are cleaned.
test_noise_patterns = (
    r'https?:\/\/.*[\r\n]*',  # hyperlinks
    r'#',                     # the hash sign of hashtags (the tag word itself is kept)
    r'[0-9]',                 # digits
    r'@[A-Za-z]*',            # @ tags made of ASCII letters
)

sent = []
for tweet in test_ds['text']:
    for pattern in test_noise_patterns:
        tweet = re.sub(pattern, '', tweet)
    sent.append(tweet)

In [None]:
# Tokenise each cleaned test tweet with the same tokenizer settings as for training.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

tokenized_sent = [tokenizer.tokenize(cleaned) for cleaned in sent]

In [None]:
# Keep a token only if it is longer than two characters, is not an English stop
# word and does not occur in string.punctuation (same filter as for training).
# NOTE(review): `in string.punctuation` is a substring test on a str, so a
# multi-character token such as '...' is not rejected by it.
formatted_sent = [
    [
        token
        for token in tokens
        if len(token) > 2
        and token not in string.punctuation
        and token not in stopwords_english
    ]
    for tokens in tokenized_sent
]

In [None]:
# Lemmatise the filtered test tokens with WordNet (lemmatize() is called with its default arguments).
lemma = WordNetLemmatizer()

lemma_sent = [
    [lemma.lemmatize(token) for token in tokens]
    for tokens in formatted_sent
]

In [None]:
# Join each test tweet's lemmatised tokens back into one space-separated string.
# A comprehension is used so that no loop variable rebinds `sent`: that name holds
# the list of cleaned test tweets, and rebinding it to a single string would make a
# re-run of the tokenising cell iterate over characters instead of tweets.
final_sentence_list = [
    ' '.join(str(word) for word in words)
    for words in lemma_sent
]

In [None]:
# Overwrite the raw test text with its preprocessed form.
test_ds = test_ds.assign(text=final_sentence_list)
test_ds

In [None]:
# Take the preprocessed test text and convert it to a NumPy array for tf.data.
X_test = test_ds['text']
X_test_array = X_test.to_numpy()

In [None]:
# Display the array of preprocessed test texts.
X_test_array

In [None]:
# Build a text-only tf.data dataset for the test set and print its first two entries.
test_dataset = tf.data.Dataset.from_tensor_slices(X_test_array)

for sample in test_dataset.take(2):
    print('Text: ', sample.numpy())

In [None]:
# Batch and prefetch the test texts (no shuffling, so prediction order matches row order).
# The pipeline is rebuilt from the source array rather than from `test_dataset`
# itself, so re-running this cell cannot batch an already-batched dataset.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(X_test_array)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# encoded_text = []

# for test_text in test_dataset:
#     encoded_text.append(encoder(test_text).numpy())

# # encoded_text


In [None]:
# Run the BERT classifier over the batched test set. Its final Dense layer has no
# activation, so these outputs are raw scores rather than probabilities.
y_pred = bert_model.predict(test_dataset)

In [None]:
# Turn each raw score into a class label: non-negative scores become 1, negative scores 0.
result = [1 if score >= 0 else 0 for score in y_pred]

In [None]:
# Write the predicted labels into the submission frame's 'target' column.
submission_ds = submission_ds.assign(target=result)
submission_ds

In [None]:
# Count how many test rows were assigned to each predicted class.
submission_ds['target'].value_counts()

In [None]:
# Save the submission file to the working directory, without the DataFrame index.
submission_ds.to_csv(path_or_buf='submission.csv', index=False)

# 5. Please upvote this notebook if you find it helpful.

References: 
https://www.tensorflow.org/text/tutorials/classify_text_with_bert