In [None]:
!pip install datasets transformers[sentencepiece]

Here are some resources that I used:
* **[Hugging Face Course](https://huggingface.co/course/chapter0/1?fw=pt)**
* **[Fine-Tuning Pretrained Models](https://towardsdatascience.com/russian-troll-tweets-classification-using-bert-abec09e43558)**
* **[Classify text with BERT](https://www.tensorflow.org/text/tutorials/classify_text_with_bert)**

# Importing TensorFlow and Transformers

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers

import transformers
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, RobertaTokenizer, DistilBertConfig, BertConfig, TFDistilBertModel

# Importing Datasets

In [None]:
# Read the three competition files (labelled train set, unlabelled test set
# and the sample submission) in one pass.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
        '../input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Number of rows in the training frame.
len(train_df)

In [None]:
# Distinct values found in the target column, and how many of them there are.
labels = pd.unique(train_df['target'])
len_labels = labels.shape[0]

(labels, len_labels)

## Splitting train data into train and validation datasets

In [None]:
# Hold out 20% of the labelled tweets as a validation set. The split is
# stratified on the target so both parts keep the same class balance, and the
# fixed random_state makes it repeatable.
train_text, val_text, train_labels, val_labels = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.2,
    stratify=train_df['target'],
    random_state=101,
)

# Only the text column of the test frame is used from here on.
test_text = test_df['text']

(len(train_text), len(val_text), len(test_text))

## Preprocessing text Data 

* Removing HTML content from tweets
* Removing punctuation
* Removing stop words (currently not applied in the code below)
* Lowercasing all tweets
* Lemmatizing words

**You can download my small NLP helper functions script from https://github.com/vishalrk1/pytorch/blob/main/nlp_helper.py**

In [None]:
# Download the NLP helper function script into the working directory so it can
# be imported below. The URL points at the repository's `main` branch, so the
# fetched contents are not pinned to a specific revision.
!wget https://raw.githubusercontent.com/vishalrk1/pytorch/main/nlp_helper.py

In [None]:
# Text-cleaning helpers from the downloaded nlp_helper.py script.
# NOTE(review): `remove_punctuation` is redefined locally right after these
# imports, so the imported version is shadowed; `lowercase_text` is only
# referenced in a commented-out line of this cell.
from nlp_helper import remove_html, remove_punctuation, lowercase_text, word_lemmatizer
import nltk
# Fetch the WordNet corpus - presumably needed by word_lemmatizer; confirm
# against nlp_helper.py.
nltk.download('wordnet')

# Remove punctuation
def remove_punctuation(text):
    """Remove punctuation characters from ``text`` and lowercase the result.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the characters of ``text`` that are not in
        the punctuation set below, converted to lowercase.
    """
    # Raw string: the previous literal relied on an invalid backslash-comma
    # escape, which newer Python versions warn about. The set of characters
    # is unchanged (it still includes both the backslash and the comma).
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    return ''.join(c for c in text if c not in punc).lower()

def preprocess_text(text):
    """Clean a single tweet and return it as one string.

    The steps are: strip HTML, remove punctuation (which also lowercases),
    then lemmatize. Stop-word removal and the helper's separate lowercasing
    step are not applied.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, built by joining the lemmatizer output.
    """
    cleaned = remove_punctuation(remove_html(text))
    lemmas = word_lemmatizer(cleaned)
    # NOTE(review): the pieces are joined with no separator. That is only
    # correct if word_lemmatizer returns characters (or a string) rather than
    # a list of words - confirm against nlp_helper.py.
    return ''.join(lemmas)


# Run the cleaning pipeline over every split; each Series is replaced by its
# cleaned counterpart.
train_text = train_text.apply(preprocess_text)
val_text = val_text.apply(preprocess_text)
test_text = test_text.apply(preprocess_text)

# Calculating Sequence length

In [None]:
# Word count (whitespace-separated tokens) of every tweet in the train set.
seq_len = [len(tweet.split()) for tweet in train_text]

# Histogram of those word counts over 30 bins.
pd.Series(seq_len).hist(bins=30)

# DistilBERT Model & Tokenizer

In [None]:
import transformers
from transformers import DistilBertTokenizer, RobertaTokenizer
distil_bert = 'distilbert-base-uncased'

# DistilBERT tokenizer. Special-token, padding and length options are per-call
# encoding arguments, so they are passed when encoding (see `tokenize`) and not
# to the constructor.
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True)

# Start from the checkpoint's own configuration and override only the dropout
# rates, so the architecture always matches the weights loaded below.
config = DistilBertConfig.from_pretrained(distil_bert, dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False

# Pretrained DistilBERT encoder (TensorFlow weights) built with that config.
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

In [None]:
# Tokenizing all sentences and creating the inputs for the model
from tqdm.notebook import tqdm 

def tokenize(sentences, tokenizer, max_length=30):
    """Encode sentences into fixed-length id, mask and segment arrays.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_length: Number of tokens every encoded sentence is padded or
            truncated to. Defaults to 30, the value that used to be
            hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_masks, token_type_ids)`` of int32
        NumPy arrays with one row per sentence.
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            # Explicit padding + truncation replace the deprecated
            # `pad_to_max_length` flag and guarantee that every row has
            # exactly `max_length` entries, so the arrays are rectangular.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return (
        np.asarray(input_ids, dtype='int32'),
        np.asarray(input_masks, dtype='int32'),
        np.asarray(input_segments, dtype='int32'),
    )

# Creating Tokenized data

In [None]:
# Encode each split with the shared tokenizer. Every call yields three int32
# arrays: token ids, attention masks and segment ids.
print('Input Tokens')
(input_train_ids,
 input_train_masks,
 input_train_segments) = tokenize(train_text.tolist(), tokenizer)

print('Validation Tokens')
(input_val_ids,
 input_val_masks,
 input_val_segments) = tokenize(val_text.tolist(), tokenizer)

print('test Tokens')
(input_test_ids,
 input_test_masks,
 input_test_segments) = tokenize(test_text.tolist(), tokenizer)

# Creating TF Datasets for model Training

In [None]:
# Inputs are (token ids, attention mask) pairs; outputs are the target labels.
train_input = tf.data.Dataset.from_tensor_slices((input_train_ids, input_train_masks))
train_output = tf.data.Dataset.from_tensor_slices(train_labels.to_numpy())
train_dataset = tf.data.Dataset.zip((train_input, train_output))
train_dataset = (
    train_dataset
    # Shuffle over the whole training set. By default the order is reshuffled
    # on every pass, so each epoch sees differently composed batches instead
    # of the exact same sequence.
    .shuffle(buffer_size=len(input_train_ids), seed=101)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

train_dataset, len(train_dataset)

In [None]:
# Validation pipeline: (token ids, attention mask) inputs zipped with the
# labels, then batched by 32 and prefetched. No shuffling is applied.
val_input = tf.data.Dataset.from_tensor_slices((input_val_ids, input_val_masks))
val_output = tf.data.Dataset.from_tensor_slices(val_labels.to_numpy())
val_dataset = (
    tf.data.Dataset.zip((val_input, val_output))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

(val_dataset, len(val_dataset))

In [None]:
test_input = tf.data.Dataset.from_tensor_slices((input_test_ids, input_test_masks))
# `(test_input)` is just `test_input`, not a tuple. With the trailing comma
# each element becomes `((ids, masks),)`, i.e. an inputs-only structure.
# A bare `(ids, masks)` pair would be interpreted by Keras as
# `(inputs, targets)` if this dataset were passed to the model.
test_dataset = tf.data.Dataset.zip((test_input,))
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

test_dataset, len(test_dataset)

# Defining Model

In [None]:
# Two fixed-length int32 inputs: token ids and the matching attention mask.
# The length (30) has to agree with the max length used when tokenizing.
input_ids_in = tf.keras.layers.Input(shape=(30,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(30,), name='masked_token', dtype='int32')

# First element of the DistilBERT output, fed to the recurrent head as a sequence.
embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(embedding_layer)
X = tf.keras.layers.Dropout(0.4)(X)
X = tf.keras.layers.LSTM(128, return_sequences=False)(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
# Single sigmoid unit: probability of the positive class.
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

# Freeze the pretrained encoder by reference rather than by its position in
# `model.layers`, so only the recurrent/dense head is trained regardless of
# how Keras orders the layers.
transformer_model.trainable = False

In [None]:
 # Print the layer-by-layer summary of the assembled model.
 # NOTE(review): this line carries a stray leading space - likely unintended;
 # confirm and remove it so the cell is also valid as plain Python.
 model.summary()

# optimizer, Loss function and training Model

In [None]:
# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

history_1 = model.fit(
    train_dataset,
    epochs=25,
    # No steps_per_epoch: train_dataset is finite, so every epoch already runs
    # through all of its batches.
    validation_data=val_dataset,
    # Only half of the validation batches are scored per epoch. Evaluation
    # restarts from the beginning of val_dataset each time, so it is always
    # the same half that drives the callbacks below.
    validation_steps=int(0.5 * len(val_dataset)),
    callbacks=[
        # Stop after 4 epochs without val_loss improvement and roll back to
        # the best weights seen.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True),
        # Halve the learning rate after 3 epochs without val_loss improvement.
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3),
    ],
)

In [None]:
# Score the trained model on the complete validation dataset (loss and accuracy).
model.evaluate(val_dataset)

# saving test df predictions to csv file

In [None]:
# Sigmoid outputs of the model for the tokenized test tweets.
pred = model.predict([input_test_ids, input_test_masks])
# Drop the size-1 axes, then round each probability to 0. or 1.
pred = np.round(np.squeeze(pred))

In [None]:
# Assemble the submission frame: test ids next to the predicted class as int.
df = pd.DataFrame({
    'id': test_df['id'],
    'target': pred.astype(int),
})

# Write it without the index column, then display the frame.
df.to_csv('submission.csv', index=False)
df