In [None]:
!pip install anyascii
!pip install contractions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from anyascii import anyascii
from contractions import fix

import re
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

import tensorflow as tf
from tensorflow import keras

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import TFBertModel


import warnings
# Install a global "ignore" filter: every Python warning raised after this
# line is suppressed, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

# Seed the global NumPy and TensorFlow random generators.
np.random.seed(1)
tf.random.set_seed(1)

In [None]:
# Read the training CSV from the relative ../input directory.
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
# Preview the first rows.
train_df.head()

In [None]:
# Keep only the text column and its target label.
train_df = train_df[['text', 'target']]
train_df.head()

In [None]:
# Count missing values per column.
train_df.isna().sum()

In [None]:
# X: Python list of the text column's values; y: NumPy array of the target column.
X, y = train_df.text.to_list(), train_df.target.values

In [None]:
# Hold out 10% of the rows; stratifying on y keeps the class ratio the same in
# both splits, and random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1, stratify=y)

## Cleaning Texts

In [None]:
# Substitutions applied, in this exact order, before the text is converted to
# ASCII. Every match is replaced by a single space.
_PRE_ASCII_PATTERNS = tuple(
    re.compile(regex)
    for regex in (
        # URL links
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        # Twitter handles
        r"@[\w_]+",
        # flight codes: two or more letters followed by digits (optional colon)
        r"[a-zA-Z]{2,}[0-9]+:?",
        # distances and times: digits followed by letters
        r"[0-9]+[a-zA-Z]{1,}",
        # money and clock times: optional "$", digits, optional colon
        r"\$?[0-9]+:?",
        # runs of two or more dots or dashes
        r"[.]{2,}|[-]{2,}",
        # leftover handles, pipes, repeated question marks, literal backslash-n
        r"@[\w']+|(\|)|[?]{2,}|(\\n){1,}",
    )
)

# "&word" tokens with an optional trailing semicolon (e.g. "&amp;").
_AMPERSAND_WORD = re.compile(r"&[\w]+;?")


def clean_text(text):
    """Strip noise from one text string and return the cleaned string.

    Steps, in order:
      1. Replace URLs, handles, letter+digit codes, digit+letter tokens,
         numbers, dot/dash runs and a few leftover symbols with a space,
         trimming the ends after each substitution.
      2. Convert the text with ``anyascii``.
      3. Replace real newlines and the fragments ``U/`` and ``U_`` with a space.
      4. Replace ``&word;`` tokens with a space.
      5. Pass the result through ``contractions.fix``.

    Runs of spaces left behind by the substitutions are not collapsed.
    """
    for pattern in _PRE_ASCII_PATTERNS:
        text = pattern.sub(" ", text).strip()

    text = anyascii(text)
    for fragment in ('\n', 'U/', 'U_'):
        text = text.replace(fragment, ' ')
    text = _AMPERSAND_WORD.sub(" ", text.strip()).strip()

    return fix(text)

In [None]:
# Run the cleaning routine over both splits; the raw lists are left untouched.
X_train_cleaned = list(map(clean_text, X_train))
X_test_cleaned = list(map(clean_text, X_test))

In [None]:
# Number of tokens every encoded text is padded or truncated to.
max_len = 70

bert_name = 'bert-base-uncased'
# Padding, truncation and special-token handling are encode-time options and
# are supplied by the encoding call (see bert_encoder), so only the casing
# flag is set when the tokenizer is loaded.
tokenizer = BertTokenizer.from_pretrained(bert_name, do_lower_case=True)

In [None]:
def bert_encoder(text):
    """Encode one text with the notebook-level ``tokenizer``.

    The text is encoded with special tokens and padded or truncated to
    ``max_len`` tokens.

    Returns:
        A 3-tuple ``(input_ids, token_type_ids, attention_mask)`` taken from
        the tokenizer's output, in that order.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    features = tokenizer.encode_plus(text, **encode_options)

    # The order matters: downstream cells unpack the three parts positionally.
    return tuple(features[key] for key in ('input_ids', 'token_type_ids', 'attention_mask'))

In [None]:
# Encode every cleaned text. Stacking the (input_ids, token_type_ids,
# attention_mask) triples gives one array per split.
train_bert = np.array(list(map(bert_encoder, X_train_cleaned)))
test_bert = np.array(list(map(bert_encoder, X_test_cleaned)))

# One-hot encode the targets into two int64 columns.
train_label = keras.utils.to_categorical(np.array(y_train), num_classes=2).astype('int64')
test_label = keras.utils.to_categorical(np.array(y_test), num_classes=2).astype('int64')

In [None]:
# Expected (n_train_texts, 3, max_len): per text, the input_ids,
# token_type_ids and attention_mask rows returned by bert_encoder.
train_bert.shape

In [None]:
# Axis 1 of the encoded arrays holds, in bert_encoder's return order:
# 0 = input_ids, 1 = token_type_ids, 2 = attention_mask.
# Indexing that axis directly always yields (n_texts, max_len) arrays; a bare
# .squeeze() would also drop the batch axis when a split has a single row.
train_text, train_seg, train_masks = (train_bert[:, part] for part in range(3))
test_text, test_seg, test_masks = (test_bert[:, part] for part in range(3))

In [None]:
def map_features(input_ids, attention_masks, token_type_ids, y):
    """Pack one example into the ``(features, label)`` pair used for training.

    Args:
        input_ids: token ids of the encoded text.
        attention_masks: mask separating real tokens from padding.
        token_type_ids: segment ids of the encoded text.
        y: label for the example, passed through unchanged.

    Returns:
        ``(features, y)`` where ``features`` is a dict keyed by the BERT
        model's argument names.
    """
    features = {
        'input_ids': input_ids,
        # The key must be the singular 'attention_mask': that is the model's
        # argument name, and a misspelled key does not reach it as the mask.
        'attention_mask': attention_masks,
        'token_type_ids': token_type_ids,
    }
    return features, y

In [None]:
# Training pipeline: pack features, shuffle, then batch.
# The shuffle buffer covers the whole training set; a buffer smaller than the
# dataset only mixes examples that are already close together.
train_ds = tf.data.Dataset.from_tensor_slices((train_text, train_masks, train_seg, train_label))
train_ds = train_ds.map(map_features).shuffle(len(train_label)).batch(32)

# Validation pipeline: same packing and batch size, no shuffling.
test_ds = tf.data.Dataset.from_tensor_slices((test_text, test_masks, test_seg, test_label))
test_ds = test_ds.map(map_features).batch(32)

## Using Pretrained Model

In [None]:
# Re-seed so that re-running this cell gives a repeatable model initialisation.
tf.random.set_seed(1)

# Pretrained BERT encoder with a sequence-classification head.
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

optimizer = keras.optimizers.Adam(learning_rate=2e-5)
# The labels are one-hot over two mutually exclusive classes and predictions
# are decoded with argmax over the logits, so softmax cross-entropy is the
# matching objective. Binary cross-entropy would score the two logits as
# independent sigmoid outputs.
loss = keras.losses.CategoricalCrossentropy(from_logits=True)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

bert_model.summary()

In [None]:
# Fine-tune for two epochs; test_ds is evaluated as validation data after each epoch.
history = bert_model.fit(train_ds, epochs=2, validation_data=test_ds)

### Get Predictions

In [None]:
# Load only the columns needed for the submission.
sub_df = pd.read_csv("../input/nlp-getting-started/test.csv", usecols=['id', 'text'])

sub_texts = sub_df.text.to_list()

# Apply the same cleaning and encoding as for the training data.
sub_texts_cleaned = [clean_text(text) for text in sub_texts]
sub_texts_seqs = np.array([bert_encoder(text) for text in sub_texts_cleaned])

# Axis 1 holds, in bert_encoder's return order: input_ids, token_type_ids,
# attention_mask. Indexing that axis keeps the batch dimension even when the
# file has a single row, which a bare .squeeze() would drop.
sub_txt, sub_seg, sub_masks = (sub_texts_seqs[:, part] for part in range(3))

In [None]:
# Inputs are passed positionally as (input_ids, attention_mask, token_type_ids).
sub_outputs = bert_model.predict((sub_txt, sub_masks, sub_seg))

# Predicted class = index of the larger logit in each row.
predictions = sub_outputs.logits.argmax(axis=1)
predictions.shape

In [None]:
# save predictions in csv file

sub_df['target'] = predictions
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts. errors='ignore' lets the cell be re-run after 'text' is gone.
sub_df = sub_df.drop(columns='text', errors='ignore')
sub_df.to_csv("submission.csv", index=False)