In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from textblob import TextBlob
import tensorflow.keras.backend as K
from wordcloud import WordCloud
import seaborn as sns
import numpy as np
import os, re
from tensorflow.keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from transformers import *
from tqdm.notebook import tqdm
from sklearn.model_selection import GroupKFold, KFold
import tensorflow_hub as hub
import sentencepiece
# import tokenization

In [None]:
# !wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Uncased English BERT encoder from TF Hub, wrapped as a Keras layer.
# trainable=True makes the encoder weights trainable along with the
# classification head built on top of it.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(
    handle=module_url,
    trainable=True,
)

In [None]:
# Read the train and test CSVs, then preview the first rows of the
# training frame as the cell output.
train_df, test_df = (
    pd.read_csv('../input/nlp-getting-started/train.csv'),
    pd.read_csv('../input/nlp-getting-started/test.csv'),
)
train_df.head()

In [None]:
# Display the (rows, columns) shape of the train and test frames side by side.
train_df.shape, test_df.shape

In [None]:
# Print column dtypes and non-null counts for the training frame.
train_df.info()

In [None]:
# Bar chart of how many training rows fall into each `target` value.
sns.set_style('darkgrid')
sns.catplot(
    data=train_df,
    x="target",
    kind="count",
    height=4,
    aspect=2,
)

In [None]:
# Ordered (regex, replacement) rules applied by `cleaned_text`.
# Order matters: links/mentions are stripped first, whitespace is collapsed last.
_CLEANING_RULES = (
    (r"http\S+", ""),            # URLs
    (r"pic\.twitter\S+", ""),    # picture links (dot escaped so it is literal)
    (r"@\S+", ""),               # @mentions
    (r"#", ""),                  # keep the hashtag word, drop the '#'
    (r"goooooooaaaaaal", "goal"),
    (r"SOOOO", "SO"),
    (r"LOOOOOOL", "LOL"),
    (r"Cooool", "cool"),
    (r"\|", ""),                 # literal pipe; an unescaped '|' is an empty alternation and removes nothing
    (r"\?{2,}", "? "),           # collapse repeated punctuation
    (r"\.{2,}", ". "),
    (r"!{2,}", "! "),
    (r"&amp;", "&"),             # HTML entities
    (r"\bComin\b", "Coming"),    # whole word only, so "Coming" is not turned into "Comingg"
    (r"&gt;", "> "),
    (r"&lt;", "< "),
    # NOTE(review): this removes ANY character followed by ':' (e.g. "news:" -> "new").
    # Kept as in the original because the intended target is unclear - confirm.
    (r".:", ""),
    (r"baaaack", "back"),
    (r"\bRT\b", ""),             # retweet marker as a whole word; no longer mangles "ALERT", "AIRPORT", ...
    (r"\s{2,}", " "),            # collapse runs of whitespace left by the removals above
)


def cleaned_text(text):
    """Normalise a raw tweet string before tokenisation.

    Applies the substitutions in ``_CLEANING_RULES`` in order (URL, mention and
    hashtag-symbol removal, a few hand-picked spelling fixes, punctuation and
    whitespace collapsing) and lower-cases the result.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased string. Leading/trailing single spaces left
        behind by removals are not stripped.
    """
    clean = text
    for pattern, replacement in _CLEANING_RULES:
        clean = re.sub(pattern, replacement, clean)
    return clean.lower()
# Add a `cleaned_text` column to both frames by running every raw tweet
# through cleaned_text().
train_df['cleaned_text'], test_df['cleaned_text'] = (
    frame['text'].apply(cleaned_text) for frame in (train_df, test_df)
)

In [None]:
# Preview the training frame again, now including the `cleaned_text` column.
train_df.head()

In [None]:
# Whitespace-separated word count of every cleaned training tweet; the
# maximum is printed as a reference point when choosing a sequence length.
length_of_tweet = list(map(len, map(str.split, train_df['cleaned_text'])))
print(max(length_of_tweet))

In [None]:
# Fixed sequence length (in tokens, including [CLS]/[SEP]) used for encoding.
max_len = 40

# WordPiece tokenizer built from a local vocab file.
# NOTE(review): the vocab path points at a `bert-large-uncased` directory while the
# hub encoder loaded earlier is the L-12/H-768 (base-size) uncased model - confirm
# the two share the same vocabulary, otherwise token ids will not line up.
tokenizer = BertTokenizer.from_pretrained('../input/huggingface-bert/bert-large-uncased/vocab.txt')

In [None]:
def bert_encode(texts, tokenizer, max_len=40):
    """Encode strings into the three fixed-length integer arrays fed to BERT.

    Each text is tokenised with ``tokenizer.tokenize``, truncated to
    ``max_len - 2`` tokens, wrapped in ``[CLS]`` ... ``[SEP]`` and right-padded
    with id 0 (assumed to be the tokenizer's padding id) up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
        max_len: Total sequence length including the two special tokens.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each of
        shape ``(len(texts), max_len)``. ``pad_masks`` is 1 on real tokens and 0
        on padding; ``segment_ids`` is all zeros (single-segment input).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, i.e. there is no room for
            the ``[CLS]`` and ``[SEP]`` tokens.
    """
    # With max_len < 2 the slice below would use a negative bound and the rows
    # would come out with inconsistent lengths, so reject it up front.
    if max_len < 2:
        raise ValueError(f"max_len must be at least 2 to fit [CLS] and [SEP], got {max_len}")

    all_tokens, all_masks, all_segments = [], [], []

    for text in texts:
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # Explicit reshape keeps the (n, max_len) layout even when `texts` is empty,
    # where np.array([]) alone would yield a float array of shape (0,).
    shape = (len(all_tokens), max_len)
    return tuple(
        np.array(rows, dtype=int).reshape(shape)
        for rows in (all_tokens, all_masks, all_segments)
    )

In [None]:
def build_model(bert_layer, max_len=40, learning_rate=1e-5):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Sequence length of the three integer inputs.
        learning_rate: Step size for the Adam optimizer (defaults to the
            previously hard-coded 1e-5).

    Returns:
        A compiled ``tf.keras`` model with three ``(max_len,)`` int32 inputs and
        a single sigmoid output, trained with binary cross-entropy and tracking
        accuracy.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 along the sequence axis is the [CLS] token that bert_encode
    # places first in every sequence; its vector represents the whole tweet.
    clf_output = sequence_output[:, 0, :]

    # Small MLP head with dropout between the dense layers.
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.4)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.4)(net)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Encode the cleaned tweets of both splits into (token ids, masks, segment ids)
# at a common sequence length, and pick out the training labels.
max_len = 40
train_input, test_input = (
    bert_encode(frame.cleaned_text.values, tokenizer, max_len=max_len)
    for frame in (train_df, test_df)
)
train_labels = train_df['target']

In [None]:
# Build and compile the classifier on top of the hub BERT layer, then print
# its layer-by-layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

In [None]:
# Fine-tune the model on the encoded training tweets; validation_split=0.2
# reserves 20% of the training data for validation. The returned history
# object is kept in `train_history`.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Score the encoded test tweets with the trained model.
# Presumably one sigmoid score per row, shape (n_test, 1) - the next cell flattens it.
predictions = model.predict(test_input)

In [None]:
# Flatten the model output into a 1-D array of scores.
predictions = predictions.ravel()

In [None]:
# Load the sample submission and preview it to see the expected file layout.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook.
df = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
df.head()

In [None]:
# Wrap the scores in a Series and show summary statistics of their distribution.
predictions = pd.Series(predictions)
predictions.describe()

In [None]:
# Start the submission frame with the raw scores in a `target` column and
# pull them out as a plain Python list for thresholding in the next cell.
sub_df = pd.DataFrame(predictions, columns = ['target'])
target = sub_df['target'].tolist()
# Reminder of the vectorised pattern for conditional replacement:
# sub_df['column_name'] = numpy.where(condition, new_value, DataFrame.column_name)

In [None]:
# Binarise the scores: anything below 0.5 becomes class 0, everything else class 1.
target = [int(not score < 0.5) for score in target]

In [None]:
# Replace the scores with the 0/1 labels, attach the test ids, and order the
# columns as (id, target); then preview the result.
sub_df = (
    sub_df
    .assign(target=target, id=test_df['id'])
    .reindex(columns=['id', 'target'])
)
sub_df.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub_df.to_csv('submission.csv', index = False)