In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
import re
import string
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

In [None]:
import nltk
# `wnl` is the shared lemmatizer instance that the `clean` function calls for every token.
from nltk.stem import WordNetLemmatizer; wnl = WordNetLemmatizer()

In [None]:
from wordcloud import WordCloud, STOPWORDS
# Word cloud of the lower-cased training tweets (before cleaning), with the
# wordcloud package's stopword set excluded.
wordcloud = WordCloud(
    width = 1600,
    height = 900,
    background_color = 'white',
    stopwords = STOPWORDS,
).generate(' '.join(train['text']).lower())
plt.figure(figsize = (12, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
def clean(text) :
    """Normalise a raw tweet into lower-case, letters-only, space-separated text.

    Steps, in order: drop stopwords and lemmatize the remaining
    whitespace-separated tokens, lower-case, strip @mentions, URLs and the
    ``&amp`` HTML entity, replace every run of non-letters with a space, then
    remove single-letter words and the retweet marker ``rt``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text with single spaces between words and no leading or
        trailing whitespace (may be empty).
    """
    # The stopword test is case-insensitive; the lemmatizer receives the token as written.
    text = ' '.join([wnl.lemmatize(k) for k in text.split() if k.lower() not in STOPWORDS])
    text = text.lower()
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'https?://\S*', ' ', text)
    text = re.sub(r'&amp', ' ', text)
    text = re.sub(r'[^A-Za-z]+', ' ', text)
    # From here on the text holds only letters and spaces, so \b marks word edges.
    # Using \b instead of consuming the surrounding whitespace lets adjacent
    # single-letter words ("a b c") all be removed in one pass.
    text = re.sub(r'\b[a-z]\b', ' ', text)
    # Remove the standalone retweet marker only; words that merely start with
    # "rt" are left intact.
    text = re.sub(r'\brt\b', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# tqdm.pandas() registers `progress_apply`, so the cleaning pass below shows a progress bar.
from tqdm.auto import tqdm; tqdm.pandas()
# NOTE: the `text` column is overwritten in place; the raw tweets are not kept after this cell.
train['text'] = train['text'].progress_apply(clean)
test['text'] = test['text'].progress_apply(clean)

In [None]:
# Inspect the training frame after the `text` column has been cleaned.
train

In [None]:
# Same word cloud as before, now drawn from the cleaned `text` column.
wordcloud = WordCloud(stopwords = STOPWORDS, background_color = 'white',
                      height = 900, width = 1600)
# generate() returns the WordCloud object itself, so `wordcloud` stays the renderable cloud.
wordcloud = wordcloud.generate(' '.join(train['text'].tolist()).lower())
plt.figure(figsize = (12, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Local path of the pretrained checkpoint handed to `from_pretrained` for both tokenizer and model.
model_name = '../input/huggingface-bert/bert-base-uncased'
# Number of tokens every tweet is padded or truncated to.
MAXLEN = 64
# Number of examples per batch in the tf.data pipeline.
BATCH = 16

In [None]:
# Tokenize the cleaned training tweets into fixed-length (MAXLEN) id / mask arrays.
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_tok = tokenizer.batch_encode_plus(train['text'].tolist(), max_length = MAXLEN,
                                        truncation = True, padding = 'max_length',
                                        add_special_tokens = True, return_tensors = 'np')
y_train = np.array(train.target.values)
dataset = tf.data.Dataset.from_tensor_slices((train_tok['input_ids'], train_tok['attention_mask'],
                                              y_train))
def map_func(input_ids, masks, label) :
    """Repack one (input_ids, mask, label) example as (features dict, label).

    The dict keys match the names of the Keras input layers the model is built with.
    """
    return {
        'input_ids' : input_ids,
        'attention_mask' : masks
    }, label
dataset = dataset.map(map_func)
# Shuffle exactly once, over the whole dataset. By default shuffle() reshuffles
# on every iteration, which would make take()/skip() below see a different order
# each pass and let validation examples leak into training. The buffer covers
# every example so the shuffle is uniform.
dataset = dataset.shuffle(len(y_train), reshuffle_each_iteration = False)
dataset = dataset.batch(batch_size = BATCH, drop_remainder = True)
split = 0.7
# `size` is the number of training *batches* (70% of the full batches).
size = int((train_tok.input_ids.shape[0] // BATCH) * split)
# Reshuffle only the order of the training batches between epochs.
train_ds = dataset.take(size).shuffle(size)
val_ds = dataset.skip(size)

Inspired by [Faressayah's Notebook](https://www.kaggle.com/faressayah/sentiment-model-with-tensorflow-transformers)

In [None]:
# Pretrained encoder wrapped in a small Keras classification head.
model = TFAutoModel.from_pretrained(model_name)
input_ids = tf.keras.layers.Input(shape = (MAXLEN, ), name='input_ids', dtype = 'int32')
att = tf.keras.layers.Input(shape = (MAXLEN, ), name = 'attention_mask', dtype = 'int32')
# First model output, presumably the per-token hidden states (batch, MAXLEN, hidden) - TODO confirm.
embed = model(input_ids, attention_mask = att)[0]
# Keep only the vector at sequence position 0 as the sentence representation.
embed = embed[:, 0, :]
x = tf.keras.layers.Dense(512, activation = 'relu')(embed)
y = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)
bert_model = tf.keras.Model(inputs = [input_ids, att], outputs = y)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-5)
loss = tf.keras.losses.BinaryCrossentropy()
# Named 'accuracy' so the history keys stay 'accuracy' / 'val_accuracy'.
acc = tf.keras.metrics.BinaryAccuracy(name = 'accuracy')
bert_model.compile(optimizer = optimizer, loss = loss, metrics = [acc])
# The datasets are already batched, so no batch_size is passed to fit().
history = bert_model.fit(train_ds,
                         validation_data = val_ds,
                         epochs = 10)

In [None]:
# Per-epoch training and validation curves recorded by fit().
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))

In [None]:
# Training vs. validation curves, one panel per metric.
plt.figure(figsize = (12, 7))
# Two panels -> two rows (a 3-row grid would leave the bottom third of the figure empty).
plt.subplot(2, 1, 1)
plt.plot(acc, label = 'train acc')
plt.plot(val_acc, label = 'val acc')
plt.title('Accuracy')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(loss, label = 'train loss')
plt.plot(val_loss, label = 'val loss')
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()
# Keep the titles and tick labels of the stacked panels from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Evaluate the trained model on the validation batches (returns the loss and the compiled metrics).
bert_model.evaluate(val_ds)

In [None]:
# Scratch check: argmax gives the index of the largest element (index 5 here).
a = np.asarray((1, 2, 3, 5, 7, 9))
a.argmax()

In [None]:
# Encode the cleaned test tweets to MAXLEN tokens, as TensorFlow tensors.
tok_test = tokenizer.batch_encode_plus(
    test['text'].tolist(),
    padding = 'max_length',
    truncation = True,
    max_length = MAXLEN,
    return_tensors = 'tf',
)
# Keep only the two inputs the Keras model declares, keyed by input-layer name.
test_ = {key : tok_test[key] for key in ('input_ids', 'attention_mask')}
pred = bert_model.predict(test_)
# Flatten the (n, 1) prediction array into a plain list of scores.
pred = list(pred.ravel())

In [None]:
# Turn each score into a hard label: 1 when strictly above 0.55, otherwise 0.
pred_int = [1 if score > 0.55 else 0 for score in pred]

In [None]:
# Load the sample submission file; its `target` column is overwritten with predictions later.
sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
sample

In [None]:
# Replace the placeholder targets with the thresholded 0/1 predictions.
sample['target'] = pred_int
sample

In [None]:
# Write the submission file to the working directory, without the index column.
sample.to_csv('submission.csv', index = False)

This submission scored 0.8099999.