In [23]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from transformers import TFAutoModel

In [24]:
# Location of the training CSV; the file is decoded as latin1 when read.
path_train = "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv"
df_train = pd.read_csv(
    path_train,
    encoding='latin1',
)

In [25]:
# Remove the metadata columns that are not used further in this notebook.
# A non-inplace drop with errors='ignore' means re-running this cell after the
# columns are already gone no longer raises KeyError.
df_train = df_train.drop(
    columns=['UserName', 'ScreenName', 'Location', 'TweetAt'],
    errors='ignore',
)

# Keep only rows whose Sentiment is one of the four polar labels; rows with any
# other label are discarded.
# .copy() yields an independent frame, so the later assignment to the Sentiment
# column does not write into a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
polar_sentiments = ['Extremely Positive', 'Positive', 'Extremely Negative', 'Negative']
df_train = df_train[df_train['Sentiment'].isin(polar_sentiments)].copy()

In [26]:
def makediglabel(text):
    """Translate a sentiment label into a numeric class code.

    Args:
        text: Sentiment label to translate.

    Returns:
        0.0 for 'Extremely Positive' / 'Positive',
        1.0 for 'Extremely Negative' / 'Negative',
        -1.0 for any other value.
    """
    code_table = (
        (0.0, ('Extremely Positive', 'Positive')),
        (1.0, ('Extremely Negative', 'Negative')),
    )
    for code, names in code_table:
        if text in names:
            return code
    # Not one of the known labels.
    return -1.0

In [27]:
# Replace the string sentiment labels with the numeric codes from makediglabel.
# Run once per freshly loaded frame: makediglabel returns -1.0 for anything that
# is not one of the original label strings, so a second run on the already
# converted column would turn every label into -1.0.
df_train['Sentiment'] = df_train['Sentiment'].apply(makediglabel)

In [28]:
# Inputs are the tweet texts, targets are the Sentiment column.
X_train, Y_train = df_train['OriginalTweet'], df_train['Sentiment']

In [29]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def tokenize(sequence, max_length=512):
    """Encode one text with the notebook-level `tokenizer`.

    The text is truncated or padded to exactly `max_length` tokens, special
    tokens are added, and token type ids are not requested.

    Args:
        sequence: Text to encode.
        max_length: Target length used for both truncation and padding.
            Defaults to 512, the value previously hard-coded here.

    Returns:
        A tuple `(input_ids, attention_mask)` taken from the tokenizer output.
        With return_tensors='tf' these are presumably TensorFlow tensors with a
        leading batch dimension of 1 — confirm against the installed
        transformers version.
    """
    # Calling the tokenizer object directly is the supported entry point;
    # `encode_plus` is the deprecated spelling of the same single-text call.
    tokens = tokenizer(
        sequence,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    return tokens['input_ids'], tokens['attention_mask']

In [None]:
num_of_elements = 5000

# Size the subset from the data instead of a hard-coded row count, and never
# ask for more rows than exist (the draw below is without replacement).
num_of_elements = min(num_of_elements, len(X_train))

# One row per sampled tweet, 512 columns to match the padded token length.
# int32 matches the dtype declared by the model's Input layers.
Xids = np.zeros((num_of_elements, 512), dtype='int32')
Xmask = np.zeros((num_of_elements, 512), dtype='int32')

# Seeded generator so the same subset is drawn on every run.
rng = np.random.default_rng(42)
# replace=False: each tweet is picked at most once, so the same example cannot
# end up in both the training part and the validation part of the subset.
idx = rng.choice(len(X_train), size=num_of_elements, replace=False)

# Positional indexing on plain arrays, independent of the DataFrame index.
small_dataset = np.array(X_train)[idx]
small_dataset_labels = np.array(Y_train)[idx]
labels = small_dataset_labels

In [None]:
# Encode each sampled tweet and store its token ids and attention mask in row i.
for i, sequence in enumerate(small_dataset):
    ids_row, mask_row = tokenize(sequence)
    Xids[i, :] = ids_row
    Xmask[i, :] = mask_row

In [None]:
# Load the pretrained TensorFlow model for 'bert-base-cased', the same checkpoint name used for the tokenizer.
bert = TFAutoModel.from_pretrained('bert-base-cased')

In [None]:
# Two int32 inputs of length 512: token ids and the matching attention mask.
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# Element [1] of the BERT main layer's output feeds the classifier head
# (presumably the pooled sentence representation — confirm for the installed
# transformers version).
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# Classifier head: one hidden ReLU layer, then a single sigmoid unit.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(x)
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the BERT encoder so only the Dense head is trained. The layer is
# referenced directly rather than as model.layers[2], which silently freezes
# the wrong layer if the layer ordering ever changes.
bert.bert.trainable = False

In [None]:
# Inspection cell: the result of the call is not assigned, so it is only shown as the cell output.
bert.bert(input_ids, attention_mask=mask)

In [None]:
# Binary classification set-up: sigmoid output scored with binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

# validation_split is the fraction of the data HELD OUT for validation.
# The previous value of 0.8 trained on only 20% of the samples and validated on
# the other 80%; 0.2 gives the usual 80% train / 20% validation split.
history = model.fit(
    [Xids, Xmask],
    labels,
    validation_split=0.2,
    batch_size=16,
    verbose=1,
    epochs=1,
)