In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install tensorflow_hub
!pip install tensorflow_text
# !pip intall bert

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
# import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
# Report the versions of the two TensorFlow libraries in use.
for label, module in (("TensorFlow Version:", tf), ("Hub version: ", hub)):
    print(label, module.__version__)

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# TF Hub handle of the BERT encoder loaded by build_classifier_model().
# Per the handle name this is an uncased English "small BERT"
# (presumably L-4 = 4 layers, H-512 = hidden size 512, A-8 = 8 attention heads).
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
# TF Hub handle of the text preprocessing model whose output is fed to the encoder.
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [None]:
# Read both CSV files, using the UserName column as the index and decoding
# the text as ISO-8859-1.
df_train_src = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    index_col='UserName',
    encoding='ISO-8859-1',
)
df_test_src = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    index_col='UserName',
    encoding='ISO-8859-1',
)

# Stack the train rows on top of the test rows under a fresh 0..n-1 index,
# then discard the metadata columns that are not used further down.
df = pd.concat(
    [df_train_src, df_test_src], sort=False, axis=0, ignore_index=True
).drop(columns=['ScreenName', 'TweetAt', 'Location'])
df.head()

In [None]:
# Fold each "Extremely ..." label into its base class; every other label is left untouched.
df['Sentiment'] = df['Sentiment'].replace({
    'Extremely Positive': 'Positive',
    'Extremely Negative': 'Negative',
})

In [None]:
# `df` was built with ignore_index=True, so its index is a fresh 0..n-1 range
# and no longer carries the UserName values stored in df_train_src.index /
# df_test_src.index. Matching on those indexes would select rows unrelated to
# their source file (and can leak training rows into the test set), so split
# by position instead: pd.concat placed the train rows first, the test rows after.
n_train_rows = len(df_train_src)
df_train = df.iloc[:n_train_rows]
df_test = df.iloc[n_train_rows:]

# Fail loudly if the combined frame ever stops lining up with the source files.
assert len(df_test) == len(df_test_src), "train/test split does not match the source files"

In [None]:
def build_classifier_model():
    """Build and compile the BERT-based 3-class text classifier.

    The model accepts a batch of raw strings, passes them through the TF Hub
    preprocessing layer (``tfhub_handle_preprocess``) and the trainable TF Hub
    encoder (``tfhub_handle_encoder``), then applies dropout and a 3-unit
    softmax layer to the encoder's ``pooled_output``.

    Returns:
        tf.keras.Model: the compiled model. ``model.summary()`` is printed as
        a side effect.
    """
    text_input = tf.keras.layers.Input(
        shape=(), dtype=tf.string, name='text')

    preprocessing_layer = hub.KerasLayer(
        tfhub_handle_preprocess, name='preprocessing')

    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(
        tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(
        3, activation='softmax', name='classifier')(net)
    model = tf.keras.Model(text_input, net)

    # The classifier layer already applies softmax, so the model emits
    # probabilities; the loss must therefore NOT treat its input as logits.
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    metric = tf.metrics.CategoricalAccuracy('accuracy')
    # NOTE(review): `decay` is an argument of the older Keras optimizers; newer
    # TensorFlow releases may reject it - confirm against the installed version.
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)
    model.compile(
        optimizer=optimizer, loss=loss, metrics=[metric])
    model.summary()
    return model

In [None]:
# Instantiate the compiled classifier (the builder also prints the model summary).
classifier_model = build_classifier_model()

In [None]:
# Hold out 30% of the training frame for validation; stratifying on the label
# keeps the class proportions the same in both parts.
train, valid = train_test_split(
    df_train, train_size=0.7, random_state=0, stratify=df_train['Sentiment'])
y_train, X_train = train['Sentiment'], train.drop(['Sentiment'], axis=1)
y_valid, X_valid = valid['Sentiment'], valid.drop(['Sentiment'], axis=1)


def _one_hot_sentiment(labels, classes=('Negative', 'Neutral', 'Positive')):
    """One-hot encode sentiment labels against a fixed class order.

    Deriving the codes from each split on its own makes the code-to-class
    mapping depend on which labels that split happens to contain; pinning the
    category list keeps column i tied to classes[i] for every split.

    Args:
        labels: sequence/Series of sentiment strings.
        classes: class names in column order.

    Returns:
        Array of shape (len(labels), len(classes)) as produced by
        tf.keras.utils.to_categorical.

    Raises:
        ValueError: if a label is missing or not one of ``classes``.
    """
    codes = pd.Categorical(labels, categories=list(classes)).codes
    unknown = codes < 0  # pandas marks missing/unlisted values with code -1
    if unknown.any():
        bad = sorted({str(value) for value in pd.Series(labels)[unknown].unique()})
        raise ValueError(f"Unexpected sentiment labels: {bad}")
    return tf.keras.utils.to_categorical(codes, num_classes=len(classes))


y_train_c = _one_hot_sentiment(y_train)
y_valid_c = _one_hot_sentiment(y_valid)

In [None]:
# Fine-tune for three epochs on the raw tweet text, validating on the held-out
# split; the returned History object is kept for the plots that follow.
train_texts = X_train['OriginalTweet'].values
valid_texts = X_valid['OriginalTweet'].values
history = classifier_model.fit(
    train_texts,
    y_train_c,
    validation_data=(valid_texts, y_valid_c),
    epochs=3,
)

In [None]:
sns.set()
sns.set_context("paper", font_scale=2)

# Side-by-side learning curves: accuracy on the left, loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

for ax, metric_name in ((ax1, 'accuracy'), (ax2, 'loss')):
    sns.lineplot(ax=ax, data=history.history[metric_name])
    sns.lineplot(ax=ax, data=history.history[f'val_{metric_name}'])
    ax.set(title=f"Model {metric_name}", xlabel="epoch", ylabel=metric_name)
    # The second curve is the validation split passed to fit(), not the test set.
    ax.legend(['train', 'validation'], loc='upper left')

fig.tight_layout()
# plt.show() does not take a figure argument; it renders the open figures.
plt.show()


In [None]:
# Turn the test labels into integer category codes, one-hot encode them into
# three columns, and score the trained model on the test tweets.
test_label_codes = df_test['Sentiment'].astype('category').cat.codes.values
y_test_c = tf.keras.utils.to_categorical(test_label_codes, num_classes=3)

e = classifier_model.evaluate(df_test['OriginalTweet'].values, y_test_c)

In [None]:
# `e` is the result of evaluate(); its second entry is the accuracy metric.
print(f"BERT Accuracy: {e[1]}")

# Integer-coded ground-truth labels for the test tweets.
y_true = df_test['Sentiment'].astype('category').cat.codes.values

# Per-class scores for every test tweet, reduced to the index of the top class.
y_proba_bert = classifier_model.predict(df_test['OriginalTweet'].values)
y_pred_bert = y_proba_bert.argmax(axis=1)

In [None]:
# Confusion matrix, with every cell shown as a share of all test tweets.
target_names = ['Negative', 'Neutral', 'Positive']
cf_matrix = confusion_matrix(y_true, y_pred_bert)
cf_share = cf_matrix / cf_matrix.sum()
sns.heatmap(
    cf_share, annot=True, fmt='.2%', cmap='Blues',
    xticklabels=target_names, yticklabels=target_names)

In [None]:
# Classification report for BERT: prints sklearn's per-class summary of the
# test predictions, with rows labelled by target_names.
print(classification_report(y_true, y_pred_bert, target_names=target_names))