In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installing

In [None]:
# ! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
# !pip install tweet-preprocessor
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

# **Library**

In [None]:
import datetime, os, random, re, nltk, tokenization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
sns.set_style('darkgrid')
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from sklearn.metrics import precision_score, accuracy_score, recall_score, classification_report
from keras.utils import to_categorical

# **Load Data**

In [None]:
# Labelled development data; it is split into train/validation further down.
# NOTE(review): both reads use pandas' default text decoding — confirm the CSVs
# decode cleanly, otherwise pass `encoding=` explicitly.
train_full = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv')
# Separate file that the notebook treats as unseen data for evaluation.
test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv')

In [None]:
train_full.head()

In [None]:
test.head()

In [None]:
print(f'Development data: {train_full.shape}')
print(f'Unseen data: {test.shape}')

# **Data Understanding**

In [None]:
# profile = ProfileReport(train_full, title = 'Corona Tweets Report', explorative = True)
# display(profile)

Train data has 41157 rows and 6 columns.

Columns explanation:
- UserName: Index of the user
- ScreenName: Index of the user's screen
- Location: country of the user
- TweetAt: Date of the tweets
- OriginalTweet: Contents of the tweets
- Sentiment: Sentiment of the user

Understanding about the data:
- Has two numerical and four categorical columns.
- UserName and ScreenName have unique values.
- Location has high cardinality and missing values.
- TweetAt is highly correlated with UserName and ScreenName
- Sentiment has only 5 unique values: Positive, Negative, Neutral, Extremely Positive, and Extremely Negative.

# **Exploratory Data Analysis**

Since it's EDA, I'll only use train data. First, I'm going to split train full data into 80% train data and 20% validation data.

In [None]:
# 80/20 split of the development frame, used for the EDA below.
# NOTE: this split is not stratified, whereas the tweet/label lists in the
# preprocessing section are split with `stratify = y`; do not assume the rows of
# `train`/`valid` line up with `X_train`/`X_valid` just because both use
# random_state = 100.
train, valid = train_test_split(train_full, test_size = 0.2, random_state = 100)

In [None]:
train.head()

In [None]:
valid.head()

In [None]:
print(f'Training data: {train.shape}')
print(f'Validation data: {valid.shape}')

To prevent the training data from being changed during EDA, I will copy it into a new dataframe.

In [None]:
df = train.copy()

In [None]:
df.head()

In [None]:
df.shape

## UserName and ScreenName Columns

In the data understanding section, I mentioned that the UserName and ScreenName columns have unique values. Plotting them would not give any insight, so I think it's better to ignore these columns. Also, during data preprocessing, it's better to drop these columns since they're not useful for data analysis.

## Location

Since Location has high cardinality, let's check the unique values.

In [None]:
df['Location'].nunique()

In [None]:
df['Location'].value_counts(dropna = False)[:20]

We could see the values are inconsistent. Let's fix this!

In [None]:
# Keep only the text before the first comma, then map the spelling variants of the
# United Kingdom and the United States onto one canonical name each.
location_aliases = {
    'UK': 'United Kingdom',
    'USA': 'United States',
    'US': 'United States',
    'The United States of America': 'United States',
    'United States of America': 'United States',
    'America': 'United States',
    'United States ': 'United States',
}
df['Location'] = df['Location'].str.split(',').str[0].replace(location_aliases)

In [None]:
df['Location'].nunique()

In [None]:
df['Location'].value_counts(dropna = False)[:20]

As we can see, Location contains missing values. Dropping the rows that contain missing data could affect the rest of the EDA, so rather than taking that risk, let's copy the data into a new dataframe and drop the rows there.

In [None]:
df2 = df.copy()

In [None]:
print(f'Dataframe with missing values: {df.shape}')
print(f'Dataframe without missing values: {df2.shape}')

It's copied well since both dataframes have the same shape.

In [None]:
df2.dropna(axis = 0, inplace = True)

In [None]:
df2.head()

In [None]:
# Derive the expected row count from the data instead of a hard-coded number, so the
# check stays valid if the split or the cleaning above changes. `df2` was produced by
# dropping every row of a copy of `df` that has at least one missing value.
rows_with_missing = int(df.isna().any(axis = 1).sum())
print(f'Expected rows after dropping: {df.shape[0] - rows_with_missing}')
print(f'True rows after dropping: {df2.shape[0]}')

The rows were dropped correctly since the expected and actual row counts match.

In [None]:
df2['Location'].value_counts()[:50]

We could see that the location contains countries or cities. So let's plot top 10 countries and top 10 cities in different plots.

In [None]:
city = ['London', 'New York', 'Washington', 'Los Angeles', 'Toronto', 
        'Chicago', 'Sydney', 'San Francisco', 'Melbourne', 'New Delhi']

country = ['United States', 'United Kingdom', 'England', 'India', 'Australia', 
           'Canada', 'Scotland', 'Singapore', 'South Africa']

states = ['Texas', 'Florida', 'California', 'New Jersey']

In [None]:
# Sentiment counts for the hand-picked cities, one bar per sentiment class.
color = ['#F2B138', '#29AB87', '#C21807', '#0B6623', '#7C0A02']

fig, ax = plt.subplots(figsize = (20, 8))
ax.set_title('Top 10 Cities that has the most tweets', size = 20)
cplot = sns.countplot(x = 'Location', hue = 'Sentiment', data = df2,
                      order = city, palette = color, ax = ax)

# Write each bar's count just above the bar.
for bar in cplot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    cplot.annotate(f'{bar_height:.0f}',
                   (bar_centre, bar_height),
                   ha = 'center', va = 'center',
                   xytext = (0, 9),
                   textcoords = 'offset points')

ax.set_xlabel('City', fontsize = 18)
ax.set_ylabel('Count', fontsize = 18)
ax.tick_params(axis = 'x', labelsize = 14)
ax.legend(prop = {'size': 13})
plt.show()

We can conclude that the top 10 cities tend to have more positive sentiment, especially San Francisco, which has extremely positive sentiment. Only New Delhi has more neutral sentiment.

In [None]:
# Sentiment counts for the hand-picked countries, one bar per sentiment class.
color = ['#F2B138', '#29AB87', '#C21807', '#0B6623', '#7C0A02']

plt.figure(figsize = (20, 8))
# The number in the title is taken from the list itself: `country` holds nine
# entries, so the previous hard-coded "Top 10" did not match the chart.
plt.title(f'Top {len(country)} Countries that has the most tweets', size = 20)
cplot = sns.countplot(x = 'Location', hue = 'Sentiment', data = df2, order = country, palette = color)

# Write each bar's count just above the bar.
for p in cplot.patches:
    cplot.annotate(format(p.get_height(), '.0f'),
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha = 'center', va = 'center',
                   xytext = (0, 9),
                   textcoords = 'offset points')

plt.xlabel('Country', fontsize = 18)
plt.ylabel('Count', fontsize = 18)
plt.xticks(fontsize = 14)
plt.legend(prop = {'size': 13})
plt.show()

We can conclude that the top 9 countries tend to have more positive sentiment. Only India has more negative sentiment, and only Canada has more neutral sentiment.

In [None]:
# Sentiment counts for the hand-picked US states, one bar per sentiment class.
color = ['#F2B138', '#29AB87', '#C21807', '#0B6623', '#7C0A02']

plt.figure(figsize = (20, 8))
plt.title('Top 4 States that has the most tweets', size = 20)
cplot = sns.countplot(x = 'Location', hue = 'Sentiment', data = df2, order = states, palette = color)

# Write each bar's count just above the bar.
for p in cplot.patches:
    cplot.annotate(format(p.get_height(), '.0f'),
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha = 'center', va = 'center',
                   xytext = (0, 9),
                   textcoords = 'offset points')

# The x axis shows states here; the label was copied from the country plot.
plt.xlabel('State', fontsize = 18)
plt.ylabel('Count', fontsize = 18)
plt.xticks(fontsize = 14)
plt.legend(prop = {'size': 13})
plt.show()

# Sentiment

# Data Preprocessing

In [None]:
# Fixed, alphabetically sorted class order. Using one shared order means every
# frame gets the same integer code for the same sentiment, even when a class is
# missing from that frame.
_SENTIMENT_CLASSES = ['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']


def encoded_cat(df, categories = None):
    """Add an integer `Labels` column encoding the `Sentiment` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `Sentiment` column. It is modified in place.
    categories : list of str, optional
        Class names in code order. Defaults to the five sentiment classes in
        alphabetical order, which is the order `astype('category')` produces
        when all five are present.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, now with a `Labels` column. Values that are
        missing or not listed in `categories` are encoded as -1.
    """
    if categories is None:
        categories = _SENTIMENT_CLASSES
    df['Labels'] = pd.Categorical(df['Sentiment'], categories = categories).codes
    return df

In [None]:
train_full = encoded_cat(train_full)
train = encoded_cat(train)
valid = encoded_cat(valid)
test = encoded_cat(test)

In [None]:
train_full['Labels'].value_counts()

In [None]:
train['Labels'].value_counts()

In [None]:
valid['Labels'].value_counts()

In [None]:
test['Labels'].value_counts()

In [None]:
X, y = train_full['OriginalTweet'].to_list(), train_full['Labels'].to_list()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 100)

In [None]:
X_test, y_test = test['OriginalTweet'].to_list(), test['Labels'].to_list()

In [None]:
def convert_to_dfX(df):
    """Return the given tweet texts as a one-column DataFrame named 'OriginalTweet'."""
    return pd.DataFrame(df, columns = ['OriginalTweet'])

def convert_to_dfy(df):
    """Return the given label values as a one-column DataFrame named 'Labels'."""
    return pd.DataFrame(df, columns = ['Labels'])

In [None]:
df_X_train = convert_to_dfX(X_train)
df_X_valid = convert_to_dfX(X_valid)
df_X_test = convert_to_dfX(X_test)

In [None]:
df_X_train.head()

In [None]:
df_X_valid.head()

In [None]:
df_X_test.head()

In [None]:
df_y_train = convert_to_dfy(y_train)
df_y_valid = convert_to_dfy(y_valid)
df_y_test = convert_to_dfy(y_test)

In [None]:
df_y_train.head()

In [None]:
df_y_valid.head()

In [None]:
df_y_test.head()

In [None]:
print('Training Data')
print(X_train[:15])
print(y_train[:15])
print('\nValidation Data')
print(X_valid[:15])
print(y_valid[:15])
print('\nUnseen Data')
print(X_test[:15])
print(y_test[:15])

In [None]:
# English stop words used by `clean`. The corpus is downloaded separately from the
# nltk package, so fetch it on first use instead of failing with LookupError on a
# fresh environment.
try:
    stopword = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords", quiet = True)
    stopword = nltk.corpus.stopwords.words("english")

In [None]:
def clean(text, stopwords = None):
    """Normalise a raw tweet into lowercase, space-separated content words.

    URLs, @mentions, #hashtags, digits, HTML tags, HTML entities and every
    character that is not an ASCII letter or a space are replaced by spaces;
    the result is whitespace-collapsed, lowercased and stripped of stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stopwords : collection of str, optional
        Lowercase words to drop. Defaults to the notebook-level `stopword` list.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if stopwords is None:
        # Fall back to the notebook-level stop-word list.
        stopwords = stopword

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove mentions
    text = re.sub(r'@\w+', ' ', text)
    # Remove hashtags
    text = re.sub(r'#\w+', ' ', text)
    # Remove digits
    text = re.sub(r'\d+', ' ', text)
    # Remove HTML tags. The raw-string prefix used to sit inside the quotes
    # ('r<.*?>'), so the pattern only matched a literal "r" followed by a tag.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove HTML entities such as "&amp"
    text = re.sub(r'&[A-Za-z0-9]+', ' ', text)

    # Replace everything that is not an ASCII letter or a space
    text = re.sub(r'[^a-zA-Z ]', ' ', text)

    # Collapse runs of spaces and trim both ends
    text = re.sub(r'( +)', ' ', text)
    text = text.strip()

    # Lowercase before the stop-word comparison
    text = text.lower()

    # Drop stop words
    return " ".join(word for word in text.split() if word not in stopwords)

In [None]:
X_train_copy = df_X_train.copy()
X_valid_copy = df_X_valid.copy()
X_test_copy = df_X_test.copy()

In [None]:
X_train_copy.head()

In [None]:
# Run every tweet of each split through `clean`, keeping the results as plain lists.
new_clean_X_train = [clean(tweet) for tweet in X_train_copy['OriginalTweet']]
new_clean_X_valid = [clean(tweet) for tweet in X_valid_copy['OriginalTweet']]
new_clean_X_test = [clean(tweet) for tweet in X_test_copy['OriginalTweet']]

In [None]:
# Show a few cleaned tweets next to the labels of the same rows. The label slices
# now use the same positions as the text slices; previously they did not.
print('Training Data')
print(new_clean_X_train[16:20])
print(y_train[16:20])
print('\nValidation Data')
print(new_clean_X_valid[16:20])
print(y_valid[16:20])
print('\nUnseen Data')
print(new_clean_X_test[16:20])
print(y_test[16:20])

# Baseline Model

In [None]:
# Baseline: bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
baseline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(baseline_steps)

text_clf.fit(new_clean_X_train, y_train)

In [None]:
y_pred = text_clf.predict(new_clean_X_test)
print(classification_report(y_test, y_pred))

In [None]:
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

# Modelling

In [None]:
# Checkpoint name for the RoBERTa experiment (its model cells are commented out below).
MODEL_NAME = 'roberta-base'

# NOTE: `tokenizer` is rebound to the BERT FullTokenizer a few cells further down,
# so this RoBERTa tokenizer is not the one that ends up being passed to bert_encode.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
new_clean_X_train[:5]

In [None]:
# One-hot targets for the BERT classifier.
# They are built from `y_train`, the integer labels that came out of the same
# stratified split as `new_clean_X_train`, so each target row belongs to the tweet
# at the same position. The previous version encoded `train['Sentiment']`, which
# comes from a separate, unstratified split of the dataframe.
label = preprocessing.LabelEncoder()
label.fit(train_full['Sentiment'])  # kept so later cells can still use `label`
y_train_categorical = to_categorical(y_train, num_classes = len(label.classes_))
print(y_train_categorical[:5])

In [None]:
# One-hot validation targets, built from `y_valid` so they are row-aligned with
# `new_clean_X_valid` (both come from the same stratified split). The class count
# is fixed at 5 so the width matches the model's 5-unit softmax output. The
# validation labels are no longer used to re-fit an encoder.
y_valid_categorical = to_categorical(y_valid, num_classes = 5)
print(y_valid_categorical[:5])

In [None]:
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable = True)

In [None]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn texts into fixed-length BERT inputs.

    Each text is tokenized, truncated to leave room for the two special tokens,
    wrapped in "[CLS]" ... "[SEP]", converted to ids and right-padded with 0
    up to `max_len`.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide `tokenize(text)` and `convert_tokens_to_ids(tokens)`.
    max_len : int
        Length of every encoded row.

    Returns
    -------
    tuple of np.ndarray
        (token ids, attention masks, segment ids). Masks are 1 on real tokens
        and 0 on padding; segment ids are all 0.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]

    for raw_text in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw_text)[:content_budget], "[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
# Every tweet is encoded to this many token ids (including [CLS] and [SEP]).
max_len = 250

X_train_encoded = bert_encode(new_clean_X_train, tokenizer, max_len)
X_valid_encoded = bert_encode(new_clean_X_valid, tokenizer, max_len)
X_test_encoded = bert_encode(new_clean_X_test, tokenizer, max_len)

# Full development set = training rows followed by validation rows. It is assembled
# from the arrays above instead of tokenizing the training split a second time
# (which is all the previous assignment did). Labels in the matching order are
# `y_train + y_valid`.
X_train_full_encoded = tuple(
    np.concatenate([train_part, valid_part])
    for train_part, valid_part in zip(X_train_encoded, X_valid_encoded)
)

In [None]:
def build_model(bert_layer, max_len=512, num_classes=5, learning_rate=2e-5):
    """Build and compile a BERT-based text classifier.

    Parameters
    ----------
    bert_layer : callable Keras layer
        Called with [word ids, mask, segment ids]. Assumed to return a
        (pooled_output, sequence_output) pair, as the unpacking below requires.
    max_len : int
        Length of each of the three integer input sequences.
    num_classes : int
        Number of softmax output units (default 5).
    learning_rate : float
        Adam learning rate (default 2e-5).

    Returns
    -------
    tf.keras.Model
        Model compiled with categorical cross-entropy, so it expects one-hot
        targets with `num_classes` columns.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Representation at position 0, i.e. the slot where bert_encode puts "[CLS]".
    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(32, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(16, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(lay)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

In [None]:
# Callbacks: write the best model seen so far to 'my_model.h5', and stop training
# once there has been no improvement for 10 epochs.
checkpoint = tf.keras.callbacks.ModelCheckpoint('my_model.h5', save_best_only = True)
earlystopping = tf.keras.callbacks.EarlyStopping(patience = 10)

fit_options = dict(
    validation_data = (X_valid_encoded, y_valid_categorical),
    epochs = 20,
    callbacks = [checkpoint, earlystopping],
    batch_size = 16,
)
history = model.fit(X_train_encoded, y_train_categorical, **fit_options)

In [None]:
# def convert_sentence_to_features(dataset):
#     return tokenizer(
#         dataset,
#         add_special_tokens = True,
#         return_attention_mask = True, # roberta doesn't need attention mask
#         truncation = True,
#         padding = True)

In [None]:
# X_train_encoded = convert_sentence_to_features(new_clean_X_train)
# X_valid_encoded = convert_sentence_to_features(new_clean_X_valid)
# X_test_encoded = convert_sentence_to_features(new_clean_X_test)

In [None]:
# X_train_encoded.keys()

In [None]:
# len(X_train_encoded['input_ids'])

In [None]:
# len(y_train)

In [None]:
# def build_model(bert_layer, max_len=512):
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
#     segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

#     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
#     clf_output = sequence_output[:, 0, :]
#     out = Dense(1, activation='sigmoid')(clf_output)
    
#     model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
#     model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [None]:
# def tensor_slices(X, y):
#     return tf.data.Dataset.from_tensor_slices((dict(X), y))

In [None]:
# batch_size = 32
# shuffle_buffer_size = 17000

# train_encoded = tensor_slices(X_train_encoded, y_train).shuffle(shuffle_buffer_size).batch(batch_size)
# valid_encoded = tensor_slices(X_valid_encoded, y_valid).batch(batch_size)
# test_encoded = tensor_slices(X_test_encoded, y_test).batch(batch_size)

In [None]:
# Disabled: `train_encoded` is only created in the commented-out tf.data cell of the
# RoBERTa experiment, so on a fresh kernel this loop raised NameError. It also
# rebound `label`, the LabelEncoder created in the modelling section. Re-enable it
# together with that pipeline.
# for message, label in train_encoded.take(1):
#     print(message, label)

In [None]:
# Disabled: `valid_encoded` is only created in the commented-out tf.data cell of the
# RoBERTa experiment, so on a fresh kernel this loop raised NameError. It also
# rebound `label`, the LabelEncoder created in the modelling section. Re-enable it
# together with that pipeline.
# for message, label in valid_encoded.take(1):
#     print(message, label)

In [None]:
# Disabled: `test_encoded` is only created in the commented-out tf.data cell of the
# RoBERTa experiment, so on a fresh kernel this loop raised NameError. It also
# rebound `label`, the LabelEncoder created in the modelling section. Re-enable it
# together with that pipeline.
# for message, label in test_encoded.take(1):
#     print(message, label)

In [None]:
# def build_roberta_model(learning_rate = 1e-5):
#     roberta_model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, 
#                                                                        num_labels = 5, 
#                                                                        num_hidden_layers = 10)
    
#     optimizer = keras.optimizers.Adam(learning_rate = learning_rate, epsilon = 1e-8)
#     roberta_model.compile(loss = 'sparse_categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
#     return roberta_model

In [None]:
# roberta_model = build_roberta_model()
# roberta_model.summary()

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

# Callbacks prepared for the RoBERTa run; the `fit` call that would consume them
# is currently commented out, so nothing in this cell triggers training.
logdir = os.path.join('logs', 'my_baseline_model')  # TensorBoard logs go under logs/my_baseline_model
tensorboard_cb = tf.keras.callbacks.TensorBoard(logdir, histogram_freq = 1)
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_baseline_model.h5", save_best_only = True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10)

In [None]:
# history = roberta_model.fit(train_encoded, epochs = 25,
#                                   validation_data = valid_encoded,
#                                   callbacks = [checkpoint_cb, early_stopping_cb, tensorboard_cb])

In [None]:
# %tensorboard --logdir logs

In [None]:
# y_pred = text_clf.predict(X_test)
# print(classification_report(y_test, y_pred))

In [None]:
# print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
# print(f"Precision Score: {precision_score(y_test, y_pred)}")
# print(f"Recall Score: {recall_score(y_test, y_pred)}")