In [None]:
import pandas as pd

# Load the train / test CSVs; both files are read with latin1 encoding.
training_data = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='latin1',
)
testing_data = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    encoding='latin1',
)

In [None]:
# Human-readable names of the five sentiment classes.
# NOTE(review): this order differs from the sorted order in which sklearn's
# LabelEncoder assigns integer codes, so class_names[i] is presumably NOT the
# name of encoded class i -- confirm before indexing it with predictions.
class_names = [
    'Negative',
    'Positive',
    'Neutral',
    'Extremely Positive',
    'Extremely Negative',
]

In [None]:
# Preview the first rows of the training frame.
training_data.head()

In [None]:
# Preview the first rows of the test frame.
testing_data.head()

In [None]:
# Summary statistics of the training frame (pandas describe()).
training_data.describe()

In [None]:
import tensorflow as tf
from sklearn import preprocessing

# Encode the string sentiment labels as integer class ids (in place, in the
# 'Sentiment' column of both frames).
#
# The encoder is fitted on the training labels only. The test labels are then
# mapped with that same fitted encoder, so both splits are guaranteed to share
# one label -> id mapping. Re-fitting on the test frame would silently
# re-number the classes if its label set differed; with transform() an unseen
# label raises ValueError instead.
label_encoder = preprocessing.LabelEncoder()
training_data['Sentiment'] = label_encoder.fit_transform(training_data['Sentiment'])
testing_data['Sentiment'] = label_encoder.transform(testing_data['Sentiment'])



def preprocessing(raw):
    """Split a raw data frame into tweet texts and one-hot sentiment labels.

    Columns are addressed by position: column 4 is taken as the tweet text
    and column 5 as the (already integer-encoded) sentiment label.

    Args:
        raw: DataFrame with at least six columns laid out as described above.

    Returns:
        A ``(tweets, sentiments)`` pair: the values of column 4, and the
        values of column 5 converted to a one-hot matrix with 5 classes.
    """
    table = raw.values
    text_column, label_column = table[:, 4], table[:, 5]
    one_hot = tf.keras.utils.to_categorical(label_column, 5)
    return text_column, one_hot

# Extract (texts, one-hot labels) for both the training and the test frame.
tweets, sentiments = preprocessing(training_data)
tweets_test, sentiments_test = preprocessing(testing_data)

# Report the array shapes of each split, training first.
for texts, labels in ((tweets, sentiments), (tweets_test, sentiments_test)):
    print(texts.shape, labels.shape)


    

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 25% of the training tweets for validation; the fixed random_state
# keeps the split the same across runs.
(tweets_train, tweets_val,
 sentiments_train, sentiments_val) = train_test_split(
    tweets,
    sentiments,
    test_size=0.25,
    random_state=10,
)

In [None]:
# Text-model hyper-parameters.
vocab_size = 40_000   # tokenizer `num_words` and size of the embedding table
embedding_dim = 32    # width of each learned word vector
max_length = 280      # every token sequence is padded / truncated to this length

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training tweets only; words outside the
# vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets_train)


def _tokenize_and_pad(texts):
    """Return (integer sequences, padded matrix of width max_length) for texts."""
    sequences = tokenizer.texts_to_sequences(texts)
    return sequences, pad_sequences(sequences, maxlen=max_length)


# Apply the same fitted tokenizer to every split.
train_sequences, train_padded = _tokenize_and_pad(tweets_train)
val_sequences, val_padded = _tokenize_and_pad(tweets_val)
test_sequences, test_padded = _tokenize_and_pad(tweets_test)

In [None]:
# Sentiment classifier: embedding -> bidirectional LSTM -> dense -> 5-way softmax.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# An additional stacked Bidirectional LSTM(64, return_sequences=True) was tried
# at this position and is currently disabled.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
# One output unit per sentiment class.
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Step size handed to the Adam optimizer when the model is compiled.
learning_rate = 1e-4

In [None]:
from tensorflow.keras.optimizers import Adam

# Configure training: categorical cross-entropy matches the one-hot labels
# and the softmax output layer.
# `learning_rate=` is the supported keyword for the optimizer step size; the
# old `lr=` alias is deprecated and is rejected by newer Keras releases.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=learning_rate),
    metrics=['accuracy']
)

In [None]:
import tensorflow as tf

In [None]:
# Quick sanity check: show the Python types of one padded input row and one
# one-hot label row (note: the input row comes from the training split, the
# label row from the validation split).
print(type(train_padded[0]), type(sentiments_val[0]))

In [None]:
# Stop training once the monitored metric (the callback's default) has not
# improved for 10 consecutive epochs, and roll the model back to its best weights.
call_back = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train for at most 50 epochs in mini-batches of 32, evaluating on the
# held-out validation split; `call_back` may end the run early.
fit_options = dict(
    batch_size=32,
    epochs=50,
    verbose=1,
    callbacks=[call_back],
    validation_data=(val_padded, sentiments_val),
)
history = model.fit(train_padded, sentiments_train, **fit_options)

In [None]:
# Dump the raw per-epoch metric lists recorded during training.
print(history.history)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

In [None]:
# Per-epoch metrics recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Accuracy curves on their own figure. Each line is labelled and a legend is
# drawn so the training and validation curves can be told apart.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, label='training')
ax_acc.plot(epochs, val_acc, label='validation')
ax_acc.set(title='Training & validation on accuracy', xlabel='epoch', ylabel='accuracy')
ax_acc.legend()

# Loss curves on a second, separate figure.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, label='training')
ax_loss.plot(epochs, val_loss, label='validation')
ax_loss.set(title='Training & validation loss', xlabel='epoch', ylabel='loss')
ax_loss.legend()

# Render both figures without echoing the repr of the last expression.
plt.show()

In [None]:


# Final check on the untouched test split; evaluate() returns the loss
# followed by the compiled metrics (here: accuracy).
score = model.evaluate(test_padded, sentiments_test)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)