In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import nltk
import tqdm
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential
from tensorflow.keras.optimizers import Adam


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# 1.Data loading

In [None]:
# Read the train and test CSVs; both are decoded with the Latin-1 codec.
train_df = pd.read_csv(
    r'/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='latin_1',
)
test_df = pd.read_csv(
    r'/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    encoding='latin_1',
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Count missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Print the column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Count missing values per column in the test frame.
test_df.isnull().sum()

In [None]:
# Pull the tweet text and the sentiment label out of the training frame as
# independent copies, then preview the text column.
train_x = train_df['OriginalTweet'].copy()
train_y = train_df['Sentiment'].copy()
train_x.head()

# 2.Data cleaning

In [None]:
# Stored as a set: clean() tests every word of every tweet for membership, and a
# set lookup is constant-time whereas scanning the original list is linear.
stop = set(stopwords.words('english'))
def clean(text, stop_words=None):
    """Normalise a raw tweet into a lower-cased string without stop words.

    Steps, in order: straighten typographic apostrophes, expand English
    contractions, strip URLs, @mentions, #hashtags and HTML-style tags, then
    lower-case every remaining whitespace-separated word and drop the ones
    found in the stop-word collection.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a float ``NaN``
        standing in for a missing cell) is treated as an empty tweet.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to the module-level ``stop``.

    Returns
    -------
    str
        The surviving words joined by single spaces; ``''`` if none remain.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; treat missing values as empty.
        return ''
    if stop_words is None:
        stop_words = stop

    # Contractions typed with a curly apostrophe would otherwise slip past the
    # patterns below. '\x92' is the same character (Windows-1252 byte 0x92) as
    # it appears when the bytes were decoded as Latin-1.
    text = text.replace('\u2019', "'").replace('\x92', "'")

    # Order matters: the two irregular forms must be handled before the generic
    # "n't" rule, otherwise "won't" would become "wo not". Matching ignores case
    # so that "Won't" / "CAN'T" are expanded too (the result is lower-cased
    # at the end anyway).
    contractions = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)

    text = re.sub(r'http\S+', '', text)   # URLs
    text = re.sub(r'@\w+', '', text)      # @mentions
    text = re.sub(r'#\w+', '', text)      # #hashtags
    text = re.sub(r'<.*?>', '', text)     # HTML-style tags

    lowered = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered if word not in stop_words)
        
        

In [None]:
# Clean every training tweet and preview the result. (The former leading
# `train_x.head()` was dropped: only a cell's last expression is displayed, so
# it had no effect.)
train_x_cleaned = train_x.apply(clean)
train_x_cleaned.head()

In [None]:
# Show how many training rows carry each sentiment label.
train_y.value_counts()

In [None]:
def pretrain(sentiment):
    """Collapse a sentiment label into one of three integer classes.

    Returns 0 for "Positive" / "Extremely Positive", 1 for "Negative" /
    "Extremely Negative", and 2 for every other value.
    """
    if sentiment in ("Positive", "Extremely Positive"):
        return 0
    if sentiment in ("Negative", "Extremely Negative"):
        return 1
    return 2
    
# Encode from the untouched source column instead of from `train_y` itself, so
# re-running this cell gives the same result. Re-applying pretrain() to labels
# that are already integers would send every row to class 2.
train_y = train_df['Sentiment'].apply(pretrain)
train_y.value_counts(normalize=True)

# 3.Tokenize

In [None]:
# Build the vocabulary from the cleaned training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x_cleaned)
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1; 0 is the padding id
print('vocabulary size:{}'.format(vocab_size))

sequence = tokenizer.texts_to_sequences(train_x_cleaned)

# The padded length must be measured in tokens. The previous
# `train_x_cleaned.apply(len)` measured characters per tweet, which made every
# padded sequence several times longer than the longest real token sequence.
max_len = max(len(tokens) for tokens in sequence)
print('max length of input:{}'.format(max_len))

x = pad_sequences(sequence, maxlen=max_len, padding='post')
print('example x:{}'.format(x[4]))

# 4.Model

In [None]:
# Training hyper-parameters and model definition.
BATCH_SIZE = 64
EPOCHS = 4
NUM_CLASSES = 3  # pretrain() emits the labels 0, 1 and 2
embedding_dim=16
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    layers.Bidirectional(layers.GRU(256, return_sequences=True)),
    # Collapse the per-timestep GRU outputs into one vector per tweet.
    layers.GlobalMaxPool1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    # Each tweet has exactly one class, so the output must be a probability
    # distribution over the classes: softmax, not independent sigmoids. This is
    # what sparse_categorical_crossentropy expects when given probabilities.
    layers.Dense(NUM_CLASSES, activation='softmax')
])
model.summary()

In [None]:
# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Fit on the padded training sequences; validation_split=0.1 sets aside a tenth
# of the supplied samples for validation.
history = model.fit(
    x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

In [None]:
# Per-epoch metrics recorded by model.fit().
acc = history.history['accuracy']
loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']

# Use the number of recorded epochs for the x-axis so the plot stays valid even
# if EPOCHS was edited after training or training ended early.
epochs_run = range(len(loss))

# Left panel: training curves. Right panel: validation curves.
fig, axis = plt.subplots(1, 2, figsize=(16, 8))
# Each curve gets its own legend label (previously all four said 'loss').
axis[0].plot(epochs_run, loss, label='loss')
axis[0].plot(epochs_run, acc, label='accuracy')
axis[1].plot(epochs_run, val_loss, label='val_loss')
axis[1].plot(epochs_run, val_acc, label='val_accuracy')
axis[0].legend()
axis[1].legend()
axis[0].set_title('loss & accuracy')
axis[0].set_xlabel('epochs')

axis[1].set_title('val_loss & val_accuracy')
axis[1].set_xlabel('epochs')

plt.show()


# 5.Test results

In [None]:
# Push the test tweets through the same clean -> tokenize -> pad pipeline as
# the training tweets, reusing the tokenizer and max_len fitted on the training data.
test_tweets_cleaned = test_df['OriginalTweet'].copy().apply(clean)
test_sequences = tokenizer.texts_to_sequences(test_tweets_cleaned)
test_x = pad_sequences(test_sequences, max_len, padding='post')

# Encode the test labels with the same three-class mapping.
test_y = test_df['Sentiment'].copy().apply(pretrain)

In [None]:
# Score the test set, then keep the index of the highest-scoring class per row.
class_scores = model.predict(test_x)
y_pred = np.argmax(class_scores, axis=1)
print(y_pred)

In [None]:

# Tabulate true vs. predicted classes (rows = true, columns = predicted).
class_ids = [0, 1, 2]
cm = confusion_matrix(test_y, y_pred)
cm_df = pd.DataFrame(cm, index=class_ids, columns=class_ids)
print("Accuracy:", accuracy_score(test_y, y_pred))

# Draw the matrix as an annotated heatmap.
sns.set(font_scale=1.4, color_codes=True, palette="deep")
ax = sns.heatmap(cm_df, annot=True, annot_kws={"size": 16}, fmt="d", cmap="YlGnBu")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Value")
ax.set_ylabel("True Value")