In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns
import nltk
import tqdm
import re

In [None]:
# Load the training split of the COVID-19 tweet sentiment dataset into a DataFrame.
# NOTE(review): no `encoding` is passed, so pandas' default is used; if this raises
# UnicodeDecodeError the file is presumably not UTF-8 (try encoding="latin-1") — confirm.
df = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv")

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
# Raw tweet text column, extracted with `.values`; cleaned later by `standardization`.
tweets = df["OriginalTweet"].values

In [None]:
# Sentiment label column, extracted with `.values`; row-aligned with `tweets`
# because both come from the same DataFrame.
sentiments = df["Sentiment"].values

In [None]:
sentiments[:50]

In [None]:
# English stop words used by `standardization` to filter tokens.
# Stored as a set so the per-token membership test is O(1) instead of a linear
# scan over a list. If the corpus is not installed locally, fetch it once and retry.
try:
    stop_words = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def standardization(text):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps, in order:
      1. Remove URLs, @mentions, #hashtags and HTML tags.
      2. Replace every remaining non-letter character with a space.
      3. Lower-case and split on whitespace.
      4. Drop tokens found in the module-level ``stop_words`` and stem the
         rest with the module-level Porter stemmer ``ps``.

    The structural removals (step 1) must run BEFORE the letters-only filter:
    that filter strips ``:/@#<>``, after which none of the URL / mention /
    hashtag / tag patterns can match what they are meant to match.

    Parameters
    ----------
    text : any
        The raw tweet; coerced with ``str()``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (may be empty).
    """
    text = str(text)
    #remove urls
    text = re.sub(r'http\S+', " ", text)
    #remove mentions
    text = re.sub(r'@\w+', ' ', text)
    #remove hashtags
    text = re.sub(r'#\w+', ' ', text)
    #remove html tags (the raw-string prefix belongs outside the quotes)
    text = re.sub(r'<.*?>', ' ', text)
    #keep letters only; everything else becomes a separator
    text = re.sub("[^a-zA-Z]", " ", text)
    #Lowering the tweets
    text = text.lower()
    #Converting into a list, removing the stopwords and stemming what is left
    tokens = [ps.stem(word) for word in text.split() if word not in stop_words]
    #Joining the list
    return " ".join(tokens)

In [None]:
# Clean every tweet, showing a progress bar over the whole corpus.
X = [standardization(str(tweet)) for tweet in tqdm.tqdm(tweets)]

In [None]:
X[10]

In [None]:
type(X)

In [None]:
# Build the word index from the cleaned tweets. `num_words=5000` caps the
# vocabulary used when texts are later converted to sequences; words outside
# it are mapped to the '<OOV>' token.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=5000,
    lower=True,
)
tokenizer.fit_on_texts(X)

In [None]:
X[10]

In [None]:
len(X)

In [None]:
# Map each cleaned tweet to its list of integer token ids.
sequences = tokenizer.texts_to_sequences(X)
# Pad/truncate every sequence to length 66. No `padding`/`truncating` arguments
# are passed, so the library defaults apply.
# NOTE: this rebinds `X` from the list of cleaned strings to the padded array,
# so the tokenizer cells above cannot be re-run after this one without first
# rebuilding `X`.
X = pad_sequences(sequences, maxlen=66)

In [None]:
sequences[4]

In [None]:
X.shape

In [None]:
X[5]

In [None]:
def oneHot(s):
    """Map a sentiment label to its ordinal id (0 = most negative, 4 = most positive).

    Despite the name, this returns a single integer, not a one-hot vector.
    Labels outside the five known values yield ``None``.
    """
    label_ids = {
        'Extremely Negative': 0,
        'Negative': 1,
        'Neutral': 2,
        'Positive': 3,
        'Extremely Positive': 4,
    }
    return label_ids.get(s)
    
# Ordinal id for every row's sentiment label.
# NOTE: the following cell reassigns `y` from a LabelEncoder, so this list is
# not the target the model is trained on.
y = [oneHot(s) for s in sentiments]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment strings as integer class ids; this replaces any earlier `y`.
# NOTE(review): the ids are whatever LabelEncoder assigns (inspect `le.classes_`),
# which presumably differs from the ordering used by `oneHot` — confirm.
le = LabelEncoder()
y = le.fit_transform(df['Sentiment'])

In [None]:
# Use the public `tensorflow.keras.utils` path: `keras.utils.np_utils` is a
# private module that no longer exists in newer Keras releases, and the rest of
# this notebook already imports from `tensorflow.keras`.
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids into 5 columns, to match the 5-unit
# softmax output and the categorical cross-entropy loss.
# NOTE: rebinds `y`; re-running this cell alone would encode an already
# one-hot array again.
y = to_categorical(y, num_classes=5)

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition, and therefore every metric below, is
# reproducible under Restart & Run All. Drawing the seed from np.random
# produced a different split on every run.
SPLIT_SEED = 42

# Hold out 33% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SPLIT_SEED, test_size=0.33
)

In [None]:
X_train.shape

In [None]:
X_train[0]

In [None]:
from tensorflow.keras import layers

# Model hyperparameters, kept in one place so the layer definitions below
# contain no magic numbers.
max_features = 10000  # Embedding input size; the tokenizer above caps ids with num_words=5000
embedding_dim = 25    # previously declared as 64 but never used; the layer was hard-coded to 25
lstm_units = 15
lstm_dropout = 0.5
num_classes = 5       # one output unit per sentiment class

# Embedding -> single LSTM -> softmax over the sentiment classes.
model_1 = tf.keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.LSTM(lstm_units, dropout=lstm_dropout),
    layers.Dense(num_classes, activation='softmax')
])

model_1.summary()

In [None]:
# Configure training: RMSprop on categorical cross-entropy, tracking accuracy.
model_1.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation accuracy has not improved for 5 epochs and roll the
# model back to the best weights seen.
esc = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

In [None]:
# Log which device each op is placed on (verbose; useful to confirm GPU use).
tf.debugging.set_log_device_placement(True)

# Everything except the training arrays themselves.
fit_options = dict(
    epochs=50,
    batch_size=256,
    validation_data=(X_test, y_test),
    callbacks=[esc],
)

with tf.device("GPU:0"):
    # Convert inside the device scope so the tensors are created under it.
    train_inputs = tf.convert_to_tensor(X_train)
    train_targets = tf.convert_to_tensor(y_train)
    history = model_1.fit(train_inputs, train_targets, **fit_options)

In [None]:
# Plot every series recorded in `history.history` against the epoch index.
# Title and axis labels are set so the figure stands alone when skimmed; the
# trailing semicolon suppresses the return-value repr in the cell output.
ax = sns.lineplot(data=history.history)
ax.set(title="Training history", xlabel="Epoch", ylabel="Metric value");