In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: run the NVIDIA driver utility to report the GPU(s) visible to this session.
! nvidia-smi

In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')

import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
# from tensorflow.keras.callbacks import TensorBoard

In [None]:
# Read the training CSV, decoding it as ISO-8859-1, and preview the first rows.
train = pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='iso-8859-1',
)
train.head()

In [None]:
# Read the test CSV, decoding it as ISO-8859-1, and preview the first rows.
test = pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    encoding='iso-8859-1',
)
test.head()

In [None]:
# Separate each frame into the tweet text column (model input) and the sentiment
# column (target).
X_train, y_train = train['OriginalTweet'], train['Sentiment']

X_test, y_test = test['OriginalTweet'], test['Sentiment']

In [None]:
# Preprocessing: replace every non-letter with a space, lower-case, tokenize,
# drop English stop words, stem the remaining tokens and re-join them into one string.
corpus = []
stem = PorterStemmer()
# Build the stop-word lookup once, as a set: evaluating stopwords.words('english')
# for every token repeats the same work and makes each membership test a list scan.
stop_words = set(stopwords.words('english'))
for tweet in X_train:
    # The class must be 'A-Z', not 'A-z': the range A-z also covers the punctuation
    # [ \ ] ^ _ ` that sits between 'Z' and 'a' in ASCII, so those were never removed.
    text = re.sub('[^a-zA-Z]', ' ', tweet).lower()
    # Stop words are matched against the un-stemmed token; only survivors are stemmed.
    tokens = [stem.stem(word) for word in word_tokenize(text) if word not in stop_words]
    corpus.append(' '.join(tokens))

In [None]:
# Encoding parameters: vocabulary size handed to one_hot and the fixed sequence length.
voc_size = 10000
sentence_length = 20

# Turn each cleaned tweet into a list of integer word indices for the embedding layer.
one_hot_encoded = list(map(lambda doc: one_hot(doc, voc_size), corpus))

# Bring every sequence to sentence_length entries so the rows form a rectangular array.
padded = pad_sequences(one_hot_encoded, maxlen=sentence_length)

padded

In [None]:
# Keep 70% of the labelled data for fitting and hold out the rest for validation;
# the fixed random_state makes the split repeatable.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    padded,
    y_train,
    train_size=0.7,
    random_state=100,
)

In [None]:
# Label encoding for the dependent variable: sentiment strings -> integer ids -> one-hot rows.
# NOTE: the y_* names are rebound in place, so re-run the cells that define them
# before re-running this one.

enc = LabelEncoder()

# The encoder is fitted on the training labels only; the other splits reuse its mapping.
y_train_train = enc.fit_transform(y_train_train)

# Pin the one-hot width to the number of fitted classes so every split yields the same
# number of columns, even if its highest class id happens to be absent from that split.
n_classes = len(enc.classes_)
y_train_train = to_categorical(y_train_train, num_classes=n_classes)

y_train_test = enc.transform(y_train_test)
y_train_test = to_categorical(y_train_test, num_classes=n_classes)

y_test = enc.transform(y_test)
y_test = to_categorical(y_test, num_classes=n_classes)

# Show which integer id was assigned to which sentiment label.
list(enumerate(enc.classes_))

In [None]:
# Defining the model: embedding -> LSTM -> four ReLU dense layers (each followed by
# dropout) -> 5-way softmax output with L2 regularisation on its kernel.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=10, input_length=sentence_length))
# NOTE(review): 5000 LSTM units on a 10-dimensional embedding is unusually wide;
# confirm this size is intended.
model.add(LSTM(5000))
model.add(Dropout(0.25))

# Fully connected stack; every dense layer is followed by 25% dropout.
for units in (1000, 700, 500, 500):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.25))

reg = regularizers.l2(l2=0.01)
model.add(Dense(5, activation='softmax', kernel_regularizer=reg))

# metrics is passed as a list: that is the documented form, and newer Keras releases
# reject a bare string here.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Training the model

# tb_callback = TensorBoard(log_dir='/kaggle/input', histogram_freq=1)

# Fit for 10 epochs in mini-batches of 50, passing the held-out split as validation data.
model.fit(
    X_train_train,
    y_train_train,
    epochs=10,
    batch_size=50,
    validation_data=(X_train_test, y_train_test),
)

In [None]:
# Feature engineering for test data: clean each tweet (letters only, lower-case,
# tokenize, drop English stop words, stem), then hash and pad the result.
# NOTE: X_test is rebound to the padded array at the end of this cell, so re-run the
# cell that defines X_test before re-running this one.
corpus_test = []
stem = PorterStemmer()
# Build the stop-word lookup once, as a set: evaluating stopwords.words('english')
# for every token repeats the same work and makes each membership test a list scan.
stop_words = set(stopwords.words('english'))
for tweet in X_test:
    # The class must be 'A-Z', not 'A-z': the range A-z also covers the punctuation
    # [ \ ] ^ _ ` that sits between 'Z' and 'a' in ASCII, so those were never removed.
    text = re.sub('[^a-zA-Z]', ' ', tweet).lower()
    tokens = [stem.stem(word) for word in word_tokenize(text) if word not in stop_words]
    corpus_test.append(' '.join(tokens))

# voc_size and sentence_length are deliberately not redefined here: reusing the values
# set when the training tweets were encoded keeps the test matrix in line with the
# embedding's input_dim and input_length.
one_hot_encoded_test = [one_hot(doc, voc_size) for doc in corpus_test]

padded_test = pad_sequences(one_hot_encoded_test, maxlen=sentence_length)
X_test = padded_test

In [None]:
# Prediction of test data: take the highest-scoring class per row of the model output,
# recover the true class ids from the one-hot labels the same way, and print the report.
preds = list(np.argmax(model.predict(X_test), axis=1))
clf_report = classification_report(list(np.argmax(y_test, axis=1)), preds)
print(clf_report)