In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

In [2]:
path_train = "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv"
path_test = "../input/covid-19-nlp-text-classification/Corona_NLP_test.csv"
df_train = pd.read_csv(path_train, encoding = 'latin1')
df_test = pd.read_csv(path_test, encoding = 'latin1')

In [3]:
df_train.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis = 1, inplace = True)
df_test.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis = 1, inplace = True)
df_train = df_train[(df_train.Sentiment == 'Extremely Positive') | (df_train.Sentiment == 'Positive') | (df_train.Sentiment == 'Extremely Negative') | (df_train.Sentiment == 'Negative')]
df_test = df_test[(df_test.Sentiment == 'Extremely Positive') | (df_test.Sentiment == 'Positive') | (df_test.Sentiment == 'Extremely Negative') | (df_test.Sentiment == 'Negative')]

In [4]:
def makediglabel(text):
    if text in ['Extremely Positive', 'Positive']:
        return 0.0
    elif text in ['Extremely Negative', 'Negative']:
        return 1.0
    else:
        return -1.0

In [5]:
stop_words = stopwords.words("english")
def clean_text(text):
    text = text.lower()
    text = re.sub("@\S+", " ", text)  
    text = re.sub("https*\S+", " ", text)
    text = re.sub("www\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub("\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)        
    text = re.sub('\s{2,}',' ', text)
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text

In [6]:
df_train.OriginalTweet = df_train.OriginalTweet.apply(clean_text)
df_test.OriginalTweet = df_test.OriginalTweet.apply(clean_text)

In [7]:
df_train.Sentiment = df_train.Sentiment.apply(makediglabel)
df_test.Sentiment = df_test.Sentiment.apply(makediglabel)

In [8]:
X_train = df_train.OriginalTweet
Y_train = df_train.Sentiment
X_test = df_test.OriginalTweet
Y_test = df_test.Sentiment

In [9]:
whole_text = ""
for i in X_train:
    whole_text += i + ' '
for i in X_test:
    whole_text += i + ' '
tokens = word_tokenize(whole_text)
lemmatizer = WordNetLemmatizer()
lemtok = [lemmatizer.lemmatize(t) for t in tokens]
dictionary = list(set(lemtok))

In [10]:
def convert(text):
    tok = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(t) for t in tok]
    res = [dictionary.index(i) for i in lem]
    return res

In [11]:
y_train = np.asarray(Y_train).astype("float32")
y_test = np.asarray(Y_test).astype("float32")

In [12]:
X_train = X_train.apply(convert)
X_test = X_test.apply(convert)

In [13]:
max_seq_len = 80
max_features = 30000

In [14]:
x_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen = max_seq_len)
x_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen = max_seq_len)

In [15]:
emb_dim = 128

model = keras.Sequential()
model.add(layers.Embedding(max_features, emb_dim))
model.add(layers.LSTM(128))
model.add(layers.Dense(1, activation = 'sigmoid'))

2022-01-28 13:26:16.373949: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 13:26:16.476534: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 13:26:16.477249: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-28 13:26:16.478450: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [16]:
model.compile(optimizer = "adam",
              loss = "binary_crossentropy",
              metrics = ["accuracy"])

history = model.fit(x_train,
                    y_train,
                    epochs = 5,
                    batch_size = 128,
                    validation_split = 0.2,
                    shuffle = True,
                    verbose = 1)

2022-01-28 13:26:19.076679: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5


2022-01-28 13:26:21.333153: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         3840000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,971,713
Trainable params: 3,971,713
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
score = model.evaluate(x_test, y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Test score: 0.5504116415977478
Test accuracy: 0.8373702168464661


In [19]:
#fully connected nn from previous task has got test accuracy circa 0.86
#LSTM suc... there's a room for tunning parameters