In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV into `train`; the trailing expression previews its first rows.
train=pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv')
train.head()

In [None]:
# Read the test CSV into `test`; the trailing expression previews its first rows.
test=pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv')
test.head()

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# Show the distinct sentiment labels present in the training data.
train['Sentiment'].unique()

In [None]:
# Class balance of the training labels; normalize=True yields fractions rather than raw counts.
res=train['Sentiment'].value_counts(normalize=True)
res

In [None]:
# Bar chart of the class balance held in `res` (fraction of training tweets per label).
# An explicit figure/axes pair is used so the title and axis labels are attached to
# this plot rather than to whatever pyplot considers the "current" axes.
fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
sns.barplot(x=res.index, y=res.values, ax=ax)
ax.set(title='Sentiment distribution (train)', xlabel='Sentiment', ylabel='Share of tweets')
# Rotate the category labels so the longer sentiment names do not overlap.
ax.tick_params(axis='x', rotation=45);


In [None]:
! pip install Unidecode
! pip install wordninja

In [None]:
import re
import unidecode
import wordninja
import nltk
import string

In [None]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per tweet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def _split_compound_words(sentence):
    """Split run-together tokens (e.g. 'whatisthis') and drop one-letter pieces."""
    pieces = []
    for token in sentence.split():
        pieces.extend(part for part in wordninja.split(token) if len(part) > 1)
    return ' '.join(pieces)


def clean_text(text):
    """Normalise one tweet into a space-separated string of lower-case words.

    Steps, in order: lower-casing; removal of @handles, http(s) URLs and
    ``pic.`` links; transliteration to ASCII via unidecode; replacement of
    everything except letters, ``+`` and ``'`` with spaces; removal of isolated
    single letters and of punctuation; tokenisation; removal of English stop
    words and of tokens shorter than three characters; splitting of
    run-together words with wordninja; whitespace normalisation.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned tweet (may be empty).
    """
    text = text.lower()
    # remove twitter handles
    text = re.sub(r'@\S+', '', text)
    # remove urls
    text = re.sub(r'http\S+', '', text)
    # The dot is escaped on purpose: an unescaped '.' matches any character and
    # would delete ordinary words such as "picture" or "picking".
    text = re.sub(r'pic\.\S+', '', text)
    # transliterate non-ASCII characters to their closest ASCII form
    text = unidecode.unidecode(text)
    # keep only letters, '+' and apostrophes; everything else becomes a space
    text = re.sub(r"[^a-zA-Z+']", ' ', text)
    # drop isolated single letters (the appended space lets a trailing one match)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')
    text = "".join(ch for ch in text if ch not in string.punctuation)
    stop_words = _english_stopwords()
    tokens = nltk.tokenize.word_tokenize(text)
    text = " ".join(tok for tok in tokens if tok not in stop_words and len(tok) > 2)
    # split words like 'whatisthis' to 'what is this'
    text = _split_compound_words(text)
    # collapse repeated whitespace and trim both ends
    return re.sub(r"\s{2,}", " ", text).strip()

In [None]:
# Replace each training tweet with its cleaned form.
# NOTE(review): the column is overwritten in place, so the raw text is only
# recoverable by re-reading the CSV.
train["OriginalTweet"]=train["OriginalTweet"].apply(clean_text)

In [None]:
# Spot-check the tweet at row position 1 after cleaning.
train.iloc[1]['OriginalTweet']

In [None]:
# Replace each test tweet with its cleaned form (the column is overwritten in place).
test["OriginalTweet"]=test["OriginalTweet"].apply(clean_text)

In [None]:
def change_sen(sentiment):
    """Collapse a five-level sentiment label into one of three classes.

    Parameters
    ----------
    sentiment : str
        One of the original labels, e.g. "Extremely Positive" or "Negative".

    Returns
    -------
    str
        'positive' for "Positive" / "Extremely Positive", 'negative' for
        "Negative" / "Extremely Negative", and 'neutral' for anything else
        (the comparison is exact and case-sensitive).
    """
    if sentiment in ("Extremely Positive", "Positive"):
        return 'positive'
    if sentiment in ("Extremely Negative", "Negative"):
        return 'negative'
    # Fallback: "Neutral" and any unrecognised value end up here.
    return 'neutral'

In [None]:
# Collapse the original sentiment labels into three classes on both splits.
# Run once per fresh load: change_sen matches the original capitalised labels
# exactly, so already-collapsed values would all land in its fallback branch.
for frame in (train, test):
    frame['Sentiment'] = frame['Sentiment'].apply(change_sen)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit the label encoder on the training labels and replace them with their encoded values.
le=LabelEncoder()
train['Sentiment']=le.fit_transform(train['Sentiment'])

In [None]:
# Encode the test labels with the encoder already fitted on the training labels.
# Calling fit_transform here would refit the encoder on the test labels, so the
# label-to-integer mapping would no longer be guaranteed to match the training one.
test['Sentiment']=le.transform(test['Sentiment'])

In [None]:
# Number of distinct encoded classes in the training labels.
train['Sentiment'].nunique()

In [None]:
# Training inputs (tweet text) and targets (encoded sentiment).
X_train = train['OriginalTweet']
y_train=train["Sentiment"]

In [None]:
# Test inputs (tweet text) and targets (encoded sentiment).
X_test = test['OriginalTweet']
y_test=test["Sentiment"]

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
#  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding,LSTM
from tensorflow.keras.models import Model

In [None]:
 #Convert sentences to sequences
# Upper bound passed to the tokenizer as num_words.
MAX_VOCAB_SIZE = 30000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
# The vocabulary is learned from the training texts only ...
tokenizer.fit_on_texts(X_train)
# ... and the same fitted tokenizer then encodes both splits as integer sequences.
sequences_train = tokenizer.texts_to_sequences(X_train)
# sequences_valid = tokenizer.texts_to_sequences(X_valid)
sequences_test = tokenizer.texts_to_sequences(X_test)


In [None]:
# Get the word -> integer mapping built by the tokenizer and record its size as V.
# NOTE(review): word_index presumably holds every token seen while fitting, so V
# may be larger than MAX_VOCAB_SIZE — confirm against the Keras Tokenizer docs.
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens.' % V)


In [None]:
# Pad the training sequences to a common length so that we get an N x T matrix.
data_train = pad_sequences(sequences_train)
print('Shape of data train tensor:', data_train.shape)

# Sequence length T, taken from the padded training matrix; reused when padding the test data.
T = data_train.shape[1]

In [None]:
# data_valid = pad_sequences(sequences_valid, maxlen=T)
# print('Shape of data test tensor:', data_valid.shape)

In [None]:
# Pad the test sequences to the training length T so both matrices share one width.
data_test = pad_sequences(sequences_test, maxlen=T)
print('Shape of data test tensor:', data_test.shape)

In [None]:
# Create the model: embedding -> LSTM -> global max pooling -> dense classifier.

# We get to choose the embedding dimensionality
D = 16

# Hidden state dimensionality of the LSTM
M = 32

# Note: we want the embedding matrix to have size (V + 1) x D,
# because the first word index starts from 1 and not 0.
# Thus, if the final index of the embedding matrix is V,
# then it must actually have V + 1 rows.
# NOTE(review): the tokenizer was created with num_words=MAX_VOCAB_SIZE, so if V
# exceeds that value the extra embedding rows are presumably never indexed — confirm.

i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)
# return_sequences=True keeps the per-timestep outputs that the pooling layer reduces.
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(64,activation='relu')(x)
# Three softmax outputs, one per sentiment class (negative / neutral / positive).
x = Dense(3, activation='softmax')(x)

model = Model(i, x)

In [None]:
# The sparse loss is used because the targets are integer class codes, not one-hot vectors.
model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)


print('Training model...')
# NOTE(review): the test split doubles as validation data here, so the test
# metrics reported later do not come from a fully held-out set — consider
# carving a separate validation split out of the training data.
# NOTE(review): no random seed is set in the visible cells, so results may vary between runs.
r = model.fit(
  data_train,
  y_train,
  epochs=3,
  validation_data=(data_test, y_test)
)

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()

In [None]:
# Training vs. validation accuracy per epoch, taken from the fit history.
plt.plot(r.history['accuracy'], label='acc')
plt.plot(r.history['val_accuracy'], label='val_acc')
plt.legend()

In [None]:
# Loss and accuracy of the fitted model on the training data.
model.evaluate(data_train, y_train)

In [None]:
# model.evaluate(data_valid,y_valid)

In [None]:
# Loss and accuracy of the fitted model on the test data
# (the same data that was passed as validation_data during fitting).
model.evaluate(data_test,y_test)

In [None]:
# Model outputs for every test row; the final layer is a 3-unit softmax.
predictions= model.predict(data_test)

In [None]:
# Preview the first ten prediction rows instead of dumping the whole array.
predictions[:10]