In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import re
import string
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**LOAD DATA**

In [None]:
# Find an encoding that can actually decode each CSV.
# Reading `f.encoding` on a file opened in text mode only reports the encoding
# open() picked by default; it does not look at the file's bytes. Instead, read
# the raw bytes and try candidate encodings in order, reporting the first that works.
file = ["../input/covid-19-nlp-text-classification/Corona_NLP_train.csv", "../input/covid-19-nlp-text-classification/Corona_NLP_test.csv"]
for single in file:
    with open(single, 'rb') as f:
        raw_bytes = f.read()
    # latin-1 maps every byte to a character, so it is the always-succeeding fallback
    for candidate in ('utf-8', 'latin-1'):
        try:
            raw_bytes.decode(candidate)
        except UnicodeDecodeError:
            continue
        print(single, '->', candidate)
        break

In [None]:
# Load the train and test CSVs into DataFrames; both are decoded as latin-1
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='latin-1')
test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding='latin-1')

**EDA + VISUALIZATIONS**

In [None]:
# Summary of the training data: columns, non-null counts and dtypes
train.info()

In [None]:
# Display the first rows of the training dataset
train.head()

In [None]:
# Summary of the testing data: columns, non-null counts and dtypes
test.info()

In [None]:
# Display the first rows of the testing dataset
test.head()

In [None]:
# Combine the training and testing data into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of `train` and `test` are both kept, so the same label would appear
# twice in `data`.
frames = [train, test]
data = pd.concat(frames, ignore_index=True)
data.info()

In [None]:
# Check for missing data: boolean mask of rows whose Location is null
missing = data['Location'].isnull()


# Show only the rows with a missing Location
print(data[missing])

In [None]:
# Fill every missing Location with the placeholder "Unknown"
data['Location'] = data['Location'].fillna("Unknown")

In [None]:
# Check again for missing data: select any rows whose Location is still null
data[data['Location'].isnull()]

In [None]:
# Check for duplicates: duplicated() marks rows that repeat an earlier row,
# then the mask is used to display those rows
dups = data.duplicated()
data[dups]

In [None]:
# Count how many distinct Location values occur in the data
loc_dist = data['Location'].unique()
print(len(loc_dist))

In [None]:
# Summary of the combined data (columns, non-null counts, dtypes)
data.info()

In [None]:
#sentiment Distribution

In [None]:
# Count the rows for each of the five sentiment labels, then fold the
# "Extremely ..." labels into their base polarity to get three totals.
pos, ext_pos, neut, neg, ext_neg = (
    int((data['Sentiment'] == label).sum())
    for label in (
        'Positive',
        'Extremely Positive',
        'Neutral',
        'Negative',
        'Extremely Negative',
    )
)

total_positive = pos + ext_pos
total_negative = neg + ext_neg

# Labels and values are kept in matching order for the plots that follow
tt_label = ["Total Positive", "Neutral", "Total Negative"]
tt = [total_positive, neut, total_negative]

In [None]:
# Bar chart of the three aggregated sentiment classes.
# Title and axis labels are set so the figure can be read on its own.
plt.bar(tt_label, tt)
plt.title("Tweets per aggregated sentiment class")
plt.xlabel("Sentiment class")
plt.ylabel("Number of tweets")
plt.show()

In [None]:
# Pie chart of the three aggregated sentiment classes, with the percentage on each wedge.
# A title is set so the figure can be read on its own.
plt.pie(tt, labels=tt_label, autopct='%1.1f%%')
plt.title("Share of tweets per aggregated sentiment class")
plt.show()

The charts above give a summary of the whole dataset, grouped into **3 classes**. 

The **Positive** column comprises the *Positive* and *Extremely Positive* sentiments, which account for **43.6%** of the total data. 

The **Negative** column, with **37.9%**, represents the total of the *Negative* and *Extremely Negative* sentiments, while the **Neutral** sentiments represent **18.5%** of the total data.

In [None]:
# Number of rows for each distinct Sentiment label (value_counts sorts by count, largest first)
sentiment_count = data['Sentiment'].value_counts()
sentiment_count

In [None]:
# Bar chart of the row count for every individual sentiment label.
# Title and axis labels are set so the figure can be read on its own.
plt.bar(sentiment_count.index,sentiment_count)
plt.title("Tweets per sentiment label")
plt.xlabel("Sentiment")
plt.ylabel("Number of tweets")
# Rotate the tick labels so the longer label names do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart of every individual sentiment label, with the percentage on each wedge.
# A title is set so the figure can be read on its own.
plt.pie(sentiment_count, labels=sentiment_count.index, autopct='%1.1f%%')
plt.title("Share of tweets per sentiment label")
plt.show()

The plots above show that Positive sentiments have the highest count at 27.5%, closely followed by Negative sentiments with 24.4% of the whole data. Neutral opinions sit in the middle with about 18.5% of the records. Extremely Positive and Extremely Negative sentiments account for 16.1% and 13.5% respectively.

In [None]:
# List the distinct values of the TweetAt column.
# TODO: time series - find the start and end date, and line-plot the frequency
# of positive and negative tweets over time
day_total = data['TweetAt'].unique()
print(day_total)

# MODELS

**Logistic Regression**

In [None]:
# Vectorizer: turn each tweet into a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer
# The token pattern \b\w+\b matches runs of one or more word characters
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# The vocabulary is learned from the training tweets only ...
train_matrix = vectorizer.fit_transform(train['OriginalTweet'])
# ... and the test tweets are encoded with that same vocabulary
test_matrix = vectorizer.transform(test['OriginalTweet'])

In [None]:
# Expose the count matrices under the feature names used by the logistic-regression cells
X_train1, X_test1 = train_matrix, test_matrix

In [None]:
from sklearn import preprocessing

# Encode the sentiment labels as integers.
# The encoder is fitted on the training labels only; the test labels are then
# mapped with that same fitted encoder, so both splits are guaranteed to share
# one label -> integer mapping (re-fitting on the test labels could produce a
# different mapping if the test set had a different set of labels).
le = preprocessing.LabelEncoder()

lab = train['Sentiment']
y_train1 = le.fit_transform(lab)
y_test1 = le.transform(test['Sentiment'])

In [None]:
# The model: logistic regression, allowed up to 10000 solver iterations
lr = LogisticRegression(max_iter=10000)

In [None]:
# Train the model on the training count matrix and the integer-encoded labels
lr.fit(X_train1,y_train1)

In [None]:
# Predict an integer-encoded label for every row of the test matrix
predictions1 = lr.predict(X_test1)


In [None]:
# Peek at the first ten predicted label codes
print(predictions1[:10])

In [None]:
# Show the results of the predictions as label names.
# The predicted integers are LabelEncoder codes, so they must be looked up in
# the encoder's own class list (le.classes_, where position i is the name for
# code i). Series.unique() returns labels in order of first appearance, which
# does not match the encoder's codes and would mislabel the predictions.
lab_names = le.classes_
lab_names[predictions1[:10]]

In [None]:
# Find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
# scikit-learn metrics take the true labels first and the predictions second;
# with this order the rows of the matrix are true classes and the columns are
# predicted classes.
confusion_matrix(y_test1, predictions1)

In [None]:
# Per-class precision / recall / F1. The true labels go first and the
# predictions second; swapping them would exchange precision and recall.
# target_names prints the sentiment names instead of the integer codes.
print(classification_report(y_test1, predictions1, target_names=le.classes_))

**LSTM**

In [None]:
# Get the text data: the OriginalTweet column of the combined data
corpus = data['OriginalTweet']
corpus

In [None]:
# One-hot encoding of the labels: one indicator column per distinct Sentiment value
sentiment = pd.get_dummies(data['Sentiment'])
print(sentiment)

In [None]:
# Sanity check: the texts and the one-hot labels should have the same number of rows
print(len(corpus))
print(len(sentiment))

In [None]:
# Word tokenization
from nltk.tokenize import word_tokenize

# Tokenize every tweet and collect all tokens, in order, into one flat list
all_words = [token for tweet in corpus for token in word_tokenize(tweet)]

In [None]:
# Collect each distinct token once (a set drops duplicates) and report how many there are
unique_words = set(all_words)
print(len(unique_words))

In [None]:

# Size of the integer space that one_hot hashes words into; the same value is
# used as the Embedding layer's input dimension when the model is built.
# It is derived from the distinct-token count rather than hard-coded, so it
# follows the data if the corpus changes.
vocab_length = len(unique_words)
# Encode every tweet as a list of integer word ids
embedded_sentences = [one_hot(sent, vocab_length) for sent in corpus]

In [None]:
# Count number of words
def word_count(sentence):
    """Return how many tokens word_tokenize produces for `sentence`."""
    return len(word_tokenize(sentence))


# The tweet with the most tokens, and its token count (used as the padded length)
longest_sentence = max(corpus, key=word_count)
length_long_sentence = word_count(longest_sentence)

In [None]:
# Pad the end of each encoded sentence with 0 so that they all have the same length
padded_sentences = pad_sequences(
    embedded_sentences,
    maxlen=length_long_sentence,
    padding='post',
)
print(padded_sentences)

In [None]:
# Number of padded sequences
len(padded_sentences)

In [None]:
# Divide the padded sequences and their one-hot labels into training and testing sets:
# 90% goes to training; random_state=42 makes the shuffle repeatable.

X_train,X_test, y_train, y_test = train_test_split(padded_sentences, sentiment, train_size=0.9, random_state=42)

In [None]:
# Build the model as a single stack of layers:
# word ids -> 20-dim embeddings -> LSTM emitting an output per time step ->
# per-step dense layer -> flatten -> softmax over the 5 sentiment classes
model = Sequential([
    Embedding(vocab_length, 20, input_length=length_long_sentence),
    LSTM(20, return_sequences=True),
    Dense(100, activation='relu'),
    Flatten(),
    Dense(5, activation='softmax'),
])

In [None]:
# Compile the model (Adam optimizer, categorical cross-entropy loss, accuracy metric)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model.summary()

In [None]:
# Train the model for 15 epochs of 200 steps (batches) each.
# NOTE(review): with steps_per_epoch set, one "epoch" is 200 batches rather than
# a full pass over X_train - confirm this is intended.

model.fit(X_train, y_train, epochs=15, steps_per_epoch=200, verbose=1)

In [None]:
# Evaluate model performance on the held-out split only.
# Scoring on the full dataset would include the rows the model was trained on
# and overstate the accuracy.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [None]:
# Make predictions: the model's output for every sample in the held-out set
predictions = model.predict(X_test)

In [None]:
# Store the predicted class index (position of the highest score) for the first 20 samples
first_twenty = predictions[:20]
pred_result = first_twenty.argmax(axis=1)

In [None]:
# Show the predicted sentiment names.
# The network was trained on the columns of the one-hot `sentiment` frame, so a
# predicted index i refers to column i of that frame; looking the index up in
# those column names guarantees the right name for each prediction.
print(sentiment.columns.to_numpy()[pred_result])