In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer 
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

import keras
from keras.models import Sequential
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV


from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split


from keras.preprocessing.sequence import pad_sequences
import re
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Reading In and Exploratory analysis

### Reading the data in

In [None]:
# Load the train and test splits of the tweet dataset from the Kaggle input directory.
# NOTE(review): encoding='latin_1' is passed explicitly; presumably the CSVs are not valid UTF-8 — confirm.
train = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',encoding='latin_1')
test = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',encoding='latin_1')

### initial checks on the data set, missing data and null values

In [None]:
train.info()

In [None]:
# Bare last expression -> notebook renders the frame as a table instead of wrapped plain text.
train.head(20)

In [None]:
print(train.iloc[16])

All the fields except the Location field have no missing values.

The UserName and ScreenName columns have been anonymized for privacy so they are not useful to our analysis. 

In [None]:
# Drop the anonymized identifier columns.
# Rebinding with errors='ignore' keeps the cell safe to re-run: a second
# execution no longer raises KeyError because the columns are already gone.
train = train.drop(columns=['UserName', 'ScreenName'], errors='ignore')
test = test.drop(columns=['UserName', 'ScreenName'], errors='ignore')

In [None]:
# Show a short preview rather than rendering the entire frame.
train.head()

### Duplicates and Null Values

In [None]:
sns.heatmap(train.isnull())

In [None]:
sns.heatmap(test.isnull())

The null values are mostly in the Location variable, hence the Location variable will be used for EDA only and dropped before modelling begins. Duplicates will be dropped to reduce variance.

In [None]:
# Remove fully duplicated rows from both splits, modifying each frame in place.
for frame in (train, test):
    frame.drop_duplicates(inplace=True)


In [None]:
print('Train data shape: ',train.shape)
print('Test data shape: ',test.shape)


### Checking the distribution of the Dependent Variable

In [None]:
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(10,5)})
# Pass the column through the x= keyword: the first positional argument of
# countplot is `data` in current seaborn, so a positional Series is not
# interpreted as the variable to count.
sns.countplot(x='Sentiment', data=train)

In [None]:
train.Sentiment.value_counts()

The other variables will now be analysed in reference to the Target variable, starting with the Location. We will examine the top 20 locations; But before we do that we have to change the Extremely Positive and Extremely negative classes to positive and negative classes respectively using a string function to make future tasks easier 

I attempted devising an ordinal scale system to rank the sentiment but couldn't deal with the bugs within the time specified 

In [None]:
# Fold the two "Extremely ..." labels into their base class, for both splits.
for frame in (train, test):
    for extreme_label, base_label in (('Extremely Positive', 'Positive'),
                                      ('Extremely Negative', 'Negative')):
        frame['Sentiment'] = frame.Sentiment.str.replace(extreme_label, base_label)

In [None]:
# x= keyword is required: a positional Series is taken as `data` by current seaborn.
sns.countplot(x='Sentiment', data=train)

The Negative and Positive classes are fairly balanced, with the neutral variable with a much lower percentage. Undersampling or Oversampling techniques will be applied if the models to be built are skewed by this. 

The methodology that was used to determine the sentiment of the tweets wasn't published on the dataset page so certain assumptions cannot be made

----------------------------------------------------------------------------------------------------------------------

Now that the sentiment class has been simplified, we can begin analysing the other variables with respect to it: the Location, the Time and ultimately the Tweet corpus, which will require a lot more cleaning.

### Location Analysis

In [None]:
train['Location'].value_counts(dropna = False)[:40]

A large section of the location data is missing. The rest of the data doesn't follow a consistent pattern because Twitter allows a free-text location setting. So we start by making the Location field as consistent as possible by keeping only the part before the first comma, e.g. changing 'Nairobi, Kenya' to 'Nairobi'.

In [None]:
# Keep only the text before the first comma of each location (e.g. 'Nairobi, Kenya' -> 'Nairobi').
train['Location'] = train['Location'].str.split(",").str[0]

In [None]:
train['Location'].value_counts()[:20]

In [None]:
# Select the location and sentiment columns for plotting.
# Columns are picked by name so the selection does not silently change if the
# column order of `train` changes.
plot_df = train[['Location', 'Sentiment']]

In [None]:
# Sentiment breakdown for the ten most frequent locations, most frequent first.
sns.set(rc={'figure.figsize': (15, 6)})
gg = train['Location'].value_counts().head(10).index
plt.title("Tweet count across top  cities")
sns.countplot(data=plot_df, x="Location", hue="Sentiment", order=gg)

### Insight
All major cities follow the same trend across the sentiments except England, where Negative sentiments outnumber the positive and neutral.

The graph shows that most tweets contain positive content across the board, except for England as a country. This is interesting because the location with the highest tweet count is London, and the distribution there follows the major trend.

The top locations are located in the United Kingdom, America, Canada and India. The dataset description does not say much about how the data was gathered. However most top cities follow the same trend.

Another caveat: the location data is not representative of the entire dataset due to missing data.

-------------------------------------------------------------------------------------------------------------------------
### Time / Date Analysis
We examine the tweets across different time periods

In [None]:
# Convert the TweetAt column to datetime for easy analysis.
# NOTE(review): no format/dayfirst is given, so ambiguous day/month strings are
# read month-first by pandas' default — confirm how dates are written in the CSV.
train["TweetAt"] = pd.to_datetime(train["TweetAt"])

In [None]:
print('Earliest date: ', train['TweetAt'].min())
print('Latest Date', train['TweetAt'].max())

The tweets were collected across an 8 month range

Next, we examine the tweet count across the days of the week

In [None]:

# Calendar order for the x axis, so bars do not appear in first-seen order.
WEEKDAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Idempotent: a column that is already datetime is returned unchanged.
train["TweetAt"] = pd.to_datetime(train["TweetAt"])
# Built-in accessor replaces the manual dayofweek -> name lookup.
train["day"] = train["TweetAt"].dt.day_name()
plt.title("Tweet count across days")
# x= keyword: a positional Series is taken as `data` by current seaborn.
sns.countplot(x="day", data=train, order=WEEKDAYS)

In [None]:
sns.set(rc={'figure.figsize':(17,6)})
# Idempotent: a column that is already datetime is returned unchanged.
train["TweetAt"] = pd.to_datetime(train["TweetAt"])
# Timestamp.month is 1-12. The previous lookup table was keyed 0-11, which
# labelled every tweet one month too late (March -> 'April') and turned
# December into NaN. month_name() gives the correct label directly.
train["month"] = train["TweetAt"].dt.month_name()
plt.title("Tweet count across months")
# x= keyword: a positional Series is taken as `data` by current seaborn.
sns.countplot(x="month", data=train)

### Insight
The distribution is heavily skewed to the right when referenced with the death toll image below... 

# Paste Image Here
https://ourworldindata.org/covid-deaths?country=IND~USA~GBR~CAN~DEU~FRA

Image Citation: 
Max Roser, Hannah Ritchie, Esteban Ortiz-Ospina and Joe Hasell (2020) - "Coronavirus Pandemic (COVID-19)". Published online at OurWorldInData.org. Retrieved from: 'https://ourworldindata.org/coronavirus' [Online Resource]

...it is noticed that it follows a very similar trend. Perhaps the high number of tweets in April came as result of the high number of cases and deaths in cities such as London and New York.

In [None]:
train['month'].value_counts()

## Corpus Analysis

Time to examine the tweets; Hashtags, mentions and top words 

In [None]:
# Print the raw tweets with index labels up to 50, each followed by a separator.
separator = '***********************************\n'
for tweet in train.loc[:50, 'OriginalTweet']:
    for chunk in (tweet, '\n', separator):
        print(chunk)


Tweets contain mentions, hashtags, links, numbers and non-English characters (see the cell below for an example).

In [None]:
 train.loc[16, 'OriginalTweet']

### The top 20 hashtags

In [None]:
# Pull every "#..." token out of the tweets and rank the tokens by frequency.
hashtags = train['OriginalTweet'].str.extractall(r"(#\S+)")
freqs = hashtags.loc[:, 0].value_counts()
freqs.head(20)


### Let's look at the sentiments of the tweets containing the most prominent hashtag

In [None]:
# Rows whose tweet contains the #coronavirus hashtag.
# The word boundary replaces the old trailing-space pattern, which missed the
# hashtag at the end of a tweet or directly before punctuation; longer tags
# such as "#coronavirusoutbreak" are still excluded. Matching ignores case,
# and na=False keeps the mask boolean if a tweet is missing.
hashtag = train[train.OriginalTweet.str.contains(r'#coronavirus\b', case=False, regex=True, na=False)]
plt.title("Sentiment in tweets that trend #coronavirus")
# x= keyword: a positional Series is taken as `data` by current seaborn.
sns.countplot(x=hashtag.Sentiment)


### Insight

Representative of the general trend

### Top Mentions

In [None]:
# Frequency of every "@..." token found in the tweets, most frequent first.
mentions = train['OriginalTweet'].str.extractall(r"(@\S+)")[0].value_counts()
mentions.head(20)

In [None]:
# Rows whose tweet mentions @realDonaldTrump; na=False keeps the mask boolean
# if a tweet is missing.
result = train.OriginalTweet.str.contains(pat='@realDonaldTrump', na=False)
trump = train[result]
plt.title("Sentiment in tweets that mention Trump")
# x= keyword: a positional Series is taken as `data` by current seaborn.
sns.countplot(x=trump.Sentiment)


### Insight

Almost follows the general trend, but the sentiments are more polarized. The lower Neutral count and the very close Positive and Negative counts are indicative of polarization.

### Cleaning the data
In order to perform text mining, the following processes have to be applied.
* Clean data, remove links and numbers
* Remove Stopwords
* Vectorize words

In [None]:
#Function to clean data

# English stop words as a set: membership is tested once per word of every
# tweet, and a set makes each test constant-time instead of a list scan.
stop_word = set(stopwords.words('english'))
def clean_data(df):
    """Return a copy of ``df`` whose ``OriginalTweet`` text has been normalised.

    Steps, applied in this order: remove @mentions, remove URLs, remove
    #hashtags, drop every character that is not an ASCII letter or a space,
    collapse runs of spaces, strip surrounding whitespace, lower-case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``OriginalTweet``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned column; the input frame is not modified.
    """
    # Work on a copy so that a slice of another frame can be passed in without
    # mutating it (and without triggering SettingWithCopyWarning).
    df = df.copy()
    tweets = df['OriginalTweet']

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = [
        (r'(@\w*)', ''),        # mentions
        (r"http\S+", ""),       # URLs
        (r'#\w+', ""),          # hashtags
        (r"[^a-zA-Z ]", ""),    # anything that is not a letter or a space
        (r'( +)', " "),         # runs of spaces -> single space
    ]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 the default is a
        # literal match, which would leave all of these patterns unmatched.
        tweets = tweets.str.replace(pattern, replacement, regex=True)

    df['OriginalTweet'] = tweets.str.strip().str.lower()
    return df
    
   

In [None]:
# Apply the cleaning function to the train and test frames.
# The two columns used downstream are selected by name (not by position) and
# copied, so cleaning never writes into a slice of `train` / `test`.

temptr = train[['OriginalTweet', 'Sentiment']].copy()
clean_train = clean_data(temptr)
tempte = test[['OriginalTweet', 'Sentiment']].copy()
clean_test = clean_data(tempte)






In [None]:
def remove_stopwords(corpus, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    corpus : str
        Text to filter; it is split on whitespace.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_word``
        collection is used, which is the previous behaviour.

    Returns
    -------
    str
        The remaining words joined by single spaces. Matching is exact
        (case-sensitive), so the text should already be lower-cased.
    """
    if stop_words is None:
        stop_words = stop_word
    return " ".join(word for word in corpus.split() if word not in stop_words)

In [None]:
clean_train.head(10)

In [None]:
# Strip stop words from the cleaned tweet text of both splits.
for frame in (clean_train, clean_test):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(remove_stopwords)

In [None]:
# Replace empty / whitespace-only strings with NaN so they can be dropped.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
clean_train = clean_train.replace(r'^\s*$', np.nan, regex=True)
clean_test = clean_test.replace(r'^\s*$', np.nan, regex=True)



In [None]:
clean_train.OriginalTweet[16]

In [None]:
# Discard every row that contains a NaN, in place, for both cleaned splits.
for frame in (clean_train, clean_test):
    frame.dropna(inplace=True)

In [None]:
clean_train.info()

In [None]:
# Print the cleaned tweets with index labels up to 50, each followed by a separator.
separator = '***********************************\n'
for tweet in clean_train.loc[:50, 'OriginalTweet']:
    for chunk in (tweet, '\n', separator):
        print(chunk)



The data is free from links, hashtags, mentions and figures. We can begin modelling

In [None]:
#token = nltk.word_tokenize(text)

In [None]:
# Flat list of every word across all cleaned training tweets.
Corpus = ' '.join(clean_train['OriginalTweet']).split()

In [None]:
Corpus[1]

In [None]:
#TweetTokenizer()

## Unigrams, Bigrams and Trigrams
N-grams show the relationships between words and how likely certain words are to appear together.

In [None]:
# Fifteen most frequent single words, as a one-column frame plus an 'sn'
# column that repeats the index.
unigram = (
    pd.Series(nltk.ngrams(Corpus, 1))
    .value_counts()
    .head(15)
    .to_frame()
)
unigram['sn'] = unigram.index
unigram

In [None]:
# Fifteen most frequent word pairs.
# Bigrams are built tweet by tweet: taking them from the concatenated corpus
# would also count pairs made of the last word of one tweet and the first word
# of the next, which never occurred together in any tweet.
bigram = pd.Series(
    [gram
     for tweet in clean_train['OriginalTweet']
     for gram in nltk.ngrams(tweet.split(), 2)]
).value_counts()[:15]
bigram = pd.DataFrame(bigram)
bigram

In [None]:
# Fifteen most frequent word triples.
# Trigrams are built tweet by tweet: taking them from the concatenated corpus
# would also count triples that straddle the boundary between two unrelated
# tweets.
trigram = pd.Series(
    [gram
     for tweet in clean_train['OriginalTweet']
     for gram in nltk.ngrams(tweet.split(), 3)]
).value_counts()[:15]
trigram = pd.DataFrame(trigram)
trigram

## Data Preprocessing
### Tokenization

In [None]:
# NOTE(review): this Keras Tokenizer instance is not referenced again in the
# visible notebook (the LSTM section creates its own `tokenizerlstm`) — confirm
# whether it is still needed.
tokenizer = Tokenizer()

In [None]:
def tokenize(df):
    """Split one tweet string into a list of word tokens.

    Despite its name, ``df`` is a single string (one cell of the
    ``OriginalTweet`` column), not a DataFrame. Tokenization is delegated to
    ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(df)

# Tokenize every cleaned tweet and store the tokens re-joined by single spaces
# in a new 'tokens' column, first for the train split and then for the test split.
for frame in (clean_train, clean_test):
    frame['tokens'] = frame['OriginalTweet'].apply(tokenize).str.join(" ")

In [None]:
clean_test.columns

In [None]:
# Show a short preview rather than rendering the entire frame.
clean_test.head()

### Vectorization 

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
# Learn the label -> number mapping on the training labels only ...
clean_train["y_nominal"] = ord_enc.fit_transform(clean_train[["Sentiment"]])
# ... and reuse that same mapping for the test labels. Re-fitting on the test
# frame would build an independent mapping that is not guaranteed to match.
clean_test["y_nominal"] = ord_enc.transform(clean_test[["Sentiment"]])

# Hold out 20% of the training tweets for evaluating the classifier.
X_train, X_test, y_train, y_test = train_test_split(clean_train['tokens'],
                                                    clean_train['y_nominal'], test_size=0.2,random_state=42)

In [None]:
clean_train.shape

In [None]:
# Initialize the vectorizer and learn its vocabulary from the training split
# only. Fitting on all of clean_train['tokens'] would let the held-out X_test
# rows influence the vocabulary and the min_df filter (data leakage).
vector = CountVectorizer(stop_words='english',ngram_range=(1,2),min_df=5).fit(X_train)
#initially min_df = 5

# Transforms a collection of text documents into a matrix of token counts
x_train_vectorized = vector.transform(X_train)
x_test_vectorized = vector.transform(X_test)

In [None]:
ord_enc.categories_

## Modeling
### Logistic Regression

Usually a model selection step would occur here, but since the model to be used was explicitly stated, we jump straight into training and parameter tuning using grid search.

In [None]:
LR = LogisticRegression()

# Search space: L2 penalty only, with ten values of C spaced logarithmically
# between 10**0 and 10**4.
penalty = ['l2']
C = np.logspace(0, 4, 10)
hyperparameters = {'C': C, 'penalty': penalty}

In [None]:
# Create grid search using 5-fold cross validation
clf = GridSearchCV(LR, hyperparameters, cv=5, verbose=0)

In [None]:
x_train_vectorized.shape

In [None]:
# Fit grid search to find the best model
best_model = clf.fit(x_train_vectorized, y_train)

In [None]:
# Report the penalty and C of the estimator selected by the grid search.
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])

## Results

In [None]:
#Classification Report
y_pred = best_model.predict(x_test_vectorized)


In [None]:
report = classification_report(y_test, y_pred)
print(report)

#confusion matrix
print(confusion_matrix(y_test, y_pred))

In [None]:
# Annotated confusion matrix: cell counts are printed and both axes carry the
# class names instead of bare encoded numbers.
# Assumes every class occurs in y_test / y_pred, so the matrix rows follow the
# encoder's category order — TODO confirm.
class_names = ord_enc.categories_[0]
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d',
                 xticklabels=class_names, yticklabels=class_names)
ax.set(xlabel='Predicted', ylabel='Actual', title='Logistic Regression confusion matrix');

In [None]:
#ROC CURVE
# NOTE(review): scikitplot is imported here rather than in the import cell at
# the top of the notebook; matplotlib.pyplot is already imported there.
import scikitplot as skplt
import matplotlib.pyplot as plt

# Predicted class probabilities for the held-out split, passed to plot_roc
# together with the true labels.
y_probas = best_model.predict_proba(x_test_vectorized)
skplt.metrics.plot_roc(y_test, y_probas)
plt.show()

### Results Discussion
The metric of choice for evaluation is 
> F1 Score

it is the harmonic mean between precision and recall and gives us a better measure.
The positive class was the most accurately predicted, then Negative and Neutral across all metrics Precision, Recall and F1. 

The results are consistent with the number of tweets available per class: the Positive class has the highest scores, followed by Negative and Neutral. It is simply a matter of more data. To get better scores on the Neutral class or overall, the following options can be considered:

* **Collect more data**: more labelled tweets, especially for the Neutral class, give the model more examples to learn from.
* **Sampling**: This can be done in a number of ways.The majority classes (in this case Positive and Negative) can be undersampled. Equally, the minority class can be Oversampled to match the majority. The weights of the different classes can also be modified for uniformity. Weighing the input of the minority class higher than the majority classes or vice versa. 
* **Utilize other word vectorizers**: Vectorizers such as TF-IDF or Word2Vec (for example via the Gensim library) can be utilized.
* More Parameter tuning: The model can be finetuned further to slightly improve scores 



### LSTM
A separate tokenizer was used to fit the LSTM. The sequences were padded with zeros for uniformity across the dataset.

In [None]:
# Fit a dedicated tokenizer on the cleaned training tweets, turn each tweet
# into a sequence of word indices and pad all sequences to a common length.
lstm_texts = clean_train['OriginalTweet']

tokenizerlstm = Tokenizer(split=' ')
tokenizerlstm.fit_on_texts(lstm_texts.values)

X = pad_sequences(tokenizerlstm.texts_to_sequences(lstm_texts))

In [None]:
vocabSize = len(tokenizerlstm.word_index) + 1
vocabSize

The LSTM model was created with a Keras Sequential object. It contains an Embedding layer, an LSTM layer and a Dense layer for output.

In [None]:
embed_dim = 128
lstm_out = 196
model = Sequential()
# The sequence length follows the padded width of X instead of a hard-coded 28,
# so the layer stays correct if the longest cleaned tweet changes.
model.add(Embedding(vocabSize, embed_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# One softmax unit per sentiment class.
model.add(Dense(3, activation='softmax'))
# Three mutually exclusive classes with one-hot targets call for categorical
# cross-entropy. With 'binary_crossentropy' each output is scored as an
# independent yes/no problem and the reported accuracy is inflated.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

The target variable was binarized to correspond with the logit pairs for the LSTM model. 

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

# NOTE(review): `Le` is not referenced again in the visible notebook.
Le = LabelEncoder()
lb = preprocessing.LabelBinarizer()

# One indicator column per sentiment class, fitted on the training labels.
# NOTE(review): this rebinds `y_train`, replacing the Series produced by
# train_test_split in the Vectorization section; re-running the Logistic
# Regression cells after this one would pick up these labels instead.
y_train = lb.fit_transform(clean_train['Sentiment'])

# Earlier attempt, kept for reference (disabled):
#y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
#y_test = np.asarray(test_labels).astype('float32').reshape((-1,1))

In [None]:
# Encode the test labels with the binarizer already fitted on the training
# labels, so both label matrices share the same column order. Re-fitting on
# the test set would derive the classes from the test labels instead.
y_test = lb.transform(clean_test['Sentiment'])

In [None]:
y_train.shape

Model Training

In [None]:
# Attempted memory-error fix (disabled): the snippet below is wrapped in a
# string literal, so none of it is executed.
'''
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)'''

In [None]:
# Hold out 15% of the padded training sequences (and their labels) for validation.
X_trainl, X_testl, y_trainl, y_testl = train_test_split(X,y_train, test_size = 0.15, random_state = 42)

# Train for 10 epochs in batches of 32, evaluating on the held-out 15% as validation data.
model.fit(X_trainl, y_trainl,validation_data = (X_testl,y_testl),epochs = 10, batch_size=32)

In [None]:
# Reuse the tokenizer fitted on the training tweets so that word indices match
# the embedding the model was trained with.
test_lstm = tokenizerlstm.texts_to_sequences(clean_test['OriginalTweet'])

# Pad (or truncate) to the training sequence width. Without maxlen the test
# set is padded to its own longest tweet, which need not equal the width the
# model was built and trained on.
test_lstm = pad_sequences(test_lstm, maxlen=X.shape[1])

In [None]:
# Sequential.predict_classes was removed from tf.keras in TensorFlow 2.6.
# The predicted class is the index of the largest softmax output per row.
lstm_pred = np.argmax(model.predict(test_lstm), axis=-1)

In [None]:
print(classification_report(np.argmax(y_test,1), lstm_pred))

In [None]:
print(confusion_matrix(np.argmax(y_test,1), lstm_pred))

### Results Discussion
Interestingly, the Logistic Regression classifier performed almost the same as the LSTM model.
The results are near identical except for the macro-average recall, where the LSTM model edged out the LR model by 0.01. However, the LSTM used here was not state of the art; with a stronger architecture the results would presumably have been better.

Similar recommendations given to LR model can be applied to the LSTM model, such as gathering more data, sampling and other word vectorizers.

Most importantly, the model structure. No single LSTM is the best for text processing as finding the best model is a [leading topic in the literature](https://scholar.google.com/scholar?q=lstm+for+nlp+papers&hl=en&as_sdt=0&as_vis=1&oi=scholart). Hence other model structures can be experimented with in the future.


