<div style="display: block; height: 500px; overflow:hidden;position: relative">
     <img src="https://imgur.com/6I1AHP5.jpg" style="position: absolute;top: 0px;">
</div>

# 1.  Imports

In [None]:
# generics
import pandas as pd
import numpy as np
import random

# visu
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# texts
import re
import unicodedata
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model

# Model
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn import metrics

# 2.  Loading data

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
# Both files are decoded as Latin-1 through the explicit `encoding` argument.
df_train = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv", encoding="latin_1")
df_test = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv", encoding="latin_1")

# 3.  Data overview

In [None]:
# Display five randomly chosen training rows (no random_state is passed, so the rows differ between runs).
df_train.sample(5)

In [None]:
# Character length of every raw training tweet; the list is reused by the length-distribution plot.
tweet_length = [len(tweet) for tweet in df_train["OriginalTweet"]]
# Fall back to 0 for an empty column, exactly like a running maximum initialised at 0.
max_tweet_length = max(tweet_length, default=0)
print("Longest tweet: " + str(max_tweet_length) + " characters")

In [None]:
# Global font sizes for axis labels and titles; rcParams stay in effect for later figures too.
parameters = {'axes.labelsize': 20,
              'axes.titlesize': 30}
plt.rcParams.update(parameters)

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18.5, 6)

# Left panel: density histogram of tweet lengths with a KDE curve on top.
sns.histplot(tweet_length, palette='Blues', stat='density', bins=50, ax=ax1)
sns.kdeplot(tweet_length, color='red', ax=ax1)
ax1.set_xlabel('Character count per tweet')
ax1.set_title("Tweet length distribution", color ="#292421")

# Right panel: number of tweets per sentiment label, sorted by ascending count.
sentiment_counts = (
    df_train["Sentiment"]
    .reset_index()
    .groupby("Sentiment")
    .count()
    .rename(columns={"index": "Count"})
    .sort_values(by="Count")
)
sentiment_counts.plot(kind="barh", legend=False, ax=ax2).grid(axis='x')
ax2.set_title("Tweet sentiment count", color ="#292421")

# Shared cosmetics: bigger tick labels and no y-axis label on either panel.
for panel in (ax1, ax2):
    panel.tick_params(axis='x', labelsize=16)
    panel.tick_params(axis='y', labelsize=16)
    panel.set_ylabel("")

fig.tight_layout(pad=2.0)
plt.rcParams.update(parameters)

# 4.  Turning 5 categories into 3 categories
Here we convert <b>extremely positive</b> tweets into <b style="color: green">positive</b> and <b>extremely negative</b> tweets into <b style="color: red">negative</b>.

In [None]:
def set_3_classes(x):
    """Fold the two "Extremely ..." sentiment labels into their plain counterpart.

    Any value other than "Extremely Negative" or "Extremely Positive" is
    returned unchanged.
    """
    merged_labels = (
        ("Extremely Negative", "Negative"),
        ("Extremely Positive", "Positive"),
    )
    for extreme_label, plain_label in merged_labels:
        if x == extreme_label:
            return plain_label
    return x

In [None]:
# Collapse the five original sentiment labels into three on both splits (column is overwritten in place).
for split in (df_train, df_test):
    split["Sentiment"] = split["Sentiment"].apply(set_3_classes)

In [None]:
fig, ax = plt.subplots()
fig.suptitle("Count", fontsize=12)
# Rows per sentiment label; after reset_index/count the counts live in the "index" column.
rows_per_sentiment = df_train["Sentiment"].reset_index().groupby("Sentiment").count()
ordered_counts = rows_per_sentiment.sort_values(by="index")
ordered_counts.plot(kind="barh", legend=False, ax=ax).grid(axis='x')
plt.show()

# 5. Cleaning tweets

In [None]:
# Start the cleaned column as a copy of the raw text; the cleaning steps below rewrite it
# while "OriginalTweet" stays untouched for before/after comparison.
df_train["CleanTweet"] = df_train["OriginalTweet"]
df_train.sample(3)

<b>Removing end-of-line, tabulation and carriage return. Turning into lower case:</b>

In [None]:
def clean_eol_tabs(df, label):
    """Lower-case a text column and replace line breaks and tabs with spaces.

    inputs:
     - df: dataframe holding the text column (modified in place)
     - label: name of the text column
    returns:
     - the same dataframe, where df[label] is lower-cased and every newline,
       carriage return and tab character has become a single space
    """
    df[label] = df[label].str.lower()
    for control_char in ("\n", "\r", "\t"):
        # Bind the current character as a default so each lambda replaces its own one.
        df[label] = df[label].apply(lambda text, ch=control_char: text.replace(ch, " "))
    return df
# Apply to the training tweets; the helper rewrites the column in place and returns the same frame.
df_train = clean_eol_tabs(df_train, "CleanTweet")

<b>Removing e-mails:</b>

In [None]:
def remove_emails(df, label):
    """ This function replaces e-mail addresses in a text column by a single space.
        The pattern only lists lower-case letters and is applied without any regex flag.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    df[label] = df[label].apply(lambda x: re.sub(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""", " ", x))
    return df
# Strip e-mail addresses from the training tweets (column rewritten in place, same frame returned).
df_train = remove_emails(df_train, "CleanTweet")

<b>Removing mentions:</b>

In [None]:
def remove_mentions(df, label):
    """ This function replaces Twitter mentions (an @ followed by 1 to 100 letters,
        digits, underscores, dots or hyphens) by a single space.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    mention = re.compile(r"@([a-zA-Z0-9_.-]{1,100})")

    def _blank_mentions(text):
        return mention.sub(" ", text)

    df[label] = df[label].apply(_blank_mentions)
    return df
# Strip @mentions from the training tweets (column rewritten in place, same frame returned).
df_train = remove_mentions(df_train, "CleanTweet")

<b>Removing hyperlinks:</b>

In [None]:
def remove_hyperlinks(df, label):
    """ This function replaces hyperlinks by a single space. A hyperlink is any run of
        non-whitespace characters that starts with "http" and has at least one more character.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    link = re.compile(r"http\S+")
    without_links = df[label].apply(lambda tweet: link.sub(" ", tweet))
    df[label] = without_links
    return df
# Strip hyperlinks from the training tweets (column rewritten in place, same frame returned).
df_train = remove_hyperlinks(df_train, "CleanTweet")

<b>Removing hashtags:</b>

In [None]:
def remove_hashtags(df, label):
    """ This function replaces hashtags (a # followed by at least one word character)
        by a single space; the tagged word is removed together with the #.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    def _drop_hashtags(tweet):
        return re.sub(r"#\w+", " ", tweet)

    df[label] = df[label].apply(_drop_hashtags)
    return df
# Strip hashtags from the training tweets (column rewritten in place, same frame returned).
df_train = remove_hashtags(df_train, "CleanTweet")

<b>Removing html tags:</b>

In [None]:
def remove_html_tags(df, label):
    """ This function replaces html tags by a single space. A tag is the shortest
        run of characters enclosed between "<" and ">".
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    tag = re.compile(r"<.*?>")
    column = df[label]
    df[label] = column.apply(lambda markup: tag.sub(" ", markup))
    return df
# Strip html tags from the training tweets (column rewritten in place, same frame returned).
df_train = remove_html_tags(df_train, "CleanTweet")

<b>Removing numbers:</b>

In [None]:
def remove_numbers(df, label):
    """ This function replaces every run of digits by a single space.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    digits = re.compile(r"\d+")

    def _blank_digits(tweet):
        return digits.sub(" ", tweet)

    df[label] = df[label].apply(_blank_digits)
    return df
# Strip digits from the training tweets (column rewritten in place, same frame returned).
df_train = remove_numbers(df_train, "CleanTweet")

<b>Encode unknown characters:</b>

In [None]:
def encode_unknown(df, label):
    """ This function reduces a text column to plain ASCII: characters are decomposed
        (NFD) so that accents become separate marks, then everything that cannot be
        encoded as ASCII is dropped.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    def _to_ascii(text):
        decomposed = unicodedata.normalize("NFD", text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode("utf-8")

    df[label] = df[label].apply(_to_ascii)
    return df
# Reduce the training tweets to ASCII (column rewritten in place, same frame returned).
df_train = encode_unknown(df_train, "CleanTweet")

<b>Removing punctuations and special characters:</b><br>
*Note: this function removes punctuation AND accented characters. It is therefore not necessarily usable for languages that have accented characters, but for English it is fine.*

In [None]:
def clean_punctuation_no_accent(df, label):
    """ This function replaces every character that is neither a word character
        (letter, digit, underscore) nor whitespace by a single space.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten
    """
    not_word_or_space = re.compile(r'[^\w\s]')

    def _blank_punctuation(text):
        return not_word_or_space.sub(' ', text)

    df[label] = df[label].apply(_blank_punctuation)
    return df
# Strip punctuation from the training tweets (column rewritten in place, same frame returned).
df_train = clean_punctuation_no_accent(df_train, "CleanTweet")

<b>Removing stop words. Here, the list is from nltk stopwords library:</b>

In [None]:
def remove_stop_words(text, stopwords=set(stopwords.words('english'))):
    """ This function drops stop words from a text.
        The text is split on single spaces, so consecutive spaces yield empty
        tokens that are kept and re-joined as spaces.
        inputs:
         - text: string to filter
         - stopwords: collection of words to drop (default: the nltk english
           list, built once when the function is defined)
        returns:
         - the remaining words joined by single spaces """
    kept_words = [token for token in text.split(" ") if token not in stopwords]
    return " ".join(kept_words)

def clean_stopwords(df, label):
    """ This function removes stop words from every text of a column,
        using remove_stop_words with its default stop-word list.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    filtered = df[label].apply(remove_stop_words)
    df[label] = filtered
    return df
# Remove stop words from the training tweets (column rewritten in place, same frame returned).
df_train = clean_stopwords(df_train, "CleanTweet")

<b>Removing one- and two-letter words, removing unnecessary spaces, dropping empty lines:</b>

In [None]:
def more_cleaning(df, label):
    """ Final tidy-up of a text column:
     1) replaces one- and two-character words by a space
     2) collapses runs of two or more spaces/tabs into one space
     3) drops the rows whose text is then empty or a single character
        inputs:
         - df: dataframe holding the text column (the column is rewritten in place)
         - label: name of the text column
        returns:
         - a new dataframe without the dropped rows and with a fresh 0..n-1 index """
    short_word = re.compile(r'\b\w{1,2}\b')
    space_run = re.compile(r"[ \t]{2,}")

    def _tidy(text):
        text = short_word.sub(" ", text)
        text = space_run.sub(" ", text)
        # Texts of length 0 or 1 are marked as missing so dropna removes the row.
        return np.nan if len(text) <= 1 else text

    df[label] = df[label].apply(_tidy)
    kept_rows = df.dropna(subset=[label], axis=0)
    return kept_rows.reset_index(drop=True).copy()
# Final tidy-up of the training tweets; rows whose text ends up empty are dropped and the index is reset.
df_train = more_cleaning(df_train, "CleanTweet")

<b>Lemmatizing words:</b><br>
*Note: the lemmatizer is used here with its default setting, which is <b>nouns</b>. That is to say, it only finds the closest root for nouns and does not work on verbs, adjectives, etc. I also tried lemmatizing every part of speech, but the accuracy was lower.*

In [None]:
def lemmatize_one_text(text):
    """ This function lemmatizes every word of a text.
        The text is split on single spaces and each token is passed to
        WordNetLemmatizer.lemmatize without a part-of-speech tag, so the
        library's default tag applies.
        inputs:
         - text: string to lemmatize
        returns:
         - the lemmatized tokens joined by single spaces """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in text.split(" "))

def lemmatize(df, label):
    """ This function lemmatizes every text of a column with lemmatize_one_text.
        inputs:
         - df: dataframe holding the text column (modified in place)
         - label: name of the text column
        returns:
         - the same dataframe with df[label] rewritten """
    lemmatized = df[label].apply(lemmatize_one_text)
    df[label] = lemmatized
    return df
# Lemmatize the training tweets (column rewritten in place, same frame returned).
df_train = lemmatize(df_train, "CleanTweet")

In [None]:
# Display five random training rows to compare "OriginalTweet" with the fully cleaned "CleanTweet".
df_train.sample(5)

### Let's apply all of these cleaning steps to the test data set as well:

In [None]:
# Run the test tweets through the same cleaning steps, in the same order, as the training tweets.
df_test["CleanTweet"] = df_test["OriginalTweet"]
cleaning_steps = (
    clean_eol_tabs,
    remove_emails,
    remove_mentions,
    remove_hyperlinks,
    remove_hashtags,
    remove_html_tags,
    remove_numbers,
    encode_unknown,
    clean_punctuation_no_accent,
    clean_stopwords,
    more_cleaning,
    lemmatize,
)
for cleaning_step in cleaning_steps:
    df_test = cleaning_step(df_test, "CleanTweet")

In [None]:
# Display three random test rows to check the cleaned column.
df_test.sample(3)

### Let's have a look at several tweets before and after cleaning:

In [None]:
# Pick a random row position. random.randint includes its upper bound, so using
# df_train.shape[0] as that bound could select a position one past the last row;
# random.randrange excludes the upper bound and always yields a valid position.
tweet_num = random.randrange(df_train.shape[0])
print("############################# Original Tweet #############################")
print(df_train.iloc[tweet_num].at["OriginalTweet"])
print("\n")
print("############################# Clean Tweet ################################")
print(df_train.iloc[tweet_num].at["CleanTweet"])

In [None]:
# Pick another random row position. random.randrange excludes the upper bound,
# whereas random.randint would include df_train.shape[0], which is out of range for iloc.
tweet_num = random.randrange(df_train.shape[0])
print("############################# Original Tweet #############################")
print(df_train.iloc[tweet_num].at["OriginalTweet"])
print("\n")
print("############################# Clean Tweet ################################")
print(df_train.iloc[tweet_num].at["CleanTweet"])

It looks good

# 6.  Looking at data

<b>Sentiment repartition:</b>

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18.5, 5)
fig.suptitle('Sentiment repartition among tweets in train and test sets')
# Left panel: training split; right panel: test split.
for split, panel in ((df_train, ax1), (df_test, ax2)):
    split["Sentiment"].value_counts().plot(kind="bar", ax=panel)

<b>Word cloud for each sentiment category:</b>

In [None]:
# One long space-joined string per sentiment, built from the cleaned training tweets.
is_positive = df_train["Sentiment"] == "Positive"
is_neutral = df_train["Sentiment"] == "Neutral"
is_negative = df_train["Sentiment"] == "Negative"
all_words_positive = " ".join(df_train.loc[is_positive, "CleanTweet"])
all_words_neutral = " ".join(df_train.loc[is_neutral, "CleanTweet"])
all_words_negative = " ".join(df_train.loc[is_negative, "CleanTweet"])

In [None]:
# Settings shared by the three clouds; only the colormap differs per sentiment.
cloud_options = dict(width=800, height=600, max_font_size=120, background_color="white")
wordcloud_positive = WordCloud(colormap="Greens", **cloud_options).generate(all_words_positive)
wordcloud_neutral = WordCloud(colormap="YlOrBr", **cloud_options).generate(all_words_neutral)
wordcloud_negative = WordCloud(colormap="Reds", **cloud_options).generate(all_words_negative)

In [None]:
parameters = {'axes.labelsize': 12,
              'axes.titlesize': 10}

fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(18.5, 7)

# One panel per sentiment: the cloud image, no axes, and an explicit title size.
cloud_panels = (
    (ax1, wordcloud_positive, "WordCloud of positive tweets"),
    (ax2, wordcloud_neutral, "WordCloud of neutral tweets"),
    (ax3, wordcloud_negative, "WordCloud of negative tweets"),
)
for panel, cloud, panel_title in cloud_panels:
    panel.imshow(cloud, interpolation='bilinear')
    panel.axis("off")
    panel.set_title(panel_title, fontsize=12)

# The rcParams are updated only now, after the panels are drawn, so they affect figures created later.
plt.rcParams.update(parameters)
plt.show()

# 7. Sentiment encoding

In [None]:
# Work on copies so that df_train / df_test keep their string sentiment labels.
df_train_encoded = df_train.copy()
df_test_encoded = df_test.copy()
# Report the (rows, columns) shape of each copy.
print("train set shape: " + str(df_train_encoded.shape))
print("test set shape: " + str(df_test_encoded.shape))

In [None]:
# Integer code assigned to each of the three sentiment labels.
map_sentiment = {"Neutral":0, "Positive":1,"Negative":2}
# Map from the untouched string labels in df_train / df_test rather than from the
# encoded copies themselves: re-running this cell then gives the same result
# instead of turning the already-encoded integers into NaN.
df_train_encoded['Sentiment'] = df_train['Sentiment'].map(map_sentiment)
df_test_encoded['Sentiment']  = df_test['Sentiment'].map(map_sentiment)
# Series.map yields NaN for labels missing from the dictionary; fail loudly if that happens.
assert df_train_encoded['Sentiment'].notna().all(), "unmapped sentiment label in train set"
assert df_test_encoded['Sentiment'].notna().all(), "unmapped sentiment label in test set"

# 8. Feature and target preparation

In [None]:
# Features: single-column frames holding the cleaned tweet text.
X_train = df_train_encoded[['CleanTweet']].copy()
X_test = df_test_encoded[['CleanTweet']].copy()

# Targets as the original string labels.
y_train = df_train['Sentiment'].copy()
y_test = df_test['Sentiment'].copy()

# Targets as integer codes.
y_train_mapped = df_train_encoded['Sentiment'].copy()
y_test_mapped = df_test_encoded['Sentiment'].copy()

# Targets as one-hot rows with three columns, as expected by the softmax output.
y_train_encoded = to_categorical(y_train_mapped, num_classes=3)
y_test_encoded = to_categorical(y_test_mapped, num_classes=3)

# 9. Tokenization, sequences, padding

<b>The result of the tokenizer is a dictionary with:</b><br>
* key = word<br>
* value = unique number

In [None]:
# Build the word -> integer index from the training tweets only.
# The text is read from df_train_encoded (the frame X_train was copied from) because
# X_train is rebound to a list of sequences in a later cell, which would make this
# cell fail when it is run again.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train_encoded["CleanTweet"])
# One more than the number of distinct words; this is the Embedding input dimension,
# leaving room for the 0 value used by padding.
vocab_length = len(tokenizer.word_index) + 1
vocab_length

The <b>texts_to_sequences</b> function first transforms each text into a list of words. Then, thanks to the dictionary previously created by the tokenizer (see above), it transforms the list of lists of words into a list of lists of numbers.

In [None]:
# Encode each cleaned tweet as a list of word indices.
# The text is read from the *_encoded frames instead of from X_train / X_test: this cell
# rebinds those two names to lists, so reading from them would break on a second run.
X_train = tokenizer.texts_to_sequences(df_train_encoded["CleanTweet"])
X_test = tokenizer.texts_to_sequences(df_test_encoded["CleanTweet"])

Each tweet has a different length. Thus the result of the <b>texts_to_sequences</b> function is a list of lists of numbers of different lengths:

In [None]:
# Print the first three encoded training tweets to show that their lengths differ.
print("First tweet encoded:")
print(X_train[0])
print("\nSecond tweet encoded:")
print(X_train[1])
print("\nThird tweet encoded:")
print(X_train[2])

To feed the deep learning model, we need all these lists to be the same length. Thus we need to apply padding. In other words, we are going to add several zeros (0) at the end of the shortest tweets so that at the end, all of our lists have the same length. <br><br>
First let's get the maximum number of words in one tweet:

In [None]:
# Number of tokens in every encoded training tweet; the list is reused by the word-count plot.
word_count = [len(encoded_tweet) for encoded_tweet in X_train]
# Fall back to 0 when there are no tweets, exactly like a running maximum initialised at 0.
max_word_count = max(word_count, default=0)
print("Maximum number of word in one tweet: " + str(max_word_count) + " words")

In [None]:
# Global font sizes for axis labels and titles; rcParams stay in effect for later figures too.
parameters = {'axes.labelsize': 20,
              'axes.titlesize': 30}
plt.rcParams.update(parameters)

fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(18.5, 8)

# Density histogram of tokens per tweet with a KDE curve on top.
sns.histplot(word_count, palette='Blues', stat='density', bins=30, ax=ax1)
sns.kdeplot(word_count, color='red', ax=ax1)

# Labels, title and tick sizes.
ax1.set_title("Tweet length distribution", color ="#292421")
ax1.set_xlabel('Word count per tweet')
ax1.set_ylabel("")
for tick_axis in ('x', 'y'):
    ax1.tick_params(axis=tick_axis, labelsize=16)

fig.tight_layout(pad=2.0)
plt.rcParams.update(parameters)

So the longest tweet we have is composed of 37 words. We are going to pad the sequences with a maximum length of 37.

In [None]:
# Bring every sequence to max_word_count entries; padding='post' appends the filler at the end.
# NOTE(review): max_word_count comes from the training set only, so longer test sequences
# would presumably be truncated by pad_sequences — confirm which side is cut.
X_train = pad_sequences(X_train, maxlen=max_word_count, padding='post')
X_test = pad_sequences(X_test, maxlen=max_word_count, padding='post')
X_train.shape

Let's have a look at the encoded 3 tweets after padding:

In [None]:
# Print the same three tweets after padding, together with their now identical length.
print("First tweet encoded:", "Size = ", len(X_train[0]))
print(X_train[0])
print("\nSecond tweet encoded:", "Size = ", len(X_train[1]))
print(X_train[1])
print("\nThird tweet encoded:", "Size = ", len(X_train[2]))
print(X_train[2])

Now every encoded tweet has the same length, the data is ready for the model.

# 10. Model

<b>The model is composed of:</b>
1. <b>An embedding layer with parameters</b>
    * input dim = vocabulary size
    * output dim = 32
    * input length = size of padded sequences
    * mask_zero = True to ignore 0 (from padding)
2. <b>An LSTM (Long Short Term Memory) Layer with parameter</b>
    * units = 100 (don't ask me why, the resulting accuracy is almost the same regardless this value)
3. <b>Three Dense layers</b>
4. <b>An output dense layer with parameters</b>
    * units = 3 (output dim)
    * activation = softmax (for multiclassification problem)

<b>Compilation with parameters:</b>
1. loss = categorical_crossentropy (for multiclassification problem)
2. optimizer = adam
3. metrics = accuracy

In [None]:
# Embedding -> LSTM -> three ReLU dense layers -> 3-way softmax output.
model_LSTM = Sequential([
    # mask_zero=True makes the downstream layers ignore the 0 entries added by padding.
    layers.Embedding(vocab_length, output_dim=32, input_length=max_word_count, mask_zero=True),
    layers.LSTM(100),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(3, activation='softmax'),
])
# categorical_crossentropy matches the one-hot targets.
model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model_LSTM.summary()

Here I set early stopping with a patience of 10 epochs (training stops once the monitored metric has not improved for 10 epochs) and set the parameter <i><b>restore_best_weights</b></i> to <b style="color:green">True</b> so that the weights with the best score on the monitored metric - here <b>val_accuracy</b> (accuracy on the test set) - are restored when training stops. This way the model has the best accuracy possible on unseen data.

In [None]:
# Early stopping monitors val_accuracy with a patience of 10 epochs and restores the best weights.
# NOTE(review): the test set is passed as validation_data, so early stopping and the restored
# weights are selected on the same data that is later used for evaluation; the reported
# test scores are therefore optimistic — consider holding out a separate validation split.
# NOTE(review): no random seed is set, so results will vary between runs.
es = EarlyStopping(patience=10, monitor='val_accuracy', restore_best_weights=True)
history = model_LSTM.fit(X_train,
                         y_train_encoded,
                         validation_data=(X_test, y_test_encoded),
                         epochs=30,
                         batch_size=16,
                         verbose=1,
                         callbacks=[es]
                        )

We can see that after epoch 2, the accuracy on the test set - val_accuracy - doesn't increase any more, while the accuracy on the train set continues to increase until it reaches almost 100%! The model is overfitting from epoch 2 and is not able to generalize well to unseen data from there.

# 11. Model Evaluation

<b>Prediction on test set:</b>

In [None]:
# Class probabilities for every test tweet (one column per class).
predicted = model_LSTM.predict(X_test)
# Predicted class = index of the highest probability in each row.
y_pred = np.argmax(predicted, axis=-1)

<b>Calculation of accuracy and Area Under (ROC) Curve - AUC - scores:</b>

In [None]:
# Accuracy is computed from the hard predictions (y_pred).
acc_score = accuracy_score(y_test_mapped, y_pred)
# ROC AUC is computed from the class probabilities, one-vs-rest across the classes.
auc_score = roc_auc_score(y_test_mapped, predicted, multi_class="ovr")

<b>Classification report:</b>

In [None]:
# classification_report attaches target_names to the integer labels in label order,
# so the names must follow the codes in map_sentiment (0, 1, 2). The order in which
# the labels first appear in y_test is unrelated to those codes and could attach
# the wrong class name to each row of the report.
class_names = sorted(map_sentiment, key=map_sentiment.get)
class_ids = [map_sentiment[name] for name in class_names]
report = classification_report(y_test_mapped, y_pred, labels=class_ids,
                               target_names=class_names, output_dict=True)

# Row labels of the per-class entries (precision, recall, f1-score, support).
metric_rows = list(report[class_names[0]].keys())
# Accuracy and ROC AUC are single numbers: show them on the last row only.
blank_rows = [""] * (len(metric_rows) - 1)
accuracy_col = pd.Series(blank_rows + [round(acc_score, 2)], index=metric_rows)
roc_auc_col = pd.Series(blank_rows + [round(auc_score, 2)], index=metric_rows)

df_report = pd.DataFrame(report)[class_names + ["macro avg", "weighted avg"]].apply(lambda x: round(x, 2))
df_report["accuracy"] = accuracy_col
df_report["roc_auc"] = roc_auc_col
df_report

<b>Confusion Matrix:</b>

In [None]:
## Plot confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred)
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
ax.set(xticklabels=list(y_test.unique()), yticklabels=list(y_test.unique()), title="Confusion matrix")
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
ax.set_ylabel("True", color="royalblue", fontsize=35, fontweight=700)
ax.set_xlabel("Prediction", color="royalblue", fontsize=35, fontweight=700)
plt.yticks(rotation=0);

<b>ROC and precision-recall curves</b>

In [None]:
# One indicator column per integer class code, used as the binary target of each curve.
y_test_array = pd.get_dummies(y_test_mapped, drop_first=False).values
# Class names ordered by integer code so that classes[i] names column i of the
# predictions; the order of first appearance in y_train is unrelated to the codes.
classes = sorted(map_sentiment, key=map_sentiment.get)

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(18.5, 5)
roc_ax, pr_ax = ax[0], ax[1]

## Plot roc
# One curve per class: column i of the indicator matrix against column i of the probabilities.
for i, class_name in enumerate(classes):
    fpr, tpr, thresholds = roc_curve(y_test_array[:,i], predicted[:,i])
    roc_label = '{0} (area (AUC) = {1:0.2f})'.format(class_name, metrics.auc(fpr, tpr))
    roc_ax.plot(fpr, tpr, lw=3, label=roc_label)
# Diagonal reference line.
roc_ax.plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
roc_ax.set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
           xlabel='False Positive Rate',
           ylabel="True Positive Rate (Recall)",
           title="Receiver operating characteristic")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)

## Plot precision-recall curve
for i, class_name in enumerate(classes):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_array[:,i], predicted[:,i])
    pr_label = '{0} (area ={1:0.2f})'.format(class_name, metrics.auc(recall, precision))
    pr_ax.plot(recall, precision, lw=3, label=pr_label)
pr_ax.set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
          ylabel="Precision", title="Precision-Recall curve")
pr_ax.legend(loc="best")
pr_ax.grid(True)
plt.show()

<div style="font-size:15pt; color:#104E8B; font-weight:700; width:80%; display:block; margin:auto; text-align:center">Thanks for reading, I hope you enjoyed it. If there is anything wrong or if you have any suggestions for improvement, please feel free to comment, I'll be glad to get feedback to improve.</div>