# Imports

In [None]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding
from wordcloud import WordCloud

# Import self-crawled tweets about the COVID-19 vaccine
I managed to scrape about 80,000 unique tweets. The Twitter API is quite limited when running on a free developer account, so this may not be the largest dataset. However, I feel that the main objective of the project is to show what I have learned in TDDE16, so it is perhaps acceptable that my dataset is not the most robust. 

In [None]:
import glob

path = '../input/tweets-about-covid19-vaccine'         # use your path

# Collect the CSV files in sorted order: glob returns matches in arbitrary
# order, so sorting keeps the row order of `df` identical between runs.
# os.path.join keeps the pattern OS independent.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files found in {!r}".format(path))

# Read the files one at a time and stack them into a single frame with a
# fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_from_each_file, ignore_index=True)

# pd.read_csv takes the column names from each file's header row, and the
# cells below rely on a `text` column being present. The explicit renaming
# below is therefore left disabled; it is kept only as a reference for the
# column layout (presumably the crawler's output order - TODO confirm).
#df.columns = ["username", "acc_desc", "location", "following", "followers", "totaltweets", "usercreatedts", "tweetcreatedts", "retweetcount", "text", "hashtags"]




In [None]:
# Number of tweets (= number of rows in the combined dataframe)
len(df.index)

In [None]:
# Show the first 5 rows of the dataframe (the default of `head`).
df.head(n=5)

# Preprocessing tweets
First, let's define the preprocessing function.

In [None]:
# Load contractions.csv (indexed by the `Contraction` column) and turn it into
# a lower-cased {contraction: meaning} lookup dict.
contractions = pd.read_csv('../input/contractions/contractions.csv', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions['Meaning'] = contractions['Meaning'].str.lower()
contractions_dict = contractions['Meaning'].to_dict()

# Defining regex patterns.
# Every pattern is a raw string so that backslash sequences such as `\s` and
# `\1` reach the regex engine untouched (a non-raw '\s' is an invalid string
# escape and triggers a warning on recent Python versions).
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"  # http://, https:// or www. up to the next space
userPattern       = r'@[^\s]+'          # '@' followed by one or more non-whitespace characters
hashtagPattern    = r'#[^\s]+'          # '#' followed by one or more non-whitespace characters
alphaPattern      = r"[^a-z0-9<>]"      # any character other than a-z, 0-9, '<' and '>'
sequencePattern   = r"(.)\1\1+"         # the same character three or more times in a row
seqReplacePattern = r"\1\1"             # ...collapsed to exactly two occurrences

# Defining regex for emojis
# Each emoticon is: eyes [8:=;], an optional nose ['`-], then the mouth.
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):
    """Normalise one raw tweet into the token-friendly form fed to the tokenizer.

    The steps run in a fixed order, each working on the output of the previous one:

    1. cast to ``str`` and lower-case,
    2. replace URLs with ``<url>`` and @mentions with ``<user>``,
    3. collapse runs of three or more identical characters to two,
    4. replace ``<3`` and text emoticons with placeholder tokens,
    5. expand contractions by plain substring replacement using
       ``contractions_dict``,
    6. replace every character matched by ``alphaPattern`` with a space.

    Parameters
    ----------
    tweet : any
        The raw tweet; it is converted with ``str()`` first, so non-string
        values are processed via their string form.

    Returns
    -------
    str
        The cleaned tweet.
    """
    text = str(tweet).lower()

    # Placeholders for URLs and user mentions. URLs go first so that the
    # characters inside them cannot be picked up by the later patterns.
    text = re.sub(urlPattern, '<url>', text)
    text = re.sub(userPattern, '<user>', text)

    # Replace 3 or more consecutive identical characters by 2.
    text = re.sub(sequencePattern, seqReplacePattern, text)

    # Emoticon -> placeholder token, applied in this exact order.
    emoticon_tokens = (
        (r'<3', '<heart>'),
        (smileemoji, '<smile>'),
        (sademoji, '<sadface>'),
        (neutralemoji, '<neutralface>'),
        (lolemoji, '<lolface>'),
    )
    for pattern, token in emoticon_tokens:
        text = re.sub(pattern, token, text)

    # Expand contractions (simple substring replacement, dict order).
    for short_form, expanded in contractions_dict.items():
        text = text.replace(short_form, expanded)

    # Blank out everything that is not a-z, 0-9, '<' or '>'.
    text = re.sub(alphaPattern, ' ', text)

    # Put spaces around '/' to separate words. With the current alphaPattern
    # any '/' has already been turned into a space by the step above.
    return re.sub(r'/', ' / ', text)

## Apply preprocessing
A new column with processed text will be added to the df

In [None]:
# Clean every raw tweet and store the result next to the original text.
df['processed_tweet'] = df['text'].apply(preprocess_apply)

# NumPy array of the cleaned tweets; this is what gets tokenized further down.
X_list = np.array(df['processed_tweet'])

## Inspect tweets

In [None]:
# First 5 rows, now including the new `processed_tweet` column.
df.head(n=5)

## Display most frequently used words

In [None]:
# Join all cleaned tweets into one corpus string and build the word cloud
# from it (at most 1000 words, single words only since collocations are off).
corpus_text = " ".join(df['processed_tweet'])
wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    background_color='white',
    min_font_size=25,
    collocations=False,
).generate(corpus_text)

# Draw the cloud on its own figure without axis ticks or frame.
fig, ax = plt.subplots(figsize=(15, 20))
ax.axis("off")
ax.imshow(wc, interpolation='bilinear')

# Tokenizing preprocessed tweets
The LSTM model cannot process text directly. Tweets must first be tokenized and then padded so that all inputs have equal length (set to 60 during training).

In [None]:
import pickle

# Length every tokenized tweet is padded to (the value used during training).
input_length = 60

# Restore the tokenizer that was fitted during training.
# NOTE: unpickling executes code stored in the file, so only load pickles
# that come from a trusted source.
filename = "../input/bilstm-15-model-epochs-trained-on-sentiment140/Tokenizer.pickle"
with open(filename, mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# One more than the number of known words (presumably because index 0 is
# reserved for padding - TODO confirm); used as the embedding's input_dim.
vocab_length = len(tokenizer.word_index) + 1
print("Tokenizer vocab length:", vocab_length)
    


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: map every cleaned tweet to its sequence of word indices.
token_sequences = tokenizer.texts_to_sequences(X_list)
# Step 2: bring all sequences to the common length `input_length`.
X_tokenized = pad_sequences(token_sequences, maxlen=input_length)

print("X.shape:", X_tokenized.shape)

# Define the same Bi-LSTM model architecture as used during training

In [None]:
def getModel(vocab_size=None, embedding_dim=100, sequence_length=None):
    """Build the (uncompiled) Bi-LSTM sentiment classifier.

    Architecture, in order: frozen Embedding -> two stacked bidirectional
    LSTMs (100 units each, dropout 0.3, returning full sequences) ->
    Conv1D(100 filters, kernel size 5, ReLU) -> GlobalMaxPool1D ->
    Dense(16, ReLU) -> Dense(1, sigmoid).

    Called without arguments it reproduces the configuration used by this
    notebook, which is what the stored weights were trained with.

    Parameters
    ----------
    vocab_size : int, optional
        Number of rows of the embedding matrix. Defaults to the notebook-level
        `vocab_length` derived from the loaded tokenizer.
    embedding_dim : int, optional
        Width of the embedding vectors. The default of 100 is the dimension
        chosen for the Word2Vec model used in the embedding layer.
    sequence_length : int, optional
        Number of tokens per input. Defaults to the notebook-level
        `input_length`, i.e. the same value the tweets are padded to, so the
        model and the padding cannot drift apart.

    Returns
    -------
    tensorflow.keras.Sequential
        The model, named "Sentiment_Model". It still has to be compiled.
    """
    # Resolve the defaults at call time so the notebook globals only need to
    # exist once the function is actually called.
    if vocab_size is None:
        vocab_size = vocab_length
    if sequence_length is None:
        sequence_length = input_length

    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,      # Dimension chosen for the Word2Vec model used in the embedding layer
                                input_length=sequence_length,  # Max word length of tweets, set when padding the input
                                trainable=False)               # embedding weights stay fixed

    model = Sequential([
        embedding_layer,
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Conv1D(100, 5, activation='relu'),
        GlobalMaxPool1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ],
    name="Sentiment_Model")
    return model

# Create model

In [None]:
# Instantiate the network and compile it for binary classification, then
# print the layer overview.
training_model = getModel()
training_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
training_model.summary()

# Load weights from previous training
Loading weights from the model that was trained for 15 epochs, achieving about 85% accuracy on the Sentiment140 dataset.

In [None]:
# Path that is handed to `load_weights` in a later cell, and its directory.
checkpoint_path = '../input/bilstm-15-model-epochs-trained-on-sentiment140/Twitter-Sentiment-LSTM/variables/variables'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Show what is inside the checkpoint directory. This uses the standard library
# instead of the `!ls` shell escape so the cell also works where no `ls`
# command exists. A missing directory is only reported here, not raised, so
# the weight-loading cell can still handle that case itself.
if os.path.isdir(checkpoint_dir):
    print("\n".join(sorted(os.listdir(checkpoint_dir))))
else:
    print("Checkpoint directory not found:", checkpoint_dir)

In [None]:
# store weights before loading pre-trained weights, so that we can verify
# afterwards that loading really changed them
preloaded_layers = list(training_model.layers)
preloaded_weights = [layer.get_weights() for layer in preloaded_layers]


def _weights_unchanged(current, previous):
    """Return True when two `get_weights()` lists hold identical arrays.

    The arrays are compared one by one: a layer usually owns several tensors
    of different shapes, and passing such a ragged list to `np.array_equal`
    as a whole yields False regardless of the actual values.
    """
    return len(current) == len(previous) and all(
        np.array_equal(cur, prev) for cur, prev in zip(current, previous)
    )


# Defined up front so the flag always exists, whatever happens below.
found_prev_weights = False

# load pre-trained weights
try:
    # Only the load itself is guarded; a bug in the comparison code below
    # must not be reported as "no weights found".
    training_model.load_weights(checkpoint_path)
except Exception as err:
    # The exception type raised for a missing or incompatible checkpoint
    # differs between TensorFlow/Keras versions, hence the broad catch; the
    # actual error is printed rather than swallowed.
    print("Error: no weights found")
    print("Details:", repr(err))
else:
    print("Loading weights from prev training\n")

    # compare previous weights vs loaded weights, layer by layer
    for layer, before in zip(training_model.layers, preloaded_weights):
        after = layer.get_weights()

        if not after:
            # layer has no weights, nothing to compare
            continue

        if _weights_unchanged(after, before):
            print('not loaded', layer.name)
        else:
            print('loaded', layer.name)
            found_prev_weights = True

# Now let's predict

In [None]:
""" 
Will use the LSTM model to predict the sentiment of a tweet. 
"""
def predict(X, model=None):
    """Use the LSTM model to predict the sentiment of tokenized tweets.

    Parameters
    ----------
    X : array-like
        Model input, passed unchanged to the model's `predict` method.
    model : optional
        Any object with a `predict(X)` method. Defaults to the notebook-level
        `training_model`.

    Returns
    -------
    Whatever `model.predict(X)` returns.
    """
    if model is None:
        # Looked up at call time, so the function can be defined before the
        # model exists.
        model = training_model
    return model.predict(X)

In [None]:
# Run the model on all padded tweets. The network ends in a single sigmoid
# unit, so each tweet gets one score; the cells below treat higher scores as
# positive and lower scores as negative.
predictions = predict(X_tokenized)

## Calculate percentage of negative and positive predictions
Later we will filter out predictions that the model is quite uncertain about. These are interpreted as neutral. But before filtering out any neutral predictions, let's check the distribution when we consider all values < 0.5 to be negative and >= 0.5 to be positive.

In [None]:
# Hard labels without a neutral band: 1 where the score is >= 0.5, else 0.
predictions_unfiltered = (predictions >= 0.5).astype(int)

In [None]:
# Count the positive (1) and negative (0) labels and the total.
is_positive = predictions_unfiltered == 1
is_negative = predictions_unfiltered == 0
nr_pos = np.sum(is_positive)
nr_neg = np.sum(is_negative)
nr_tot = len(predictions_unfiltered)

# Share of each class among all predictions.
percentage_pos, percentage_neg = nr_pos / nr_tot, nr_neg / nr_tot

for label, value in (("nr_pos:", nr_pos), ("nr_neg:", nr_neg), ("nr_tot:", nr_tot)):
    print(label, value)

for label, value in (("percentage_pos:", percentage_pos), ("percentage_neg:", percentage_neg)):
    print(label, value)

## Now let's filter out neutral predictions
To support the robustness of the classifier, classifications that are not assigned
a minimum probability of 75% of being in the positive or negative class are interpreted
as neutral. Therefore, a value of 0.25 or below is considered negative (0), a value of 0.75 or above positive (1),
and values strictly between 0.25 and 0.75 neutral (8). 

## Calculate percentage of negative and positive predictions again

In [None]:
# Three-way labelling: 1 for scores >= 0.75, 0 for scores <= 0.25, and the
# marker value 8 (neutral) for everything in between.
predictions_filtered = np.select(
    [predictions >= 0.75, predictions <= 0.25],
    [1, 0],
    default=8,
)

In [None]:
# Count each label; 8 marks the neutral predictions that are left out of the
# percentages below.
neutral_mask = predictions_filtered == 8
nr_neutral = np.sum(neutral_mask)
nr_pos_filtered = np.sum(predictions_filtered == 1)
nr_neg_filtered = np.sum(predictions_filtered == 0)
nr_tot_filtered = len(predictions_filtered) - nr_neutral

# Shares are relative to the non-neutral predictions only.
percentage_pos_filtered, percentage_neg_filtered = (
    nr_pos_filtered / nr_tot_filtered,
    nr_neg_filtered / nr_tot_filtered,
)

print("Removed {} predictions considered neutral\n".format(nr_neutral))
for label, value in (
    ("nr_pos:", nr_pos_filtered),
    ("nr_neg:", nr_neg_filtered),
    ("nr_tot:", nr_tot_filtered),
    ("\npercentage_pos_without_neutral:", percentage_pos_filtered),
    ("percentage_neg_without_neutral:", percentage_neg_filtered),
):
    print(label, value)


# Conclusion

We got a majority of positive tweets about the vaccine. The results are 55% positive and 45% negative before filtering out neutral predictions. After filtering, the share of positive predictions increased to 59%, with 41% negative. 