# Further Data Preprocessing

In [None]:
# Developers: Charles Cutler and Christopher Samson
# Class: CMSC 516 Advanced Natural Language Processing
# The following code was developed for the first programming assignment in the course
# CMSC 516, Advanced Natural Language Processing, at Virginia Commonwealth University
#
# This program was created with the intention of combining what we have learned so far in the course to classify the
# sentiment of Twitter tweets. Sentiment analysis is important when it comes to extracting important information from
# text. Classifying the sentiment of tweets from Twitter specifically is a difficult task due to the informal language
# used. Sentiment classification is determining whether a given sentence or group of sentences reflects a positive
# (think happy or good) sentiment or a negative (think upset or bad) sentiment. We approached the task of
# sentiment analysis with supervised machine learning, extracting features from a set of manually
# classified tweets.
#
# REMINDER!!! READ ME!!!
# Two options to load data into this file for use in training.
#
# <><><><><><><><><><><><><><><><><><>><><><><><><><><><><><><><><><><><><><><><><><>
#
# OPTION 1: GOOGLE COLAB ( RECOMMENDED )
#
# Step 1) Upload the clean version of the data named "Cleaned_Sentiment140_Data.csv"
#    and the GloVe static word embeddings named "glove.twitter.27B.200d.txt" to your Google Drive.
#
# Step 2) UNCOMMENT the following two lines to allow google colab to mount your google drive: 
#
# from google.colab import drive ## UNCOMMENT ME, THIS IS ONE LINE!!
# drive.mount('/content/drive') ## UNCOMMENT ME, THIS IS THE OTHER LINE!!
#
# Step 3) USE the following two lines in place of a local file path name.
#   NOTICE: If you upload these files into a subfolder you must change the google drive paths below to match!!!
#
#   glove_file_name = str('/content/drive/MyDrive/glove.twitter.27B.200d.txt') ## CURRENTLY I AM IN THE CODE BELOW!
#   data_file_name = str('/content/drive/MyDrive/Cleaned_Sentiment140_Data.csv') ## CURRENTLY I AM IN THE CODE BELOW
#
# <><><><><><><><><><><><><><><><><><>><><><><><><><><><><><><><><><><><><><><><><><>
#
# OPTION 2: LOCALLY
# 1) Replace the file pathnames in the following two lines with the local file paths to the 
# clean version of the data named "Cleaned_Sentiment140_Data.csv" and the Glove static word 
# embeddings named "glove.twitter.27B.200d.txt":
#
#   glove_file_name = str('/content/drive/MyDrive/glove.twitter.27B.200d.txt')   ## I LOOK LIKE THIS, REPLACE ME BELOW!
#   data_file_name = str('/content/drive/MyDrive/Cleaned_Sentiment140_Data.csv') ## I LOOK LIKE THIS, REPLACE ME BELOW!
#
# <><><><><><><><><><><><><><><><><><>><><><><><><><><><><><><><><><><><><><><><><><>

# If you do not use Google Colab, make sure to install these python libraries.
# Installation instructions can be found at:
#
# https://www.tensorflow.org/install
# https://numpy.org/install/
# https://www.tutorialspoint.com/keras/keras_installation.htm
# https://scikit-learn.org/stable/install.html
#

import tensorflow as tf
from tensorflow import keras
import csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras_preprocessing.sequence import pad_sequences
from keras import optimizers
from sklearn.model_selection import train_test_split

# For debugging purposes, or in case you need to verify the installed version of TensorFlow
print(tf.__version__)

# These are file pathnames, IN GOOGLE DRIVE WHEN USING GOOGLE COLAB, to the clean version of the data named
# "Cleaned_Sentiment140_Data.csv" and the GloVe static word embeddings named "glove.twitter.27B.200d.txt".
#
# Alternatively, these CAN be replaced with local file pathnames to the same files if they are downloaded
# to your computer and you intend to run this code locally.
glove_file_name = str('/content/drive/MyDrive/glove.twitter.27B.200d.txt')
data_file_name = str('/content/drive/MyDrive/Cleaned_Sentiment140_Data.csv')

# Adjust these hyperparameters in the case of memory issues or in the case of curiosity
lr = 0.0001 # Learning Rate
bs = 320 # Batch Size
epochs = 10 # Number of Epochs

# These depend on the static word embeddings you use to represent the words of a tweet.
#
# "numberOfEmbeddings" is the number of rows in the embedding collection. The loading code below stores
# the n-th word it reads in row n and leaves row 0 for the padding token, so this value must be at least
# the number of words in your embeddings datafile plus one.
#
# "dimensions" is the number of "features" or numbers in each static word embedding.
# We used "glove.twitter.27B.200d.txt" which provides a numerical vector of length 200
# for each word in its datafile.
numberOfEmbeddings = 1193515
dimensions = 200

# Used to build the collection of static word embeddings.
# The vectors are parsed as float32 when the datafile is read, so the collection is allocated as float32
# as well; the NumPy default of float64 would double the memory used by this (very large) array.
embeddings = np.zeros((numberOfEmbeddings, dimensions), dtype=np.float32)

# Build a hashmap to later convert tweets from lists of words into lists of integers.
# Each integer is the row of "embeddings" that holds the static word embedding of the word that was
# previously at that position in the tweet.
#
# Together "embeddings" and "embeddings_indexes" allow for the neural network to create a numerical matrix for each tweet.
# The main idea is that this numerical matrix not only represents the words in a tweet but also captures something more that
# the neural network can learn.
embeddings_indexes = {}
embeddings_indexes["padding_token"] = 0 # Row 0 stays all zeros and is used for padding


# One list of integers per kept tweet, and the matching sentiment label for each of them
tweetVectors = []
classifications = []

# Go through all of the static word embeddings in the GloVe datafile and
# extract each word together with its numerical static word embedding.
#
# The file is opened explicitly as UTF-8 so that reading it does not depend on the platform's default
# encoding, and it is streamed line by line so the whole (very large) file is never held in memory.
with open(glove_file_name, encoding="utf-8") as glove_200embeddings:

    # At this point we have not seen any words nor their word embeddings
    index = 0
    for line in glove_200embeddings:

        tokenized_data_line = line.split()

        # A usable line is exactly one word followed by "dimensions" numbers.
        # Anything shorter or longer was not read correctly, so we skip over it.
        if len(tokenized_data_line) != dimensions + 1:
            continue

        # Extract the word
        embedding_text = tokenized_data_line[0]

        # Extract the numerical static word embedding and convert it into floats
        embedding_values = np.asarray(tokenized_data_line[1:], dtype=np.float32)

        # Increase the number of words we have seen up to this point.
        # Row 0 is reserved for the padding token, so the n-th word seen is stored in row n.
        index += 1

        # Remember where this word is placed in the embedding collection
        embeddings_indexes[embedding_text] = index

        # Place the numerical static word embedding for the word in the embedding collection
        embeddings[index] = embedding_values

# Open the dataset of tweets and their respective sentiments.
# Once open, go through each tweet and convert it from a list of words into a list of integers.
# Recall these integers are the rows of "embeddings" that hold the static word embedding of the word
# that was previously in the tweet at that position.
#
# newline='' lets the csv module handle line endings itself, as the csv documentation recommends.
with open(data_file_name, newline='') as inputDataFile:

    count = 0 # At this point we have not seen any tweets
    max_tweet_length = 0

    # Use the built in CSV reader to read from the file containing the tweet data
    csvreader = csv.reader(inputDataFile)

    # Skip over the header of the CSV (the default keeps an empty file from raising StopIteration)
    next(csvreader, None)

    for rowOfData in csvreader:

        count += 1 # Increase the number of tweets we have seen

        # Blank or truncated rows do not have the columns we read below, so skip over them
        if len(rowOfData) < 3:
            continue

        # Retrieve useful information from each data row
        # This includes the annotated sentiment and the actual text of the tweet
        classifiedSentiment = rowOfData[0]
        tweetText = rowOfData[2]

        # If the tweet is empty or the sentiment is missing skip over the tweet
        if len(tweetText) == 0 or len(classifiedSentiment) == 0:
            continue

        # Tokenize the tweet, that is split the tweet into its individual words or tokens.
        tweetTokens = tweetText.split()
        numberOfTokens = len(tweetTokens)

        # Check to see if this is the longest tweet we have seen.
        # This is important because later the neural network needs to have a fixed maximum length
        # and every tweet needs to be of this length!
        # The length does not change while the tokens are visited, so it is checked once per tweet.
        if max_tweet_length < numberOfTokens:
            max_tweet_length = numberOfTokens

        # Keep, in order, the position of the static word embedding of every token that has one.
        # These positions are relative to the embeddings collection that was created earlier.
        singleVector = [embeddings_indexes[tok] for tok in tweetTokens if tok in embeddings_indexes]

        # Account for the possibility that none of the tokens within the tweet had a static word embedding
        if len(singleVector) == 0:
            continue

        # In the original dataset the positive and negative sentiment classes were 0 and 4.
        # Thus for binary classification we convert labels to proper label values of 0 and 1
        classifiedSentiment = 1 if int(classifiedSentiment) > 0 else 0

        # Store the list of integers and the annotated sentiment to their respective lists
        tweetVectors.append(singleVector)
        classifications.append([classifiedSentiment])

# The neural network expects every tweet to have the same length, which is not naturally true,
# so the end of each shorter tweet is padded with zeros up to the longest tweet length seen.
# Zero is the position assigned to "padding_token" in "embeddings_indexes".
# The padded result is stored as a numpy array of 32-bit integers.
tweetVectors = np.asarray(
    pad_sequences(tweetVectors, maxlen=max_tweet_length, padding="post"),
    dtype=np.int32,
)

# The sentiment labels are converted into a numpy array of 32-bit integers as well
classifications = np.asarray(classifications, dtype=np.int32)

# For debugging purposes we include a visual check that the shapes of the two arrays are correct
for builtArray in (tweetVectors, classifications):
    print(builtArray.shape)

# Building The Model

In [None]:
print('\nbuilding model')

# Define the Model
# We use a sequential convolutional neural network
model = Sequential()

# We use an embedding layer so that the neural network creates the numerical matrix for each tweet.
# It looks up every integer of a tweet in "embeddings" (the integers come from "embeddings_indexes"),
# converting the list of integers into a two dimensional numerical matrix that represents the tweet.
# trainable=False keeps the pre-trained static word embeddings fixed during training.
model.add(Embedding(numberOfEmbeddings, dimensions, weights=[embeddings], input_length=max_tweet_length, trainable=False))

# A single convolution over windows of 3 neighbouring tokens, followed by max pooling over pairs of positions
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))

# Flatten the pooled features and pass them through two fully connected layers.
# The final single sigmoid unit outputs a value between 0 and 1 for the binary sentiment label.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# An optimizer is selected with the set learning rate, the loss function is selected, and the model is compiled.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# A summary of the structure is printed to the screen for visual inspection.
# summary() prints the table itself and returns None, so it is called directly rather than wrapped
# in print(), which would add a stray "None" line after the table.
print('\nfitting model')
model.summary()


building model

fitting model
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 200)           238703000 
                                                                 
 conv1d (Conv1D)             (None, 49, 32)            19232     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 24, 32)           0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 768)               0         
                                                                 
 dense (Dense)               (None, 256)               196864    
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                         

# Train The Model

In [None]:
# Carve the data set into three pieces: 80% training, 10% development (validation), and 10% test.
# A 20% hold-out is taken first and is then divided evenly into the test and validation sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    tweetVectors, classifications, test_size=0.2, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)
del X_holdout, y_holdout # only needed as an intermediate step

# For visual inspection the shape of each data set is printed to the screen
for title, features, labels in (
    ('Training Data Shapes', X_train, y_train),
    ('Validation Data Shapes', X_val, y_val),
    ('Evaluation Data Shapes', X_test, y_test),
):
    print(title)
    print(features.shape)
    print(labels.shape)

# Train on the training data with the selected number of epochs and batch size.
# The validation set is passed in so that, after every epoch, the model is also scored on data
# that is never used for training.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    shuffle=True,
    verbose=1,
    batch_size=bs,
    validation_data=(X_val, y_val),
)

Training Data Shapes
(1270930, 49)
(1270930, 1)
Validation Data Shapes
(158867, 49)
(158867, 1)
Evaluation Data Shapes
(158866, 49)
(158866, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f722b674390>

# Perform Automated Evaluation on the Test Data Set

In [None]:
# Score the trained model on the held-out test data using Keras' built in evaluation
print('\nevaluating model')
evaluation = model.evaluate(X_test, y_test, verbose=0)

# The model was compiled with a single metric, so the result is the loss followed by the accuracy
loss, acc = evaluation
print(f'\nTest Accuracy: {acc*100:f}')


evaluating model

Test Accuracy: 77.890801


# Perform Manual Evaluation on the Test Data Set

In [None]:
# This code calculates the four values of a confusion matrix for the test data set.
# With these four values precision, recall, and f1 scores can be calculated.

# Run the model over the test set in batches. predict() splits the data into batches for us, whereas
# pushing the entire test set through the model in one forward pass needs far more memory at once.
predicted_values = model.predict(X_test, batch_size=bs, verbose=0)

# Since the predictions are between 0 and 1,
# if they are closer to 1, greater than or equal to 0.5, we say it was predicted as a 1.
# Otherwise we say it was predicted as a 0.
predicted_labels = (np.asarray(predicted_values).reshape(-1) >= 0.5).astype(np.int32)
true_labels = np.asarray(y_test).reshape(-1)

# Compare every prediction with its true label at once instead of one sample at a time
is_correct = predicted_labels == true_labels

# If the prediction and true label match, that is the prediction is correct,
# we count it as a true class prediction for whichever label it was
true_negative = int(np.sum(is_correct & (true_labels == 0)))
true_positive = int(np.sum(is_correct & (true_labels != 0)))

# Otherwise, if the prediction and true label do not match, that is the prediction is incorrect,
# we count it as a false class prediction for whichever label was predicted
false_negative = int(np.sum(~is_correct & (predicted_labels == 0)))
false_positive = int(np.sum(~is_correct & (predicted_labels != 0)))

# Print the counts of the confusion matrix classes
# These can be used to calculate precision, recall, and f1 scores
print(f'True Negative: {true_negative}')
print(f'True Positive: {true_positive}')
print(f'False Negative: {false_negative}')
print(f'False Positive: {false_positive}')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
First 75000
True Negative: 28886
True Positive: 29525
False Negative: 7989
False Positive: 8600
