In [161]:
# Noah Venethongkham, 219660117
# Ashley Thor, 219334909
# Lucas Saechao, 218794239
# CSC 180 - Intelligent Systems

In [162]:
# matplotlib
%matplotlib inline
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt

# numpy and pandas
import numpy as np
import pandas as pd
import multiprocessing

# scikit learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import column_or_1d
import sklearn.feature_extraction.text as sk_text
import skimage.transform

# natural language toolkit
# run pip install nltk
from nltk.corpus import stopwords
import nltk

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# tensorflow and keras
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import LSTM, Input, Dense, Activation, Flatten, Dropout, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Embedding
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# run pip install np_utils
from tensorflow.keras.utils import to_categorical

# python libraries
from collections.abc import Sequence
import requests
import pathlib
import shutil
import string
import json
import time
import csv
import io
import os
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [163]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])
    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

# Plots a confusion matrix for the model
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot an ROC curve
def plot_roc(pred, y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_area_under_curve = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = $0.2f)' % roc_area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show
 
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    
    # clean text by regex
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=><]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\>", " ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\'", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", "\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"covid19", "covid", text)
    text = re.sub(r"covid-19", "covid", text)
    text = re.sub(r"covid - 19", "covid", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub('<.*?>', '', text)
    text = text.split()
    return text

def remove_stopwords(text):
    stopwords_list = stopwords.words('english')
    whitelist = ["n't", "not", "no"]
    words = text.split()
    words_clean = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
    return " ".join(words_clean)
    
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            current_word = w_line[0]
            word_to_vec_map[current_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map
    
def lstm_model(input_shape):
    x_indices = Input(input_shape)
    embeddings = embedding_layer(x_indices)
    x = LSTM(128, return_sequences=True)(embeddings)
    x = Dropout(0.6)(x)
    x = LSTM(128, return_sequences=True)(x)
    x = Dropout(0.6)(x)
    x = LSTM(128)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=x_indices, outputs=x)
    return model

def conv_model(input_shape):
    x_indices = Input(input_shape)
    embeddings = embedding_layer(x_indices)
    x = Conv1D(512, 3, activation='relu')(embeddings)
    x = MaxPooling1D(3)(x)
    x = Conv1D(256, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(256, 3, activation='relu')(x)
    x = Dropout(0.8)(x)
    x = MaxPooling1D(3)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=x_indices, outputs=x)
    return model
    
def predict_sentiments(data, corpus):
    data['sentiment score'] = 0
    corpus = pad_sequences(corpus, maxlen=max_len, padding='post')
    pred = model.predict(corpus)
    data['sentiment score'] = pred
    pred_sentiment = np.array(list(map(lambda x: 'positive' if x > 0.5 else 'negative', pred)))
    data['predicted sentiment'] = 0
    data['predicted sentiment'] = pred_sentiment
    return data
    
# Beep if on a windows machine
if os.name == 'nt':
    def ding():
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)

In [164]:
# Create pandas dataframe output file
df_reddit = pd.read_csv('reddit_vm.csv', encoding="utf-8")
df_reddit = df_reddit[['title', 'body', 'sentiment']].fillna('')
df_reddit['title'] = df_reddit['title'].replace(to_replace='Comment', value='')

print(df_reddit)

                                                  title  \
0     Health Canada approves AstraZeneca COVID-19 va...   
1     COVID-19 in Canada: 'Vaccination passports' a ...   
2     Coronavirus variants could fuel Canada's third...   
3     Canadian government to extend COVID-19 emergen...   
4     Canada: Pfizer is 'extremely committed' to mee...   
...                                                 ...   
1486                                                      
1487                                                      
1488                                                      
1489                                                      
1490                                                      

                                                   body sentiment  
0                                                        positive  
1                                                                  
2                                                                  
3                  

In [165]:
df_reddit.body = df_reddit.body.apply(remove_stopwords)
df_reddit.title = df_reddit.title.apply(remove_stopwords)
print(df_reddit)

                                                  title  \
0     Health Canada approves AstraZeneca COVID-19 va...   
1     COVID-19 Canada: 'Vaccination passports' near ...   
2     Coronavirus variants could fuel Canada's third...   
3     Canadian government extend COVID-19 emergency ...   
4     Canada: Pfizer 'extremely committed' meeting v...   
...                                                 ...   
1486                                                      
1487                                                      
1488                                                      
1489                                                      
1490                                                      

                                                   body sentiment  
0                                                        positive  
1                                                                  
2                                                                  
3                  

In [166]:
df_sentiment = df_reddit.sentiment
df_sentiment.replace('', 'negative', inplace=True)
df_sentiment

0       positive
1       negative
2       negative
3       negative
4       positive
          ...   
1486    positive
1487    negative
1488    positive
1489    positive
1490    negative
Name: sentiment, Length: 1491, dtype: object

In [167]:
df_title = df_reddit.title
df_title.replace('', np.nan, inplace=True)
df_title.to_frame(name="text")
df_body = df_reddit.body
df_body.to_frame(name="text")
df_body.replace('', np.nan, inplace=True)

In [168]:
df_title_sentiment = pd.concat([df_title, df_sentiment], axis=1)
df_title_sentiment

Unnamed: 0,title,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,positive
1,COVID-19 Canada: 'Vaccination passports' near ...,negative
2,Coronavirus variants could fuel Canada's third...,negative
3,Canadian government extend COVID-19 emergency ...,negative
4,Canada: Pfizer 'extremely committed' meeting v...,positive
...,...,...
1486,,positive
1487,,negative
1488,,positive
1489,,positive


In [169]:
df_body_sentiment = pd.concat([df_body, df_sentiment], axis=1)
df_body_sentiment

Unnamed: 0,body,sentiment
0,,positive
1,,negative
2,,negative
3,,negative
4,,positive
...,...,...
1486,The problem calculations idea layperson napkin...,positive
1487,created Vaxfact site using references reliable...,negative
1488,>The information provided not wrong You've rep...,positive
1489,Basically nothing. >Autoimmunity central nervo...,positive


In [170]:
df_title_clean = df_title_sentiment.dropna().drop_duplicates().reset_index(drop=True)
df_title_clean

Unnamed: 0,title,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,positive
1,COVID-19 Canada: 'Vaccination passports' near ...,negative
2,Coronavirus variants could fuel Canada's third...,negative
3,Canadian government extend COVID-19 emergency ...,negative
4,Canada: Pfizer 'extremely committed' meeting v...,positive
...,...,...
445,father five unvaccinated children. Am unfit pa...,negative
446,Love Them. Protect Them. Never Inject Them.,negative
447,Vaccines Are Just Asping For Trouble,negative
448,Dr. Harper explained presentation cervical can...,negative


In [171]:
df_body_clean = df_body_sentiment.dropna().drop_duplicates().reset_index(drop=True)
df_body_clean

Unnamed: 0,body,sentiment
0,Your OP. It's not myth. Only one vaccine conta...,negative
1,https://youtu.be/zBkVCpbNnkU,positive
2,Because Anti-Vaxxers no sense,positive
3,"What mean ""your OP"". fairly new reddit.",negative
4,"When say there's no thimerasol, mean childhood...",negative
...,...,...
1093,The problem calculations idea layperson napkin...,positive
1094,created Vaxfact site using references reliable...,negative
1095,>The information provided not wrong You've rep...,positive
1096,Basically nothing. >Autoimmunity central nervo...,positive


In [172]:
df_corpus = pd.DataFrame(index=range(0, 1524), columns=['text', 'sentiment'], dtype='object')
df_corpus_text = pd.concat([df_title_clean.title, df_body_clean.body])
df_corpus_sentiment = pd.concat([df_title_clean.sentiment, df_body_clean.sentiment])
df_corpus_text = df_corpus_text.to_frame(name='text')
df_corpus_text.reset_index(inplace=True)
df_corpus_sentiment = df_corpus_sentiment.to_frame(name='sentiment')
df_corpus_sentiment.reset_index(inplace=True)
df_corpus.text = df_corpus_text.text
df_corpus.sentiment = df_corpus_sentiment.sentiment

In [173]:
df_corpus.head()

Unnamed: 0,text,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,positive
1,COVID-19 Canada: 'Vaccination passports' near ...,negative
2,Coronavirus variants could fuel Canada's third...,negative
3,Canadian government extend COVID-19 emergency ...,negative
4,Canada: Pfizer 'extremely committed' meeting v...,positive


In [174]:
texts = df_corpus.text
sentiments = df_corpus.sentiment

In [175]:
corpus_list = []
for i in range(len(texts)):
    corpus_list.append(texts[i])

In [176]:
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiments)))

In [177]:
y

array([1, 0, 0, ..., 0, 1, 0])

In [178]:
x_train, x_test, y_train, y_test = train_test_split(corpus_list, y, test_size=0.3, random_state=45)

In [179]:
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

1066
458
1066
458


In [180]:
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(x_train)
word_to_index = tokenizer.word_index

In [181]:
glove_file_loc = os.path.join(
    os.path.expanduser('~'), '.keras/datasets/glove.6B.300d.txt'
)
word_to_vec_map = read_glove_vector(glove_file_loc)

In [182]:
max_len = 150

In [183]:
vocab_len = len(word_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]
emb_matrix = np.zeros((vocab_len + 1, embed_vector_len))
print(vocab_len)
print(embed_vector_len)
print(emb_matrix.shape)

6963
300
(6964, 300)


In [184]:
for word, index in word_to_index.items():
    embedding_vec = word_to_vec_map.get(word)
    if embedding_vec is not None:
        emb_matrix[index, :] = embedding_vec

In [185]:
embedding_layer = Embedding(
    input_dim=vocab_len + 1,
    output_dim=embed_vector_len,
    input_length=max_len,
    weights=[emb_matrix],
    trainable=False
)

In [186]:
model = lstm_model((max_len,))
model.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 150, 300)          2089200   
_________________________________________________________________
lstm_12 (LSTM)               (None, 150, 128)          219648    
_________________________________________________________________
dropout_12 (Dropout)         (None, 150, 128)          0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 150, 128)          131584    
_________________________________________________________________
dropout_13 (Dropout)         (None, 150, 128)          0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               1315

In [187]:
convolution_model = conv_model((max_len,))
convolution_model.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 150)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 150, 300)          2089200   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 148, 512)          461312    
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 49, 512)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 47, 256)           393472    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 15, 256)           0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 13, 256)           1968

In [188]:
x_train_indices = tokenizer.texts_to_sequences(x_train)

In [189]:
x_train_indices = pad_sequences(x_train_indices, maxlen=max_len, padding='post')
print(x_train_indices.shape)

(1066, 150)


In [190]:
x_test_indices = tokenizer.texts_to_sequences(x_test)
x_test_indices = pad_sequences(x_test_indices, maxlen=max_len, padding='post')

In [191]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
convolution_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [192]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=10, verbose=2, mode='auto')
checkpoint = ModelCheckpoint(filepath="best_weights_conv1d.hdf5", verbose=0, save_best_only=True)

In [193]:
# Conv1D model
convolution_model.fit(x_train_indices, y_train, batch_size=64, callbacks=[monitor, checkpoint], epochs=15, validation_data=(x_test_indices, y_test))

Train on 1066 samples, validate on 458 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x19217e1e488>

In [194]:
convolution_model.load_weights('best_weights_conv1d.hdf5')

In [195]:
# LSTM model
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=10, verbose=2, mode='auto')
checkpoint = ModelCheckpoint(filepath="best_weights_lstm.hdf5", verbose=0, save_best_only=True)

model.fit(x_train_indices, y_train, batch_size=64, callbacks=[monitor, checkpoint], epochs=15, validation_data=(x_test_indices, y_test))

model.load_weights('best_weights_lstm.hdf5')

Train on 1066 samples, validate on 458 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

KeyboardInterrupt: 

In [149]:
# x_test_indices = tokenizer.texts_to_sequences(x_test)
# x_test_indices = pad_sequences(x_test_indices, maxlen=max_len, padding='post')

In [150]:
model.evaluate(x_test_indices, y_test)



[0.4560497041352451, 0.8253275]

In [151]:
convolution_model.evaluate(x_test_indices, y_test)



[0.4360146106070306, 0.8122271]

In [152]:
predictions = convolution_model.predict(x_test_indices)

In [153]:
n = np.random.randint(0, len(x_test))
x_test[n]

'How manufacturing vaccine pollute? There isn’t anything burned'

In [154]:
if predictions[n] > 0.5:
    print('predicted sentiment is positive')
else:
    print('predicted sentiment is negative')
    
if y_test[n] == 1:
    print('correct sentiment is positive')
else:
    print('correct sentiment is negative')

predicted sentiment is negative
correct sentiment is negative


In [155]:
print(predictions[n])
print(y_test[n])

[0.17808822]
0


In [156]:
convolution_model.save_weights('best_weights_conv1d.hdf5')

In [157]:
corpus_tokens = tokenizer.texts_to_sequences(corpus_list)

In [158]:
data = df_corpus
data = predict_sentiments(data, corpus_tokens)

In [159]:
data[['text', 'sentiment', 'sentiment score', 'predicted sentiment']].to_csv('prediction.csv', index=False)

In [160]:
data

Unnamed: 0,text,sentiment,sentiment score,predicted sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,positive,0.160525,negative
1,COVID-19 Canada: 'Vaccination passports' near ...,negative,0.160530,negative
2,Coronavirus variants could fuel Canada's third...,negative,0.160525,negative
3,Canadian government extend COVID-19 emergency ...,negative,0.160525,negative
4,Canada: Pfizer 'extremely committed' meeting v...,positive,0.160523,negative
...,...,...,...,...
1519,"There still 100,000 deaths measles every year ...",negative,0.160531,negative
1520,What qualifies something toxin? What ppb even ...,negative,0.160524,negative
1521,You answer question. You said can't cause dama...,negative,0.160535,negative
1522,"Yeah, long time ago, vaccines weren’t safe tod...",positive,0.160523,negative
