In [51]:
# Noah Venethongkham, 219660117
# Ashley Thor, 219334909
# Lucas Saechao, 218794239
# CSC 180 - Intelligent Systems

In [52]:
# matplotlib
%matplotlib inline
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt

# numpy and pandas
import numpy as np
import pandas as pd
import multiprocessing

# scikit learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import column_or_1d
import sklearn.feature_extraction.text as sk_text
import skimage.transform

# natural language toolkit
# run pip install nltk
from nltk.corpus import stopwords
import nltk

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# tensorflow and keras
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import LSTM, Dense, Activation, Flatten, Dropout, Conv2D, MaxPooling2D, Embedding
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# run pip install np_utils
from tensorflow.keras.utils import to_categorical

# python libraries
from collections.abc import Sequence
import requests
import pathlib
import shutil
import string
import json
import time
import csv
import io
import os
import re

# if OS is windows, import chime
if os.name == 'nt':
    import winsound
    
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19165\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [167]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])
    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

# Plots a confusion matrix for the model
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot an ROC curve
def plot_roc(pred, y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_area_under_curve = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = $0.2f)' % roc_area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show
 
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    
    # clean text by regex
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\'", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", "\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"covid19", "covid", text)
    text = re.sub(r"covid-19", "covid", text)
    text = re.sub(r"covid - 19", "covid", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    return text

def remove_stopwords(text):
    stopwords_list = stopwords.words('english')
    whitelist = ["n't", "not", "no"]
    words = text.split()
    words_clean = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
    return " ".join(words_clean)
    
# Beep if on a windows machine
if os.name == 'nt':
    def ding():
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)

In [168]:
def deep_model(model, x_train, y_train, x_test, y_test):
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    history = model.fit(
        x_train,
        y_train,
        epochs=10,
        batch_size=512,
        validation_data=(x_test, y_test),
        verbose=0
    )
    return history

def eval_metric(history, metric_name):
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]
    e = range(1, 11)
    
    plt.plot(e, metric, 'bo', label="Train " + metric_name)
    plt.plot(e, val_metric, 'b', label="Validation " + metric_name)
    plt.legend()
    plt.show()
    
def test_model(model, x_train, y_train, x_test, y_test, epoch):
    model.fit(x_train, y_train, epochs=epoch, batch_size=512, verbose=0)
    results = model.evaluate(x_test, y_test)
    return results

In [169]:
# Create pandas dataframe output file
df_reddit = pd.read_csv('reddit_vm.csv', encoding="utf-8")
df_reddit = df_reddit[['title', 'body', 'score']].fillna('')
df_reddit['title'] = df_reddit['title'].replace(to_replace='Comment', value='')

print(df_reddit)

                                                  title  \
0     Health Canada approves AstraZeneca COVID-19 va...   
1     COVID-19 in Canada: 'Vaccination passports' a ...   
2     Coronavirus variants could fuel Canada's third...   
3     Canadian government to extend COVID-19 emergen...   
4     Canada: Pfizer is 'extremely committed' to mee...   
...                                                 ...   
1486                                                      
1487                                                      
1488                                                      
1489                                                      
1490                                                      

                                                   body  score  
0                                                            7  
1                                                            2  
2                                                            6  
3                              

In [170]:
df_reddit.body = df_reddit.body.apply(remove_stopwords)
print(df_reddit)

                                                  title  \
0     Health Canada approves AstraZeneca COVID-19 va...   
1     COVID-19 in Canada: 'Vaccination passports' a ...   
2     Coronavirus variants could fuel Canada's third...   
3     Canadian government to extend COVID-19 emergen...   
4     Canada: Pfizer is 'extremely committed' to mee...   
...                                                 ...   
1486                                                      
1487                                                      
1488                                                      
1489                                                      
1490                                                      

                                                   body  score  
0                                                            7  
1                                                            2  
2                                                            6  
3                              

In [171]:
covid = ['covid', 'virus', 'disease', 'sick', 'sickness', 'flu', 'vaccine']
for w in covid:
    if w in emb_dict.keys():
        print("Found {} in keys".format(w))

NameError: name 'emb_dict' is not defined

In [172]:
df_title = df_reddit.title
df_title.replace('', np.nan, inplace=True)
df_body = df_reddit.body
df_body.replace('', np.nan, inplace=True)
df_title_clean = df_title.dropna().drop_duplicates().reset_index(drop=True)#.rename('title')
df_body_clean = df_body.dropna().drop_duplicates().reset_index(drop=True)#.rename('title')

df_corpus = pd.concat([df_title_clean, df_body_clean])
df_dummy = pd.DataFrame(index=range(0, 1549), columns=['text', 'sentiment'], dtype='object')
df_cor = df_corpus.to_frame(name='text')
df_cor.reset_index(inplace=True)
df_c = df_cor.drop('index', axis=1)
df_dummy.text = df_c.text
df_dummy.replace(np.nan, 0.0, inplace=True)
df = df_dummy
df

Unnamed: 0,text,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,0.0
1,COVID-19 in Canada: 'Vaccination passports' a ...,0.0
2,Coronavirus variants could fuel Canada's third...,0.0
3,Canadian government to extend COVID-19 emergen...,0.0
4,Canada: Pfizer is 'extremely committed' to mee...,0.0
...,...,...
1544,The problem calculations idea layperson napkin...,0.0
1545,created Vaxfact site using references reliable...,0.0
1546,>The information provided not wrong You've rep...,0.0
1547,Basically nothing. >Autoimmunity central nervo...,0.0


In [173]:
df.text = df.text.apply(lambda x: text_to_word_list(x))

In [174]:
df_model = df.copy()
df_model = df_model[df_model.text.str.len() > 1]

In [175]:
sent = [row for row in df_model.text]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[0]

['health', 'canada', 'approves', 'astrazeneca', 'covid_vaccine']

In [176]:
w2v_model = Word2Vec(
    min_count=3,
    window=4,
    vector_size=300,
    sample=1e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=multiprocessing.cpu_count()-1
)

start = time.time()
w2v_model.build_vocab(sentences, progress_per=50000)
print('Time to build vocab: {} mins'.format(round((time.time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [177]:
start = time.time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to build vocab: {} mins'.format(round((time.time() - start) / 60, 2)))
w2v_model.init_sims(replace=True)

Time to build vocab: 0.02 mins


  """


In [178]:
w2v_model.save("word2vec.model")

In [179]:
df_model

Unnamed: 0,text,sentiment
0,"[health, canada, approves, astrazeneca, covid,...",0.0
1,"[covid, in, canada, :, vaccination, passports,...",0.0
2,"[coronavirus, variants, could, fuel, canada, t...",0.0
3,"[canadian, government, to, extend, covid, emer...",0.0
4,"[canada, :, pfizer, is, extremely, committed, ...",0.0
...,...,...
1544,"[the, problem, calculations, idea, layperson, ...",0.0
1545,"[created, vaxfact, site, using, references, re...",0.0
1546,"[the, information, provided, not, wrong, you, ...",0.0
1547,"[basically, nothing, autoimmunity, central, ne...",0.0


In [181]:
df_export = df_model.copy()
df_export['old_text'] = df_export.text
df_export.old_text = df_export.old_text.str.join(' ')
df_export.text = df_export.text.apply(lambda x: ' '.join(bigram[x]))
df_export.sentiment = df_export.sentiment.astype('int8')

In [182]:
df_export[['text', 'sentiment']].to_csv('cleaned.csv', index=False)