In [67]:
# Noah Venethongkham, 219660117
# Ashley Thor, 219334909
# Lucas Saechao, 218794239
# CSC 180 - Intelligent Systems

In [89]:
# matplotlib
%matplotlib inline
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt

# numpy and pandas
import numpy as np
import pandas as pd
import multiprocessing
from unidecode import unidecode

# scikit learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import column_or_1d
import sklearn.feature_extraction.text as sk_text
import skimage.transform

# natural language toolkit
# run pip install nltk
from nltk.corpus import stopwords
import nltk

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# tensorflow and keras
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import LSTM, Input, Dense, Activation, Flatten, Dropout, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Embedding
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# run pip install np_utils
from tensorflow.keras.utils import to_categorical

# python libraries
from collections.abc import Sequence
import requests
import pathlib
import shutil
import string
import json
import time
import csv
import io
import os
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\19165\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [93]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])
    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

# Plots a confusion matrix for the model
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot an ROC curve
def plot_roc(pred, y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_area_under_curve = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = $0.2f)' % roc_area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show
 
def text_to_word_list(text, unidec):
    text = unidec(text)
    text = str(text)
    text = text.lower()
    
    # clean text by regex
    text = re.sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\>", " ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\'", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", "\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"covid19", "covid", text)
    text = re.sub(r"covid-19", "covid", text)
    text = re.sub(r"covid - 19", "covid", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub('<.*?>', '', text)
    text = text.split()
    return text

def remove_stopwords(text):
    stopwords_list = stopwords.words('english')
    whitelist = ["n't", "not", "no"]
    words = text.split()
    words_clean = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
    return " ".join(words_clean)
    
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            current_word = w_line[0]
            word_to_vec_map[current_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map
    
def lstm_model(input_shape):
    x_indices = Input(input_shape)
    embeddings = embedding_layer(x_indices)
    x = LSTM(128, return_sequences=True)(embeddings)
    x = Dropout(0.6)(x)
    x = LSTM(128, return_sequences=True)(x)
    x = Dropout(0.6)(x)
    x = LSTM(128)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=x_indices, outputs=x)
    return model

def conv_model(input_shape):
    x_indices = Input(input_shape)
    embeddings = embedding_layer(x_indices)
    x = Conv1D(512, 3, activation='relu')(embeddings)
    x = MaxPooling1D(3)(x)
    x = Conv1D(256, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(256, 3, activation='relu')(x)
    x = Dropout(0.8)(x)
    x = MaxPooling1D(3)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=x_indices, outputs=x)
    return model
    
def predict_sentiments(data, corpus):
    data['sentiment score'] = 0
    corpus = pad_sequences(corpus, maxlen=max_len, padding='post')
    pred = model.predict(corpus)
    data['sentiment score'] = pred
    pred_sentiment = np.array(list(map(lambda x: 'positive' if x > 0.5 else 'negative', pred)))
    data['predicted sentiment'] = 0
    data['predicted sentiment'] = pred_sentiment
    return data
    
# Beep if on a windows machine
if os.name == 'nt':
    def ding():
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)

In [94]:
# Create pandas dataframe output file
df_reddit = pd.read_csv('reddit_vm.csv', encoding="utf-8")
df_reddit = df_reddit[['title', 'body', 'sentiment']].fillna('')
df_reddit['title'] = df_reddit['title'].replace(to_replace='Comment', value='')

print(df_reddit)

                                                  title  \
0     Health Canada approves AstraZeneca COVID-19 va...   
1     COVID-19 in Canada: 'Vaccination passports' a ...   
2     Coronavirus variants could fuel Canada's third...   
3     Canadian government to extend COVID-19 emergen...   
4     Canada: Pfizer is 'extremely committed' to mee...   
...                                                 ...   
1486                                                      
1487                                                      
1488                                                      
1489                                                      
1490                                                      

                                                   body sentiment  
0                                                        positive  
1                                                                  
2                                                                  
3                  

In [95]:
df_reddit.body = df_reddit.body.apply(remove_stopwords)
df_reddit.title = df_reddit.title.apply(remove_stopwords)

df_sentiment = df_reddit.sentiment
df_sentiment.replace('positive', 1.0, inplace=True)
df_sentiment.replace('', -1.0, inplace=True)
df_sentiment

0       1.0
1      -1.0
2      -1.0
3      -1.0
4       1.0
       ... 
1486    1.0
1487   -1.0
1488    1.0
1489    1.0
1490   -1.0
Name: sentiment, Length: 1491, dtype: float64

In [96]:
df_title = df_reddit.title
df_title.replace('', np.nan, inplace=True)
df_title.to_frame(name="text")
df_body = df_reddit.body
df_body.to_frame(name="text")
df_body.replace('', np.nan, inplace=True)

In [97]:
df_title_sentiment = pd.concat([df_title, df_sentiment], axis=1)
df_title_sentiment

Unnamed: 0,title,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,1.0
1,COVID-19 Canada: 'Vaccination passports' near ...,-1.0
2,Coronavirus variants could fuel Canada's third...,-1.0
3,Canadian government extend COVID-19 emergency ...,-1.0
4,Canada: Pfizer 'extremely committed' meeting v...,1.0
...,...,...
1486,,1.0
1487,,-1.0
1488,,1.0
1489,,1.0


In [98]:
df_body_sentiment = pd.concat([df_body, df_sentiment], axis=1)
df_body_sentiment

Unnamed: 0,body,sentiment
0,,1.0
1,,-1.0
2,,-1.0
3,,-1.0
4,,1.0
...,...,...
1486,The problem calculations idea layperson napkin...,1.0
1487,created Vaxfact site using references reliable...,-1.0
1488,>The information provided not wrong You've rep...,1.0
1489,Basically nothing. >Autoimmunity central nervo...,1.0


In [99]:
df_title_clean = df_title_sentiment.dropna().drop_duplicates().reset_index(drop=True)
df_title_clean

Unnamed: 0,title,sentiment
0,Health Canada approves AstraZeneca COVID-19 va...,1.0
1,COVID-19 Canada: 'Vaccination passports' near ...,-1.0
2,Coronavirus variants could fuel Canada's third...,-1.0
3,Canadian government extend COVID-19 emergency ...,-1.0
4,Canada: Pfizer 'extremely committed' meeting v...,1.0
...,...,...
445,father five unvaccinated children. Am unfit pa...,-1.0
446,Love Them. Protect Them. Never Inject Them.,-1.0
447,Vaccines Are Just Asping For Trouble,-1.0
448,Dr. Harper explained presentation cervical can...,-1.0


In [100]:
df_body_clean = df_body_sentiment.dropna().drop_duplicates().reset_index(drop=True)
df_body_clean

Unnamed: 0,body,sentiment
0,Your OP. It's not myth. Only one vaccine conta...,-1.0
1,https://youtu.be/zBkVCpbNnkU,1.0
2,Because Anti-Vaxxers no sense,1.0
3,"What mean ""your OP"". fairly new reddit.",-1.0
4,"When say there's no thimerasol, mean childhood...",-1.0
...,...,...
1093,The problem calculations idea layperson napkin...,1.0
1094,created Vaxfact site using references reliable...,-1.0
1095,>The information provided not wrong You've rep...,1.0
1096,Basically nothing. >Autoimmunity central nervo...,1.0


In [101]:
df_corpus = pd.DataFrame(index=range(0, 1524), columns=['text', 'sentiment'], dtype='object')
df_corpus_text = pd.concat([df_title_clean.title, df_body_clean.body])
df_corpus_sentiment = pd.concat([df_title_clean.sentiment, df_body_clean.sentiment])
df_corpus_text = df_corpus_text.to_frame(name='text')
df_corpus_text.reset_index(inplace=True)
df_corpus_sentiment = df_corpus_sentiment.to_frame(name='sentiment')
df_corpus_sentiment.reset_index(inplace=True)
df_corpus.text = df_corpus_text.text
df_corpus.sentiment = df_corpus_sentiment.sentiment

In [102]:
df_corpus.text = df_corpus.text.apply(lambda x: text_to_word_list(x, unidecode))

In [103]:
df_corpus.head()

Unnamed: 0,text,sentiment
0,"[health, canada, approves, astrazeneca, covid,...",1.0
1,"[covid, 19, canada, vaccination, passports, ne...",-1.0
2,"[coronavirus, variants, could, fuel, canada, t...",-1.0
3,"[canadian, government, extend, covid, 19, emer...",-1.0
4,"[canada, pfizer, extremely, committed, meeting...",1.0


In [104]:
df_corpus.sentiment.value_counts() / len(df_corpus)

-1.0    0.833333
 1.0    0.166667
Name: sentiment, dtype: float64

In [105]:
df_corpus[df_corpus.sentiment==1]

Unnamed: 0,text,sentiment
0,"[health, canada, approves, astrazeneca, covid,...",1.0
4,"[canada, pfizer, extremely, committed, meeting...",1.0
5,"[canada, oxford, astrazeneca, vaccine, approva...",1.0
6,"[fuck, anti, vaxxing, retards]",1.0
8,"[magnetic, therapy, covid, vaccines]",1.0
...,...,...
1513,"[if, point, thimerosal, content, toxic, vaccin...",1.0
1516,"[allow, help, reiterating, person, said, eleme...",1.0
1517,"[same, odds, getting, struck, lightning, would...",1.0
1518,"[she, got, does, not, vaccine, fight, it, and,...",1.0


In [106]:
df_model = df_corpus.copy()
df_model = df_model[df_model.text.str.len() > 1]

In [107]:
sent = [row for row in df_model.text]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[0]

['health', 'canada', 'approves', 'astrazeneca', 'covid_19', 'vaccine']

In [108]:
w2v_model = Word2Vec(
    min_count=3,
    window=4,
    vector_size=300,
    sample=1e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    workers=multiprocessing.cpu_count()-1
)

start = time.time()
w2v_model.build_vocab(sentences, progress_per=50000)
print("Time to build vocab: {} mins".format(round((time.time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [109]:
start = time.time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time.time() - start) / 60, 2)))
w2v_model.init_sims(replace=True)

Time to train the model: 0.02 mins


  after removing the cwd from sys.path.


In [110]:
w2v_model.save("word2vec.model")

In [111]:
file_export = df_model.copy()
file_export['old_text'] = file_export.text
file_export.old_text = file_export.old_text.str.join(' ')
file_export.text = file_export.text.apply(lambda x: ' '.join(bigram[x]))
file_export.sentiment = file_export.sentiment.astype('int8')

In [112]:
file_export[['text', 'sentiment']].to_csv('cleaned.csv', index=False)