In [1]:
# Noah Venethongkham, 219660117
# Ashley Thor, 219334909
# Lucas Saechao, 218794239
# CSC 180 - Intelligent Systems

In [2]:
# matplotlib
%matplotlib inline
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt

# numpy and pandas
import numpy as np
import pandas as pd
import multiprocessing

# scikit learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import column_or_1d
import sklearn.feature_extraction.text as sk_text
import skimage.transform

# natural language toolkit
# run pip install nltk
from nltk.corpus import stopwords
import nltk

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# tensorflow and keras
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import LSTM, Input, Dense, Activation, Flatten, Dropout, Conv1D, Conv2D, GlobalMaxPooling1D, MaxPooling1D, MaxPooling2D, Embedding
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# to_categorical is imported from tensorflow.keras.utils below; no separate np_utils install is needed
from tensorflow.keras.utils import to_categorical

# python libraries
from collections.abc import Sequence
import requests
import pathlib
import shutil
import string
import json
import time
import csv
import io
import os
import re

# Fetch the NLTK stop-word corpus that remove_stopwords reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucassaechao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    A new indicator column called ``"<name>-<value>"`` is appended for every
    distinct value found in ``df[name]``; the original column is then dropped.
    Nothing is returned -- ``df`` itself is modified.
    """
    indicator_frame = pd.get_dummies(df[name])
    for category in indicator_frame.columns:
        df["{}-{}".format(name, category)] = indicator_frame[category]
    df.drop(labels=name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace the values of ``df[name]`` with integer codes, in place.

    The codes come from a freshly fitted scikit-learn ``LabelEncoder``.

    Returns:
        The encoder's ``classes_`` array, i.e. the original labels.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise ``df[name]`` in place as ``(value - mean) / sd``.

    Args:
        df: DataFrame holding the column.
        name: Column to rescale.
        mean: Centre to subtract; the column's own mean when ``None``.
        sd: Scale to divide by; the column's own ``std()`` when ``None``.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of ``df[name]`` with the column median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing entries of ``df[name]`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into float32 feature and target arrays.

    Every column except ``target`` becomes a feature. When the target column
    has dtype int64 or int32 it is treated as a class label and one-hot
    encoded; any other dtype is returned as a plain float32 vector.

    Returns:
        A tuple ``(x, y)`` of float32 NumPy arrays.
    """
    feature_cols = [col for col in df.columns if col != target]

    # Work out the dtype of the target column.
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]

    if target_type in (np.int64, np.int32):
        # Classification: one indicator column per class.
        labels = pd.get_dummies(df[target]).values
    else:
        # Regression: keep the raw values.
        labels = df[target].values

    features = df[feature_cols].values
    return features.astype(np.float32), labels.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected values against predictions as two line series.

    Args:
        pred: Predicted values, one per sample.
        y: Expected values; flattened before plotting.
        sort: When true, order the samples by expected value first.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame.sort_values(by=['y'],inplace=True)
    # Expected series is drawn first, then the prediction series.
    for column, legend_label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=legend_label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose ``name`` value is an outlier.

    A row is removed when its value lies ``sd`` or more standard deviations
    away from the column mean.
    """
    column = df[name]
    distance = np.abs(column - column.mean())
    outlier_mask = distance >= (sd * column.std())
    df.drop(df.index[outlier_mask], axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale ``df[name]`` in place to ``[normalized_low, normalized_high]``.

    Args:
        df: DataFrame holding the column.
        name: Column to rescale.
        normalized_low: Output value that ``data_low`` maps to.
        normalized_high: Output value that ``data_high`` maps to.
        data_low: Input lower bound; the column minimum when ``None``.
        data_high: Input upper bound; the column maximum when ``None``.
    """
    # Resolve each bound independently. Previously data_high was only derived
    # when data_low was None, so passing just data_low failed with a TypeError
    # and passing just data_high was silently overwritten.
    if data_low is None:
        data_low = min(df[name])
    if data_high is None:
        data_high = max(df[name])
    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

# Plots a confusion matrix for the model
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on the current pyplot figure.

    Args:
        cm: Matrix passed to ``plt.imshow``.
        names: Class names used as tick labels on both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colour map used for the cells.

    The function does not create a figure and does not call ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class name on each axis.
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot an ROC curve
def plot_roc(pred, y):
    """Plot the ROC curve of ``pred`` against the true labels ``y``.

    Args:
        pred: Predicted scores, passed to ``roc_curve`` as the score argument.
        y: True binary labels.

    A new figure is created, the curve is drawn with its AUC in the legend
    next to the chance diagonal, and the figure is shown.
    """
    fpr, tpr, _ = roc_curve(y, pred)
    roc_area_under_curve = auc(fpr, tpr)

    plt.figure()
    # '%0.2f' is the printf-style placeholder; the previous '$0.2f' contained
    # no placeholder, so the % operator raised TypeError.
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    # Call show(); the bare attribute reference did nothing.
    plt.show()
 
def text_to_word_list(text):
    """Clean a raw comment and return it as a single normalised string.

    Despite the name, the result is a string, not a list. The input is
    converted to ``str``, lower-cased and passed through ``remove_stopwords``.
    URLs are then removed, common contractions expanded, punctuation spaced
    out or removed, and a few token spellings unified.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned text.
    """
    text = str(text)
    text = text.lower()
    # Stop words are removed before the contractions below are expanded.
    text = remove_stopwords(text)
    
    # clean text by regex
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    # Inside this class `+-=` is a range ('+' through '='), so digits and
    # , - . / : ; < = are all kept at this step.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=><]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\>", " ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\'", " ", text)
    text = re.sub(r"'", " ", text)
    # Raw string: "\g" in a normal string literal is an invalid escape
    # sequence. The replacement template itself is unchanged (e.g. 5k -> 5000).
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): in a regex, \0 is an octal escape, so this pattern matches
    # a NUL character followed by "s" -- presumably "0s" was intended; confirm.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"covid19", "covid", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub('_', ' ', text)
    return text

def remove_stopwords(text):
    """Drop NLTK English stop words and one-character tokens from ``text``.

    The negations in the whitelist are kept even when they appear in the
    stop-word corpus.

    Args:
        text: Whitespace-separated string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; the corpus list was
    # previously scanned once for every token.
    stopword_set = set(stopwords.words('english'))
    whitelist = {"n't", "not", "no"}
    kept = [
        word for word in text.split()
        if len(word) > 1 and (word in whitelist or word not in stopword_set)
    ]
    return " ".join(kept)
    
def remove_stop_manual(data):
    """Remove a fixed list of English stop words from every string in ``data``.

    Args:
        data: pandas Series of whitespace-separated strings.

    Returns:
        A new Series in which each entry has its stop words removed and the
        remaining tokens joined by single spaces.
    """
    # Named stop_terms so the nltk `stopwords` import is not shadowed; a
    # frozenset makes each membership test constant-time.
    stop_terms = frozenset((
        "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
        "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
        "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
        "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should",
        "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
        "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
        "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
        "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
        "your", "yours", "yourself", "yourselves",
    ))
    return data.apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop_terms])
    )
    
def read_glove_vector(glove_vec):
    """Load a GloVe-style text file into a word -> vector dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as float64 components.

    Args:
        glove_vec: Path to the UTF-8 text file.

    Returns:
        dict mapping each word to a 1-D ``numpy.float64`` array.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line has no word field; indexing it used to raise IndexError.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)
    return word_to_vec_map
    
def lstm_model(input_shape):
    """Build a three-layer stacked LSTM binary classifier.

    The input is fed through the module-level ``embedding_layer``, two
    sequence-returning 128-unit LSTM layers (each followed by 20% dropout),
    a final 128-unit LSTM and a single sigmoid output unit.

    Args:
        input_shape: Shape tuple handed to ``Input``.

    Returns:
        The uncompiled Keras ``Model``.
    """
    token_ids = Input(input_shape)
    hidden = embedding_layer(token_ids)
    for _ in range(2):
        hidden = LSTM(128, return_sequences=True)(hidden)
        hidden = Dropout(0.2)(hidden)
    hidden = LSTM(128)(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)
    return Model(inputs=token_ids, outputs=probability)

def conv_model(input_shape):
    """Build a 1-D convolutional binary classifier.

    The input is fed through the module-level ``embedding_layer``, three
    Conv1D/MaxPooling1D stages (512, 256 and 256 filters, kernel size 3, with
    20% dropout before the last pooling), global max pooling, a 256-unit ReLU
    dense layer and a single sigmoid output unit.

    Args:
        input_shape: Shape tuple handed to ``Input``.

    Returns:
        The uncompiled Keras ``Model``.
    """
    token_ids = Input(input_shape)
    features = embedding_layer(token_ids)

    # First two stages: convolution followed directly by pooling.
    for filters in (512, 256):
        features = Conv1D(filters, 3, activation='relu')(features)
        features = MaxPooling1D(3)(features)

    # Third stage has dropout between the convolution and the pooling.
    features = Conv1D(256, 3, activation='relu')(features)
    features = Dropout(0.2)(features)
    features = MaxPooling1D(3)(features)

    features = GlobalMaxPooling1D()(features)
    features = Dense(256, activation='relu')(features)
    probability = Dense(1, activation='sigmoid')(features)
    return Model(inputs=token_ids, outputs=probability)
    
def predict_sentiments(data, corpus):
    """Score ``corpus`` with the global ``model`` and attach the results to ``data``.

    Args:
        data: DataFrame with one row per sequence in ``corpus``; modified in place.
        corpus: Token-index sequences; post-padded to the global ``max_len``.

    Returns:
        ``data`` with two added (or overwritten) columns: ``'sentiment score'``
        holding the model output and ``'predicted sentiment'`` holding
        ``'positive'`` for scores above 0.5 and ``'negative'`` otherwise.
    """
    padded = pad_sequences(corpus, maxlen=max_len, padding='post')
    # Flatten the model output to one score per row so the column assignment
    # and the threshold below work on plain scalars.
    scores = np.asarray(model.predict(padded)).reshape(-1)
    data['sentiment score'] = scores
    data['predicted sentiment'] = np.where(scores > 0.5, 'positive', 'negative')
    return data
    
# Beep if on a windows machine
if os.name == 'nt':
    # winsound is a Windows-only standard-library module. It was never
    # imported, so calling ding() raised NameError.
    import winsound

    def ding():
        """Sound three 2000 Hz beeps of 300 ms each."""
        for _ in range(3):
            winsound.Beep(2000, 300)

In [4]:
# Load the cleaned Reddit comments, keep the text and its sentiment label,
# and replace missing values with empty strings
df_reddit = (
    pd.read_csv('reddit_vm_clean.csv', encoding="utf-8")[['text', 'sentiment']]
    .fillna('')
)

print(df_reddit)

                                                   text  sentiment
0     0 125 0 82 mg aluminium vaccines applejuice 10...          1
1     1 autism not just genetic behavioural diagnost...          1
2     well many many scientists believe vaccines saf...         -1
3     1 freedom of 2 looking freedom information act...         -1
4     2 never said andrew wakefield jailed lose medi...         -1
...                                                 ...        ...
1318                      youre idiot gave exact answer         -1
1319                youre right also using food example         -1
1320  zerg admit vaccine contains thimerosal contain...         -1
1321  zero links autism charts argument debunked due...          1
1322               zika virus problem vaccine obviously         -1

[1323 rows x 2 columns]


In [5]:
# Map the numeric labels (-1 / 1) to names in one non-mutating replace,
# instead of editing a Series pulled out of the frame in place
df_sentiment = df_reddit['sentiment'].replace({-1: 'negative', 1: 'positive'})
df_reddit['sentiment'] = df_sentiment
df_reddit

Unnamed: 0,text,sentiment
0,0 125 0 82 mg aluminium vaccines applejuice 10...,positive
1,1 autism not just genetic behavioural diagnost...,positive
2,well many many scientists believe vaccines saf...,negative
3,1 freedom of 2 looking freedom information act...,negative
4,2 never said andrew wakefield jailed lose medi...,negative
...,...,...
1318,youre idiot gave exact answer,negative
1319,youre right also using food example,negative
1320,zerg admit vaccine contains thimerosal contain...,negative
1321,zero links autism charts argument debunked due...,positive


In [6]:
# Shuffle the rows with a fixed seed. Without one, the row order differs on
# every run, so the seeded train_test_split below still yields different splits.
df_corpus = df_reddit.sample(frac=1, random_state=45).reset_index(drop=True)

In [7]:
# Peek at the first rows of the shuffled frame
df_corpus.head()

Unnamed: 0,text,sentiment
0,next time try not come things place bad faith ...,negative
1,use youtube video try prove polio vaccine bad ...,negative
2,irrefutable prufe,negative
3,that bit crazy opinion would making illegal no...,negative
4,sure am know mercury vaccines doctors told mil...,negative


In [8]:
# Run every comment through the text-cleaning helper
df_corpus['text'] = df_corpus['text'].apply(text_to_word_list)

In [9]:
# Pull the cleaned text and its label out as separate Series
texts = df_corpus.text
sentiments = df_corpus.sentiment

In [10]:
# Plain Python list of the cleaned comments, in row order
# (df_corpus was re-indexed 0..n-1 after shuffling)
corpus_list = texts.tolist()

In [11]:
# Binary target: 1 for "positive", 0 for anything else
y = np.array([1 if label == "positive" else 0 for label in sentiments])

In [12]:
# Display the binary label vector (1 = positive, 0 = negative)
y

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# Hold out 30% of the comments for testing; random_state fixes the split for a given input order
x_train, x_test, y_train, y_test = train_test_split(corpus_list, y, test_size=0.3, random_state=45)

In [14]:
# Report the size of each split: train texts, test texts, train labels, test labels
for split in (x_train, x_test, y_train, y_test):
    print(len(split))

926
397
926
397


In [15]:
# Fit the vocabulary on the training texts only
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(x_train)
# word -> integer index mapping learned from the training texts
word_to_index = tokenizer.word_index

In [16]:
# Read the glove.6B.300d.txt vectors from ~/.keras/datasets
glove_file_loc = os.path.join(
    os.path.expanduser('~'), '.keras/datasets/glove.6B.300d.txt'
)
word_to_vec_map = read_glove_vector(glove_file_loc)

In [17]:
# Sequence length passed as maxlen to pad_sequences and as input_length to the embedding layer
max_len = 150

In [18]:
vocab_len = len(word_to_index)
# Read the embedding width off one known vector; assumes 'moon' is present in the GloVe file
embed_vector_len = word_to_vec_map['moon'].shape[0]
# One extra row so that indices 1..vocab_len are all addressable; row 0 has no word assigned
emb_matrix = np.zeros((vocab_len + 1, embed_vector_len))
print(vocab_len)
print(embed_vector_len)
print(emb_matrix.shape)

6256
300
(6257, 300)


In [19]:
# Copy the GloVe vector of every vocabulary word that has one;
# words without a vector keep their all-zero row
for word, index in word_to_index.items():
    embedding_vec = word_to_vec_map.get(word)
    if embedding_vec is not None:
        emb_matrix[index, :] = embedding_vec

In [20]:
# Frozen embedding layer initialised from the GloVe matrix.
# lstm_model and conv_model both call this same layer instance.
embedding_layer = Embedding(
    input_dim=vocab_len + 1,
    output_dim=embed_vector_len,
    input_length=max_len,
    weights=[emb_matrix],
    trainable=False
)

In [21]:
# Build the stacked-LSTM classifier and list its layers
model = lstm_model((max_len,))
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 300)          1877100   
_________________________________________________________________
lstm (LSTM)                  (None, 150, 128)          219648    
_________________________________________________________________
dropout (Dropout)            (None, 150, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584

In [22]:
# Build the Conv1D classifier and list its layers
convolution_model = conv_model((max_len,))
convolution_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 300)          1877100   
_________________________________________________________________
conv1d (Conv1D)              (None, 148, 512)          461312    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 49, 512)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 47, 256)           393472    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 15, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 13, 256)           1968

In [23]:
# Convert each training text to a list of vocabulary indices
x_train_indices = tokenizer.texts_to_sequences(x_train)

In [24]:
# Pad at the end so every training sequence has max_len entries
x_train_indices = pad_sequences(x_train_indices, maxlen=max_len, padding='post')
print(x_train_indices.shape)

(926, 150)


In [25]:
# Same conversion and padding for the test texts, using the tokenizer fitted on the training texts
x_test_indices = tokenizer.texts_to_sequences(x_test)
x_test_indices = pad_sequences(x_test_indices, maxlen=max_len, padding='post')

In [26]:
# Compile the Conv1D model: Adam optimiser, binary cross-entropy for the single sigmoid output
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
convolution_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
# Stop early once val_loss has gone 10 epochs without improving by at least 1e-6
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=10, verbose=2, mode='auto')
# Write a checkpoint to best_weights_conv1d.hdf5 only when the monitored value improves
checkpoint = ModelCheckpoint(filepath="best_weights_conv1d.hdf5", verbose=0, save_best_only=True)

In [28]:
# Conv1D model
# NOTE(review): the test split is passed as validation data, so early stopping and
# checkpoint selection are driven by the same data that is later used for evaluation.
convolution_model.fit(x_train_indices, y_train, batch_size=64, callbacks=[monitor, checkpoint], epochs=15, validation_data=(x_test_indices, y_test))

Train on 926 samples, validate on 397 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f8b4d315650>

In [29]:
# Restore the weights saved by the Conv1D checkpoint callback
convolution_model.load_weights('best_weights_conv1d.hdf5')

In [30]:
# LSTM model
# Same optimiser settings, loss and callbacks as the Conv1D run, with its own checkpoint file
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=10, verbose=2, mode='auto')
checkpoint = ModelCheckpoint(filepath="best_weights_lstm.hdf5", verbose=0, save_best_only=True)

# NOTE(review): the test split is used as validation data here as well.
model.fit(x_train_indices, y_train, batch_size=64, callbacks=[monitor, checkpoint], epochs=15, validation_data=(x_test_indices, y_test))

# Restore the weights saved by the LSTM checkpoint callback
model.load_weights('best_weights_lstm.hdf5')

Train on 926 samples, validate on 397 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [31]:
# x_test_indices = tokenizer.texts_to_sequences(x_test)
# x_test_indices = pad_sequences(x_test_indices, maxlen=max_len, padding='post')

In [32]:
# Loss and accuracy of the LSTM model on the test split
model.evaluate(x_test_indices, y_test)
# Reload the Conv1D checkpoint
convolution_model.load_weights('best_weights_conv1d.hdf5')



[0.4518824285764238, 0.8312343]

In [33]:
# Loss and accuracy of the Conv1D model on the test split
convolution_model.evaluate(x_test_indices, y_test)



[0.4713668963620885, 0.8287154]

In [None]:
# Sigmoid outputs of the LSTM model for the test comments
predictions = model.predict(x_test_indices)

In [35]:
# Pick a random test comment to inspect (unseeded, so the choice changes on every run)
n = np.random.randint(0, len(x_test))
x_test[n]

'never win debate antivaxxer knowing good information makes worse throw shit declare victory'

In [36]:
# Compare the thresholded prediction with the true label for the sampled comment
print('predicted sentiment is positive' if predictions[n] > 0.5 else 'predicted sentiment is negative')
print('correct sentiment is positive' if y_test[n] == 1 else 'correct sentiment is negative')

predicted sentiment is negative
correct sentiment is positive


In [37]:
# Raw model score and true label for the sampled comment
print(predictions[n])
print(y_test[n])

[0.08736879]
1


In [38]:
# Write the Conv1D model's current weights back to the checkpoint file, overwriting it
convolution_model.save_weights('best_weights_conv1d.hdf5')

In [39]:
# Tokenise the whole corpus (training and test comments together)
corpus_tokens = tokenizer.texts_to_sequences(corpus_list)

In [40]:
# `data` is another name for df_corpus, not a copy, so the columns added by
# predict_sentiments appear on df_corpus too
data = df_corpus
# predict_sentiments scores with the global `model`, i.e. the LSTM classifier
data = predict_sentiments(data, corpus_tokens)

In [43]:
# Export the text and original label alongside the model's score and predicted label
data[['text', 'sentiment', 'sentiment score', 'predicted sentiment']].to_csv('clean_model_prediction.csv', index=False)

In [42]:
# Display the scored frame
data

Unnamed: 0,text,sentiment,sentiment score,predicted sentiment
0,next time try not come things place bad faith ...,negative,0.043835,negative
1,use youtube video try prove polio vaccine bad ...,negative,0.133437,negative
2,irrefutable prufe,negative,0.307236,negative
3,bit crazy opinion would making illegal not was...,negative,0.108947,negative
4,sure know mercury vaccines doctors told millio...,negative,0.126902,negative
...,...,...,...,...
1318,mean not pic well son bitch going compliment s...,negative,0.141321,negative
1319,answer yes yes,negative,0.193283,negative
1320,girl work two strokes getting flue shot one el...,negative,0.140401,negative
1321,ok makes sense never chicken pox pretty sure g...,negative,0.181021,negative
