In [None]:
import nltk 
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import re
import csv
import random


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
from scipy.stats import norm

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def std_data(X, mean, std):
    """Z-score ``X`` with externally supplied statistics.

    Every column of ``X`` is transformed as ``(X - mean) / std``; the caller
    decides which statistics to pass (the callers in this notebook pass the
    training-set mean and standard deviation).
    """
    centered = X - mean
    return centered / std

def Add_Bias(X):
    """Return a copy of ``X`` in which every row is prefixed with a constant 1."""
    with_bias = [np.append([1], row) for row in X]
    return np.array(with_bias)

def read_data(input_file):
    """Load a numeric CSV with a header row into a feature matrix and a label vector.

    The first row is treated as a header and discarded. Empty cells are dropped
    from each row and rows that end up empty (e.g. blank lines) are skipped.
    The last remaining value of every row is the label; everything before it
    is a feature.

    Args:
        input_file: path of the CSV file to read.

    Returns:
        (data, label): ``data`` is a 2-D float array of features, ``label`` a
        1-D float array holding the last column.

    Raises:
        ValueError: if a non-empty cell cannot be converted to float.
    """
    # newline='' is what the csv module expects so that embedded newlines and
    # platform line endings are handled by the reader itself.
    with open(input_file, newline='') as csvfile:
        raw_rows = list(csv.reader(csvfile))

    # Parse straight from the list of rows: converting the raw strings to an
    # ndarray first fails when rows have different lengths (blank lines,
    # trailing commas).
    parsed = []
    for row in raw_rows[1:]:
        values = [float(cell) for cell in row if cell != '']
        if values:
            parsed.append(values)

    data = np.array(parsed)
    label = data[:, -1]
    data = data[:, 0:-1]
    return np.array(data), np.array(label)

def Classifier(Y_old):
    """Threshold scores at 0.5: values below 0.5 become 0, everything else 1."""
    hard_labels = [0 if score < 0.5 else 1 for score in Y_old]
    return np.array(hard_labels)

def precision(tp, fp):
    """Precision = tp / (tp + fp).

    Args:
        tp: number of true positives.
        fp: number of false positives.

    Returns:
        The precision as a float, or 0.0 when nothing was predicted positive
        (tp + fp == 0), where the ratio is undefined.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # A model that never predicts the positive class must not crash the report.
        return 0.0
    return tp/predicted_positive

def recall(tp, fn):
    """Recall = tp / (tp + fn).

    Args:
        tp: number of true positives.
        fn: number of false negatives.

    Returns:
        The recall as a float, or 0.0 when there are no actual positives
        (tp + fn == 0), where the ratio is undefined.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp/actual_positive

def f_measure(precision, recall):
    """Harmonic mean of precision and recall (F1 score).

    Args:
        precision: precision value.
        recall: recall value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0.0 when both
        inputs are zero and the ratio is undefined.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2*precision*recall)/denominator

def accuracy(tp, tn, overall):
    """Fraction of correct predictions: (tp + tn) divided by ``overall``."""
    correct = tp + tn
    return correct / overall

def Class_stats(tp_count, tn_count, fp_count, fn_count):
    """Print precision, recall, F-measure and accuracy for a confusion matrix.

    All four metrics are computed first (via the precision/recall/f_measure/
    accuracy helpers) and then printed, one per line, each preceded by a
    blank line.
    """
    total = tp_count + tn_count + fp_count + fn_count
    Precision = precision(tp_count, fp_count)
    Recall = recall(tp_count, fn_count)
    report = (
        ("\nPrecision: ", Precision),
        ("\nRecall: ", Recall),
        ("\nf_measure: ", f_measure(Precision, Recall)),
        ("\nAccuracy: ", accuracy(tp_count, tn_count, total)),
    )
    for label, value in report:
        print(label, value)
    
def LRSC(X, Y):
    """Train a gradient-descent logistic regression and print test-set statistics.

    The samples are split 2/3 train / 1/3 test (``random_state=0``), the
    features are standardized with the training mean and sample standard
    deviation, and a leading bias column is added. The weights start from
    seeded uniform values in [-1, 1] and are updated by batch gradient descent
    until the change in cross-entropy drops below 2**-23 or 150000 iterations
    have run.

    Args:
        X: 2-D sequence or array of numeric features, one row per sample.
        Y: numpy array of 0/1 labels with one entry per sample.

    Returns:
        None. The confusion-matrix counts and the Class_stats report are printed.

    Side effects:
        Calls ``np.seterr`` (divide/invalid set to 'ignore') and reseeds the
        ``random`` module with 0.
    """
    np.seterr(divide='ignore', invalid='ignore')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)

    Y_train = Y_train.reshape(len(Y_train), 1)
    Y_test = Y_test.reshape(len(Y_test), 1)

    # Statistics come from the training split only, then get reused for test.
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0, ddof=1)

    # Standardize, add the bias column, and zero out NaNs produced by 0/0 on
    # zero-variance columns. The test split gets the same NaN treatment as the
    # training split so both see identical preprocessing.
    X_train = Add_Bias(std_data(X_train, mean, std))
    X_train[np.isnan(X_train)] = 0
    X_test = Add_Bias(std_data(X_test, mean, std))
    X_test[np.isnan(X_test)] = 0

    N = len(X_train)
    D = len(X_train[0])
    eps = np.finfo(float).eps  # keeps log() away from log(0)

    def sigmoid(z):
        return 1/(1 + np.exp(-z))

    def cross_entropy(prob):
        total = (-Y_train.T @ np.log(prob + eps)) - (1 - Y_train).T @ np.log(1 - prob + eps)
        return ((1/N) * total).item()

    random.seed(0)
    thetas = np.array([[random.uniform(-1, 1)] for _ in range(D)])
    learn_rate = 0.1
    max_iterations = 150000
    tolerance = 2**(-23)

    sig = sigmoid(np.dot(X_train, thetas))

    # cost/new_cost start 1 apart so the loop body always runs at least once.
    cost = -1
    new_cost = 0
    iterate_count = 1
    while iterate_count <= max_iterations and abs(new_cost - cost) >= tolerance:
        cost = new_cost
        thetas = thetas - ((learn_rate/N) * np.dot(X_train.T, (sig - Y_train)))
        sig = sigmoid(np.dot(X_train, thetas))
        new_cost = cross_entropy(sig)
        iterate_count += 1

    # Threshold the predicted probability, not the raw linear score: a 0.5
    # cut-off on the logit would correspond to a ~0.62 probability cut-off.
    probabilities = sigmoid(np.dot(X_test, thetas))
    Y_new = Classifier(probabilities)

    # Confusion matrix; anything whose true label is not 1 counts as negative.
    actual = Y_test.ravel()
    is_positive = actual == 1
    is_correct = actual == Y_new
    tp_count = int(np.sum(is_positive & is_correct))
    fn_count = int(np.sum(is_positive & ~is_correct))
    tn_count = int(np.sum(~is_positive & is_correct))
    fp_count = int(np.sum(~is_positive & ~is_correct))

    print("\nTrue Positives: ", tp_count)
    print("True Negatives: ", tn_count)
    print("False Positives: ", fp_count)
    print("False Negatives: ", fn_count)
    Class_stats(tp_count, tn_count, fp_count, fn_count)

In [None]:
def Normal_Model(input_class):
    """Return the per-column mean and sample (ddof=1) standard deviation."""
    column_mean = np.mean(input_class, axis=0)
    column_std = np.std(input_class, axis=0, ddof=1)
    return column_mean, column_std

def Split_Spam_Nonspam(X_train, Y_train):
    """Partition the rows of ``X_train`` by label.

    Rows labelled 1 go to the first returned array, rows labelled 0 to the
    second; rows with any other label are left out of both.
    """
    spam_rows = [X_train[idx] for idx, label in enumerate(Y_train) if label == 1]
    nonspam_rows = [X_train[idx] for idx, label in enumerate(Y_train) if label == 0]
    return np.array(spam_rows), np.array(nonspam_rows)

def NBC(X,Y):
    """Fit a Gaussian naive Bayes classifier and print test-set statistics.

    The samples are split 2/3 train / 1/3 test (``random_state=0``) and
    standardized with the training mean and sample standard deviation. For each
    class (label 1 and label 0) a per-feature normal model is fitted on the
    training rows; a test row is assigned label 1 when its class-1 score is at
    least as large as its class-0 score.

    Scores are accumulated in log space (sum of log-densities plus log-prior).
    Multiplying hundreds of densities directly can underflow to 0.0 for both
    classes, which would turn the comparison into an automatic "1".

    Args:
        X: 2-D sequence or array of numeric features, one row per sample.
        Y: numpy array of 0/1 labels with one entry per sample.

    Returns:
        None. The confusion-matrix counts and the Class_stats report are printed.

    Side effects:
        Calls ``np.seterr`` (divide/invalid set to 'ignore').
    """
    np.seterr(divide='ignore', invalid='ignore')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)

    Y_train = Y_train.reshape(len(Y_train), 1)
    Y_test = Y_test.reshape(len(Y_test), 1)

    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0, ddof=1)

    # Standardize with the training statistics and add the bias column. The
    # bias column has zero variance, so its density is undefined (NaN) and it
    # receives the same fallback value for both classes below.
    X_train = Add_Bias(std_data(X_train, mean, std))
    X_test = Add_Bias(std_data(X_test, mean, std))

    spam_x, nonspam_x = Split_Spam_Nonspam(X_train, Y_train)

    # Density substituted wherever the normal model is undefined (NaN).
    fallback_log_density = np.log(0.00000000000001)

    def class_log_score(class_rows):
        # log P(class) + sum over features of log N(x | class mean, class std)
        class_mean, class_std = Normal_Model(class_rows)
        log_density = norm.logpdf(X_test, class_mean, class_std)
        log_density[np.isnan(log_density)] = fallback_log_density
        log_prior = np.log(len(class_rows)/len(X_train))
        return np.sum(log_density, axis=1) + log_prior

    spam_score = class_log_score(spam_x)
    nonspam_score = class_log_score(nonspam_x)

    # Ties go to class 1.
    Y_new = np.array([1 if s >= n else 0 for s, n in zip(spam_score, nonspam_score)])

    # Confusion matrix; anything whose true label is not 1 counts as negative.
    actual = Y_test.ravel()
    is_positive = actual == 1
    is_correct = actual == Y_new
    tp_count = int(np.sum(is_positive & is_correct))
    fn_count = int(np.sum(is_positive & ~is_correct))
    tn_count = int(np.sum(~is_positive & is_correct))
    fp_count = int(np.sum(~is_positive & ~is_correct))

    print("\nTrue Positives: ", tp_count)
    print("True Negatives: ", tn_count)
    print("False Positives: ", fp_count)
    print("False Negatives: ", fn_count)
    Class_stats(tp_count, tn_count, fp_count, fn_count)

In [None]:
# Quarterly Report Prediction Stock Trends
# Evaluates scikit-learn's LogisticRegression and GaussianNB on AAPL_STOCK.csv
# with 6-fold cross-validated predictions.
print("************ Quarterly Report Prediction Stock Trends ************\n")

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import sklearn.metrics as metrics
from sklearn.metrics import precision_recall_fscore_support
import sklearn.model_selection as cross_val


X, Y = read_data('AAPL_STOCK.csv')
print("Logistic Regression: \n")
predicted = cross_val.cross_val_predict(LogisticRegression(), X, Y, cv=6)
print("Accuracy: ", metrics.accuracy_score(Y, predicted))

# The results are deliberately NOT stored in `precision` / `recall`: those
# names are the helper functions used by Class_stats, and rebinding them to
# floats makes every later LRSC/NBC call fail with "'float' object is not
# callable".
avg_precision, avg_recall, avg_f1, _ = precision_recall_fscore_support(
    Y, predicted, average="weighted")
print("Average Precision: ", avg_precision)
print("Average Recall: ", avg_recall)
print("Average F_Score: ", avg_f1)
print(metrics.classification_report(Y, predicted))

print("_____________________________________________________________")
print("\nNaive Bayes: \n")
predicted = cross_val.cross_val_predict(GaussianNB(), X, Y, cv=6)
print("Accuracy: ", metrics.accuracy_score(Y, predicted))
avg_precision, avg_recall, avg_f1, _ = precision_recall_fscore_support(
    Y, predicted, average="weighted")
print("Average Precision: ", avg_precision)
print("Average Recall: ", avg_recall)
print("Average F_Score: ", avg_f1)
print()
print(metrics.classification_report(Y, predicted))

************ Quarterly Report Prediction Stock Trends ************

Logistic Regression: 

Accuracy:  0.6818181818181818
Average Precision:  0.6267942583732057
Average Recall:  0.6818181818181818
Average F_Score:  0.6424242424242425
              precision    recall  f1-score   support

         0.0       0.33      0.17      0.22         6
         1.0       0.74      0.88      0.80        16

    accuracy                           0.68        22
   macro avg       0.54      0.52      0.51        22
weighted avg       0.63      0.68      0.64        22

_____________________________________________________________

Naive Bayes: 

Accuracy:  0.5454545454545454
Average Precision:  0.5876623376623377
Average Recall:  0.5454545454545454
Average F_Score:  0.5627705627705627

              precision    recall  f1-score   support

         0.0       0.25      0.33      0.29         6
         1.0       0.71      0.62      0.67        16

    accuracy                           0.55        22
 

In [None]:
# Citation: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

# Maps an English contraction to its expanded form.
# - Ambiguous contractions list every candidate expansion in a single string,
#   separated by " / " (e.g. "he'd" -> "he had / he would").
# - The first-person keys ("I'd", "I'll", "I'm", "I've", ...) are capitalized
#   and their values contain a capital "I"; every other key is lowercase.
#   NOTE(review): a lookup with a lowercased token only hits those keys if the
#   caller normalizes case on both sides.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
# Citation: https://medium.com/@Currie32/predicting-the-stock-market-with-the-news-and-deep-learning-7fc8f5f639bc
# https://github.com/Currie32/Predicting-the-Dow-Jones-with-Headlines/blob/master/Predict_Dow_with_News.ipynb


def _contraction_table():
    """Return a lowercase contraction -> expansion map, built once and cached.

    Keys and values of ``contractions`` are lowercased so they can match
    lowercased text, and for entries listing several alternatives separated by
    " / " only the first alternative is kept.
    """
    table = getattr(_contraction_table, "cache", None)
    if table is None:
        table = {
            short.lower(): expansion.split(" / ")[0].lower()
            for short, expansion in contractions.items()
        }
        _contraction_table.cache = table
    return table


def _english_stopwords():
    """Return NLTK's English stop words as a set, loaded once and cached."""
    stops = getattr(_english_stopwords, "cache", None)
    if stops is None:
        stops = set(stopwords.words("english"))
        _english_stopwords.cache = stops
    return stops


def clean_text(text, remove_stopwords = True):
    """Normalize a headline for vocabulary building.

    Steps: lowercase, expand contractions, strip punctuation, expand a few
    spaced-out abbreviations ("u s" -> "united states", ...), and optionally
    drop English stop words.

    Args:
        text: the raw headline string.
        remove_stopwords: when True (default), NLTK English stop words are
            removed and the remaining words are joined by single spaces.

    Returns:
        The cleaned string.
    """
    text = text.lower()

    # Replace contractions with their longer forms. The lookup table is
    # lowercase because the text has just been lowercased; otherwise "i'm",
    # "i've", ... would never match the capitalized dictionary keys.
    table = _contraction_table()
    text = " ".join(table.get(word, word) for word in text.split())

    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'0,0', '00', text)
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    # \b stops the abbreviation patterns from firing inside longer words
    # (e.g. "peru s president" must not become "per united states president").
    text = re.sub(r'\bu s ', ' united states ', text)
    text = re.sub(r'\bu n ', ' united nations ', text)
    text = re.sub(r'\bu k ', ' united kingdom ', text)
    text = re.sub(r'\bj k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)
    return text


# Read Combined_News_DJIA.csv: column 0 is skipped, column 1 is the integer
# label, and every remaining column is a headline for that day.
with open("Combined_News_DJIA.csv") as csvfile:
  readCSV = csv.reader(csvfile)
  f = list(readCSV)
size = len(f)
# Skip the header row; a row contributes a label only if it has a column 1.
data = [row[2:] for row in f[1:]]
Y = np.array([[int(row[1])] for row in f[1:] if len(row) > 1])
headlines = np.array(data, dtype=object)

# Run clean_text over every headline, keeping the per-day grouping
clean_headlines = [
    [clean_text(headline) for headline in daily_headlines]
    for daily_headlines in headlines
]

# Take a look at some headlines to ensure everything was cleaned well
clean_headlines[0]

# Count how often each word occurs across all cleaned headlines; the number
# of distinct keys is the vocabulary size
word_counts = {}

for date in clean_headlines:
    for headline in date:
        for word in headline.split():
            word_counts[word] = word_counts.get(word, 0) + 1

print("Size of Vocabulary:", len(word_counts))

# Load GloVe's embeddings: each line is a word followed by its vector components
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
  for line in f:
      word, *components = line.split(' ')
      embeddings_index[word] = np.asarray(components, dtype='float32')

print('Word embeddings:', len(embeddings_index))

# Count the words used more than `threshold` times that GloVe does not cover
threshold = 10
missing_words = sum(
    1
    for word, count in word_counts.items()
    if count > threshold and word not in embeddings_index
)

missing_ratio = round(missing_words/len(word_counts),4)*100

print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe

# Dictionary to convert words to integers; ids are handed out in insertion order
vocab_to_int = {}
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = len(vocab_to_int)

# Special tokens that will be added to our vocab (they take the next free ids)
codes = ["<UNK>","<PAD>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {value: word for word, value in vocab_to_int.items()}

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total Number of Unique Words:", len(word_counts))
print("Number of Words we will use:", len(vocab_to_int))
print("Percent of Words we will use: {}%".format(usage_ratio))

# Need to use 50 for embedding dimensions to match GloVe's vectors.
embedding_dim = 50

nb_words = len(vocab_to_int)
# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim))
# Dedicated, seeded generator so the random embeddings (and everything built
# on them) are identical on every run, without touching NumPy's global RNG.
embedding_rng = np.random.RandomState(0)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in GloVe, create a random embedding for it and remember
        # it in embeddings_index as well
        new_embedding = embedding_rng.uniform(-1.0, 1.0, embedding_dim)
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

# Change the text from words to integers
# If word is not in vocab, replace it with <UNK> (unknown)
word_count = 0
unk_count = 0
unk_id = vocab_to_int["<UNK>"]

int_headlines = []

for date in clean_headlines:
    int_daily_headlines = []
    for headline in date:
        tokens = headline.split()
        word_count += len(tokens)
        unk_count += sum(1 for token in tokens if token not in vocab_to_int)
        int_daily_headlines.append([vocab_to_int.get(token, unk_id) for token in tokens])
    int_headlines.append(int_daily_headlines)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

# Collect the token count of every headline
lengths = [len(headline) for date in int_headlines for headline in date]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

lengths.describe()

# Limit the length of a day's news to 200 words, and the length of any headline to 16 words.
# These values are chosen to not have an excessively long training time and
# balance the number of headlines used and the number of words from each headline.
max_headline_length = 16
max_daily_length = 200
pad_headlines = []

for date in int_headlines:
    pad_daily_headlines = []
    for headline in date:
        # Slicing keeps short headlines whole and truncates long ones
        pad_daily_headlines.extend(headline[:max_headline_length])

    shortfall = max_daily_length - len(pad_daily_headlines)
    if shortfall > 0:
        # Pad the day's words up to max_daily_length with <PAD>
        pad_daily_headlines.extend([vocab_to_int["<PAD>"]] * shortfall)
    else:
        # Limit the day's words to max_daily_length
        pad_daily_headlines = pad_daily_headlines[:max_daily_length]
    pad_headlines.append(pad_daily_headlines)

Size of Vocabulary: 35198
Word embeddings: 400000
Number of words missing from GloVe: 41
Percent of words that are missing from vocabulary: 0.12%
Total Number of Unique Words: 35198
Number of Words we will use: 31816
Percent of Words we will use: 90.39%
31816
Total number of words in headlines: 617175
Total number of UNKs in headlines: 4285
Percent of words that are UNK: 0.69%


In [None]:
# News Prediction Stock Trends
# Runs both hand-written classifiers on the padded headline ids.
print("************ News Prediction Stock Trends ************\n")

news_models = (
    ("Logistic Regression: ", LRSC),
    ("\nNaive Bayes: ", NBC),
)
for position, (title, run_model) in enumerate(news_models):
    if position:
        # Separator line between the two reports
        print("_____________________________________________________________")
    print(title)
    run_model(pad_headlines, Y)



************ News Prediction Stock Trends ************

Logistic Regression: 

True Positives:  115
True Negatives:  193
False Positives:  117
False Negatives:  238
115 117

Precision:  0.4956896551724138

Recall:  0.32577903682719545

f_measure:  0.39316239316239315

Accuracy:  0.4645550527903469
_____________________________________________________________

Naive Bayes: 

True Positives:  197
True Negatives:  142
False Positives:  168
False Negatives:  156
197 168

Precision:  0.5397260273972603

Recall:  0.5580736543909348

f_measure:  0.5487465181058496

Accuracy:  0.5113122171945701
