# Data Creation for Excel Sheets to be applied to the Models

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import datetime as dt
import string 
import pandas_datareader.data as web
import warnings
warnings.filterwarnings('ignore')

def dataset(df,number,ticker):
    df['Date'] = df['Date'].astype('datetime64[ns]') #converting date to datetime format to properly combine dataframes
    df = df.set_index('Date') #setting the index to concat on date
    df = df.drop(['Label'],axis=1)
    new = [] #creating an empty list to be used during the for loop below
    for column in df.columns: #loop through the 25 columns to prepare for formatting the strings
        df[column] = (df[column].str.replace("b'", "")) #cleaning the strings of this random letter b"
        new.append(df[column].str.replace('b"', "")) #appending the cleaned strings to a list
    main = pd.DataFrame(data = new).T #transposing the dataset for proper formating 
    stock = web.DataReader(ticker,'yahoo','2008-08-08','2016-07-01')[['Adj Close','High','Low']] #getting adj close price data of SPY
    data = pd.concat([main,stock],axis=1) #concatenating the two dataframes
    data['Target'] = data['Adj Close'].pct_change()*100 #getting the return of the stock 
    data.loc[data['Target']<(0),'Label'] = 0 #negative returns are 0
    data.loc[data['Target']>(0),'Label'] = 1 #positive returns are 1
    
    #was  used for the multi-classification model 
    #data.loc[data['Target']<(-3.0),'Label'] = 0 #worst returns are 0
    #data.loc[(data['Target']>=(-3.0)) & (data['Target']<(-1.5)),'Label'] = 1 #terrible returns are 1 
    #data.loc[(data['Target']>=(-1.5)) & (data['Target']<0),'Label'] = 2 #bad returns are 2
    #data.loc[(data['Target']>=(0.0)) & (data['Target']<1.5),'Label'] = 3 #decent returns are 3
    #data.loc[(data['Target']>=(1.5)) & (data['Target']<=3.0),'Label'] = 4 #good returns are 4 
    #data.loc[data['Target']>3.0,'Label'] = 5 #highest returns are 5
    
    data = data.drop(['Adj Close','High','Low','Target'],axis =1) #drop these bc we're solving for return labels
    headlines = data.columns[:number] #number of top headlines for the day
    data['combined'] = data[headlines].apply(lambda row: '. '.join(row.values.astype(str)), axis=1) #combining the headlines
    data = data.drop(headlines,axis = 1) #dropping the columns that I merged
    return data.dropna() #no null values for TensorFlow

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation) #removing all punctuation in a string 
    return text.translate(translator) 

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in set(stopwords.words("english"))] #converting all letters to lowercase and removing if they are classified as stop words
    return " ".join(filtered_words)

#count the number of words in a string using collections Counter
def counter(string):
    number = Counter() #calling Counter from collections 
    for i in string.values: #looping through each string 
        for word in i.split(): #looping through each word in the string 
            number[word] += 1 #counting everything there is a word
    return number

# Binary NLP Model [25 headlines]

In [2]:
def binary_model(df,number,max_length):
    y,x = df['Label'],df['combined'] #label and feature
    x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=50,test_size=0.3) #training and testing
    tokenizer = Tokenizer(num_words=number) #calling the tokenizer to fit the unique amount of words in the dataset
    tokenizer.fit_on_texts(x_train) # fit integers to training headlines
    x_train_seq = tokenizer.texts_to_sequences(x_train) #converting x_train to sequences given the total number of unique words
    x_test_seq = tokenizer.texts_to_sequences(x_test) #converting x_test to sequences
    padded_x_train = pad_sequences(x_train_seq, maxlen=max_length, padding="post", truncating="post") #padding the training features 
    padded_x_test = pad_sequences(x_test_seq, maxlen=max_length, padding="post", truncating="post") #padding the testing features
    model = keras.models.Sequential() #sequential model
    model.add(layers.Embedding(number, 32, input_length=max_length)) #embedding layer on the total number of unique words
    model.add(layers.LSTM(64, dropout=0.1)) #lstm layer with a 10% dropout
    model.add(layers.Dense(1, activation="sigmoid")) # binary classification problem
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False), optimizer='Adam', metrics=["accuracy"])
    model.fit(padded_x_train, y_train, epochs=3, validation_data=(padded_x_test, y_test), verbose=2)
    return model

#all 25 headlines
df = pd.read_excel('nlp_excel_binary.xlsx') #getting the dataset that was saved from the dataset function
number = counter(df.combined) #to properly tokenize
number = len(number) #length of the number of unique words for the tokenizer
binary_model(df,number,32) #binary model results

Epoch 1/3
44/44 - 9s - loss: 0.6902 - accuracy: 0.5505 - val_loss: 0.6866 - val_accuracy: 0.5613
Epoch 2/3
44/44 - 2s - loss: 0.6457 - accuracy: 0.5938 - val_loss: 0.7080 - val_accuracy: 0.5042
Epoch 3/3
44/44 - 2s - loss: 0.3328 - accuracy: 0.9185 - val_loss: 0.9558 - val_accuracy: 0.5277


<tensorflow.python.keras.engine.sequential.Sequential at 0x145e5e890>

# Binary NLP Model [5 headlines]

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import datetime as dt
import string 
import pandas_datareader.data as web
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('Combined_News_DJIA.csv') #was a csv with top 25 world news headlines
ticker = 'spy'
number = 5
def data(df,ticker,number):
    df['Date'] = df['Date'].astype('datetime64[ns]') #converting date to datetime format to properly combine dataframes
    df = df.set_index('Date') #setting the index to concat on date
    df = df.drop(['Label'],axis =1)
    new = [] #creating an empty list to be used during the for loop below
    for column in df.columns: #loop through the 25 columns to prepare for formatting the strings
        df[column] = (df[column].str.replace("b'", "")) #cleaning the strings of this random letter b"
        new.append(df[column].str.replace('b"', "")) #appending the cleaned strings to a list
    main = pd.DataFrame(data = new).T #transposing the dataset for proper formating 
    stock = web.DataReader(ticker,'yahoo','2008-08-08','2016-07-01')[['Adj Close','High','Low']] #getting adj close price data of SPY
    data = pd.concat([main,stock],axis=1) #concatenating the two dataframes
    data['Target'] = data['Adj Close'].pct_change()*100 #getting the return of the stock 
    data.loc[data['Target']<(0),'Label'] = 0 #negative returns are 0
    data.loc[data['Target']>(0),'Label'] = 1 #positive returns are 1
    data = data.drop(['Adj Close','High','Low','Target'],axis =1) #drop these bc we're solving for return labels
    headlines = data.columns[:-1] #number of top headlines for the day
    data['combined'] = data[headlines].apply(lambda row: '. '.join(row.values.astype(str)), axis=1) #combining the headlines
    data = data.drop(headlines,axis = 1) #dropping the columns that I merged
    return data.dropna()

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation) #removing all punctuation in a string 
    return text.translate(translator) 

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in set(stopwords.words("english"))] #converting all letters to lowercase and removing if they are classified as stop words
    return " ".join(filtered_words)

#count the number of words in a string using collections Counter [Tensorflow tutorial]
def counter(string):
    number = Counter() #calling Counter from collections 
    for i in string.values: #looping through each string 
        for word in i.split(): #looping through each word in the string 
            number[word] += 1 #counting everything there is a word
    return number

df = data(pd.read_csv('Combined_News_DJIA.csv'),ticker,number)
number = counter(df.combined) #to properly tokenize
number = len(number) #length of the number of unique words for the tokenizer
binary_model(pd.read_excel('nlp_5headline.xlsx'),number,32) #binary model results

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanfinegan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/3
44/44 - 7s - loss: 0.6920 - accuracy: 0.5303 - val_loss: 0.6861 - val_accuracy: 0.5613
Epoch 2/3
44/44 - 3s - loss: 0.6552 - accuracy: 0.5866 - val_loss: 0.6890 - val_accuracy: 0.5277
Epoch 3/3
44/44 - 3s - loss: 0.4117 - accuracy: 0.8939 - val_loss: 0.7511 - val_accuracy: 0.4874


<tensorflow.python.keras.engine.sequential.Sequential at 0x147495e50>

# Multiclass NLP Model [25 Headlines]

In [4]:
def multiclass_model(df,number,max_length):
    y,x = df['Label'],df['combined'] #label and feature
    x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=50,test_size=0.3) #training and testing (shuffling)
    tokenizer = Tokenizer(num_words=number) #calling the tokenizer to fit the unique amount of words in the dataset
    tokenizer.fit_on_texts(x_train) # fit integers to training headlines
    x_train_seq = tokenizer.texts_to_sequences(x_train) #x_train to sequences
    x_test_seq = tokenizer.texts_to_sequences(x_test) #x_test to sequences
    padded_x_train = pad_sequences(x_train_seq, maxlen=max_length, padding="post", truncating="post") #padding the sequences
    padded_x_test = pad_sequences(x_test_seq, maxlen=max_length, padding="post", truncating="post") 
    model = keras.models.Sequential() #sequential model 
    model.add(layers.Embedding(number, 32, input_length=max_length)) #embedding layer provides weights to integer (words)
    model.add(layers.LSTM(64, dropout=0.2)) #LSTM layer with a 20% dropout
    model.add(layers.Dense(6, activation="softmax")) #softmax for probability of the six classes (6 in the output layer for classes)
    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), optimizer='Adam', metrics=["accuracy"])
    model.fit(padded_x_train, y_train, epochs=3, validation_data=(padded_x_test, y_test), verbose=2)
    
df = pd.read_excel('nlp_excel_data.xlsx') #dataframe created and saved with the dataset function
number = counter(df.combined) #to properly tokenize
number = len(number) #length of the number of unique words for the tokenizer
multiclass_model(df,number,32) #multiclassification model 

Epoch 1/3
44/44 - 6s - loss: 1.4395 - accuracy: 0.4486 - val_loss: 1.2248 - val_accuracy: 0.4807
Epoch 2/3
44/44 - 2s - loss: 1.1919 - accuracy: 0.4687 - val_loss: 1.2049 - val_accuracy: 0.4807
Epoch 3/3
44/44 - 2s - loss: 1.0005 - accuracy: 0.5191 - val_loss: 1.3243 - val_accuracy: 0.4255
