# NASDAQ Composite Clustering

In [1]:
import pandas as pd #to deal with the data from the yahoo finance api
import numpy as np #for tensorflow and sklearn
import datetime as dt #datetime adjusting
import pandas_datareader.data as web #api for tick price data
import warnings
warnings.filterwarnings('ignore') #ignore warnings

from sklearn.neighbors import KNeighborsClassifier #for seeing which cluster a small, value, momentum stock would fall in
from sklearn.metrics import accuracy_score #the accuracy score of the k nearest neighbor classifier (based on clusters)
from sklearn.cluster import KMeans # might end up using this library
import tensorflow as tf #machine learning libraries used for data splitting and model creation
from tensorflow import keras #for the deep learning model at the end
from sklearn.preprocessing import MinMaxScaler #normalize data
from sklearn.model_selection import train_test_split #split data
from sklearn.metrics import r2_score #metric
from tensorflow.keras.models import Sequential, Model #deep learning sequential model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Concatenate #layers for deep learing 
from keras import optimizers #optimizer for deep learning model (will most likely end up being Adam)

Using TensorFlow backend.


# Data Cleaning / Query

In [2]:
excel = pd.read_excel('nasdaq_comp2013.xlsx').dropna() #dropping nan (loading in NASDAQ 2013/12/31)
def data(excel): #function for cleaning data
    excel['Ticker'] = excel.Ticker.str.split(' ').str[0] #splitting the rest off the Ticker for querying
    excel = excel[['Ticker','Market Cap','P/B','1Y Tot Ret']] #these are the features I want for the end
    return excel.take(np.random.permutation(len(excel))[:150]) #taking a random 100 selection of the NASDAQ Composite

In [3]:
def total_returns(df): #queries all the returns for 2014 to 2019 based on size, value, and momentum
    failed = [] #failed list
    passed = [] #passed list
    data = pd.DataFrame() #create a dataframe for the loop
    for ticker in df['Ticker'].values: #loop through all the tickers 
        try: #try clause
            data[ticker] = web.DataReader(ticker, "yahoo", '2014-01-01', '2016-01-01')["Adj Close"] #6 year tick price data
            passed.append(ticker) #append passed tickers to the list
        except (IOError, KeyError): #error
            failed.append(ticker) #failed tickers
    return data #average yearly return of all found securities in the database

r = data(excel) #clean the data collected from Bloomberg
df = total_returns(r) #a few of these stocks will not be in the API
r = r.set_index('Ticker') #set index as Ticker to combine the datasets
data = ((df.iloc[-1]-df.iloc[0])/df.iloc[0])*100 #return after two years
r = r.loc[data.index] #making sure both datasets have the same tickers in each 
main = pd.merge(r, pd.DataFrame(data), left_index=True, right_index=True).dropna() #merging and dropping any null values

# Cleaning the Data Further for Better Model

In [4]:
maxima = 10 #10 percent return (negative and positive will be where I will classify each return)
main.loc[(main[0] >= (-maxima)) & (main[0] <= 0),'Label'] = 2 #greater than (-10) but less than 0
main.loc[(main[0] >= (0)) & (main[0] <= maxima),'Label'] = 3 #greater than 0 but less than 10
main.loc[(main[0] > maxima),'Label'],main.loc[(main[0] < -maxima),'Label'] = 4,1 # 4 is greatest returns
main.loc[(main['1Y Tot Ret'] > 0),'1Y Tot Ret'] = 1 #momentum stock
main.loc[(main['1Y Tot Ret'] < 0),'1Y Tot Ret'] = 0 #not a momentum stock
new = main[['Market Cap','P/B','1Y Tot Ret','Label']] #features I want plus the target label

In [11]:
def nearest_neighbors(new):
    X = new.iloc[:,0:3] #size, value, and momentum features
    y = new.iloc[:,3] #return labels
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33, random_state=42) #splitting the data
    for k in list(range(1,15)): #using a different value of k 
        neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train,y_train) #fitting model to training data
        y_pred = neigh.predict(X_test) #predicting using the testing features 
        acc = accuracy_score(y_test,y_pred) #accuracy score of 50% shows that these features show some merit
        test = neigh.predict([[2000000000,5,1]]) #small cap, value p/b ratio (it would depend on sector), and momentum stock 
        print('k =', k, ': ', round(acc,3),'\nSMB-HML-UMD: ',test,'\n') #small, value, momentum stock to fit into what cluster
        
nearest_neighbors(new) #most would fit into the highest earning cluster (especially once the model has a higher accuracy score)

k = 1 :  0.333 
SMB-HML-UMD:  [2.] 

k = 2 :  0.333 
SMB-HML-UMD:  [2.] 

k = 3 :  0.333 
SMB-HML-UMD:  [2.] 

k = 4 :  0.385 
SMB-HML-UMD:  [2.] 

k = 5 :  0.436 
SMB-HML-UMD:  [4.] 

k = 6 :  0.41 
SMB-HML-UMD:  [4.] 

k = 7 :  0.487 
SMB-HML-UMD:  [4.] 

k = 8 :  0.436 
SMB-HML-UMD:  [4.] 

k = 9 :  0.513 
SMB-HML-UMD:  [4.] 

k = 10 :  0.59 
SMB-HML-UMD:  [4.] 

k = 11 :  0.564 
SMB-HML-UMD:  [4.] 

k = 12 :  0.564 
SMB-HML-UMD:  [4.] 

k = 13 :  0.564 
SMB-HML-UMD:  [4.] 

k = 14 :  0.59 
SMB-HML-UMD:  [4.] 



# Select Security based on Carhart Factors

In [6]:
#analyst strong buy recommendation, small cap, p/b ratio under 5, and year to date positive returns (momentum)
import requests
def screener():
    url = 'https://finviz.com/screener.ashx?v=141&f=an_recom_strongbuy,cap_small,fa_pb_u5,ta_perf_52wup&o=-perf52w'
    resp = requests.get(url,headers={'user-agent': 'my-app/0.0.1'}) #scraping the url
    l = [] #empty list to be added to after scraping from FinViz
    for ticker in resp.text.split('href="quote.ashx?t='):
        tickers = (ticker.split('&ty')[0]) #selecting security with carhart four factors (best ytd performance)
        l.append(tickers) #getting all the tickers within the screener
    return list(set(l[-320:])) #removing the many duplicate tickers within the screener

In [7]:
def data(ticker,start,end):
    df = web.DataReader(ticker.upper(),'yahoo',start,end)[['Adj Close','High','Low']]
    df['sm3'] = df['Adj Close'].rolling(window=3).mean() #3 day moving average
    df['sm9'] = df['Adj Close'].rolling(window=9).mean() #9 day moving average
    drivers = ['^VIX','SPY'] #vix and the market benchmark
    portfolio = pd.DataFrame() #create an emy dataframe to place data 
    failed = [] #empty list for failed queries
    for stock in drivers: #for loop for all the features I want to try
        try: #try clause just in case it failed
            portfolio[stock] = web.DataReader(stock,'yahoo',start,end)['Adj Close'] #adj close of each feature
            portfolio[f'{stock} sma3'] = portfolio[stock].rolling(window=3).mean() #moving averages as features
            portfolio[f'{stock} sma9'] = portfolio[stock].rolling(window=9).mean()
            portfolio = portfolio.drop([stock],axis=1) #drop the adj close price
        except:
            failed.append(stock) #add failed query ticker to list
            print(f'{failed} was not properly calculated for. Are you sure this ticker is on an exchange?')
    data = pd.concat([df,portfolio],axis = 1).dropna() #combine both dataframes
    df['Target'] = df['Adj Close'].shift(-1) #shift adj close back 1: we are forecasting one day into the future
    df = df.drop(['Adj Close','High','Low'],axis =1) #drop these bc nn works better 
    return df.dropna() #no null values



def predict(df,pred_df):
    X = df[df.columns] #feature data
    del X['Target'] #don't want target variable in training data
    Y = df[['Target']] #label data
    x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=50,test_size=0.2) #training and testing
    x_val,x_test,y_val,y_test = train_test_split(x_test,y_test,random_state=50,test_size=0.5) #test and validation
    model = Sequential() #basic sequential model 
    model.add(Dense(100, input_dim=x_train.shape[1], #need the input shape of the data in tensorflow 2x
                        activation=tf.nn.leaky_relu, # was better than relu
                        kernel_initializer='he_normal')) #initializer to stop from vanishing/exploding gradient
    model.add(Dense(75, input_dim=100, #100 'neurons' in the input layer
                        activation=tf.nn.leaky_relu, 
                        kernel_initializer='he_normal'))
    model.add(Dense(50, input_dim=75, #75 'neurons' in the first hidden layer
                    activation=tf.nn.leaky_relu,
                    kernel_initializer='he_normal'))
    model.add(Dense(25, input_dim=50, #50 'neurons' in the second hidden layer
                    activation=tf.nn.leaky_relu,
                    kernel_initializer='he_normal'))
    model.add(Dense(1, activation=tf.nn.leaky_relu, #only one answer so you need one 'neuron'
                        kernel_initializer='he_normal'))
    model.compile(loss='mean_squared_error', #mse loss function
                      optimizer='adam', #adam optimizer
                      metrics=['mape']) #mean absolute percentage error metric to determine the performance of the model
    scaler = MinMaxScaler() #normalize the data since it is pretty different in terms of share price
    x_train_scaled = scaler.fit_transform(x_train) #apply the normalizer to the training features
    history = model.fit(x_train, y_train,  #fit the training data to the model
                        validation_data=(x_val, y_val), #validation data to better see how the model is doing
                        batch_size=32,
                        epochs=5,
                        verbose=0)
    pred_features = pred_df.iloc[-1] #these will be in the pred_data function 
    prediction = model.predict(np.array([pred_features])) #need it in numpy array 
    mape = history.history['val_mape'] #getting the mean absolute percentage error to see which is the lowest
    mape = ["%.2f" % i for i in mape] #rounding it to two decimal places
    print(f"Value Mean Absolute Percentage Error {mape}") #print statement to look cleaner once the code is run
    print(f'The predicted stock price for {ticker.upper()} tomorrow is ${round(float(prediction[0]),2)}.')
    if float(prediction[0])>float(df['Target'].iloc[-1:].values): #buy or sell statement based on model prediction and real time tick data
        print('Buy: ', round(float(prediction[0]),2), '>',round(float(df['Target'].iloc[-1:].values),2),
              '\nPCT DIFF: ',round((float(prediction[0]) - float(df['Target'].iloc[-1:].values))/float(df['Target'].iloc[-1:].values)*100,2),'%',)
    else:
        print('Sell: ', round(float(prediction[0]),2), '<',round(float(df['Target'].iloc[-1:].values),2),
              '\nPCT DIFF: ',round((float(prediction[0]) - float(df['Target'].iloc[-1:].values))/float(df['Target'].iloc[-1:].values)*100,2),'%')



def pred_data(ticker,start,end):
    df = web.DataReader(ticker.upper(),'yahoo',start,end)[['Adj Close','High','Low']] #query data of target security
    df['sm3'] = df['Adj Close'].rolling(window=3).mean() #moving averages (3 day)
    df['sm9'] = df['Adj Close'].rolling(window=9).mean() #moving averages (9 day)
    drivers = ['^VIX','SPY'] #volatility and sp500
    portfolio = pd.DataFrame() #create an emy dataframe to place data 
    failed = [] #empty list for failed queries
    for stock in drivers: #for loop for all the features I want to try
        try: #try clause just in case it failed
            portfolio[stock] = web.DataReader(stock,'yahoo',start,end)['Adj Close'] #adj close of each feature
            portfolio[f'{stock} sma3'] = portfolio[stock].rolling(window=3).mean() #moving averages as features
            portfolio[f'{stock} sma9'] = portfolio[stock].rolling(window=9).mean()
            portfolio = portfolio.drop([stock],axis=1) #drop the adj close price
        except:
            failed.append(stock) #add failed query ticker to list
            print(f'{failed} was not properly calculated for. Are you sure this ticker is on an exchange?')
    data = pd.concat([df,portfolio],axis = 1).dropna() #combine both dataframes
    df = df.drop(['Adj Close','High','Low'],axis = 1)
    return df.dropna()

        
        
if __name__ == '__main__':
    tf.get_logger().setLevel('ERROR') #getting rid of tensorflow errors
    tickers = screener() #target securities that fit the screener of small, value, momentum, and analyst strong buy
    for ticker in tickers: #looping through screened stocks
        print(f'|||Chosen Security: {ticker}|||') #printing the stock that is being tested by the model
        try: #try clause just in case API doesn't have that ticker
            start = dt.datetime.now() - dt.timedelta(days=365*5) #5 year time frame
            end = dt.datetime.now() #today
            predict(data(ticker,start,end),pred_data(ticker,start,end)) #calling the function we want to predict the target security
        except (IOError, KeyError): #error
            pass #keep running if an error is raised

|||Chosen Security: IDYA|||
Value Mean Absolute Percentage Error ['12.50', '10.53', '8.05', '5.97', '5.91']
The predicted stock price for IDYA tomorrow is $16.4.
Buy:  16.4 > 16.24 
PCT DIFF:  0.96 %
|||Chosen Security: VRNA|||
Value Mean Absolute Percentage Error ['7.94', '4.98', '5.97', '4.07', '4.69']
The predicted stock price for VRNA tomorrow is $8.06.
Buy:  8.06 > 7.9 
PCT DIFF:  2.08 %
|||Chosen Security: ITCI|||
Value Mean Absolute Percentage Error ['11.62', '6.07', '4.77', '5.25', '5.70']
The predicted stock price for ITCI tomorrow is $24.52.
Sell:  24.52 < 24.58 
PCT DIFF:  -0.24 %
|||Chosen Security: BWMX|||
Value Mean Absolute Percentage Error ['23.68', '16.67', '8.73', '3.43', '2.56']
The predicted stock price for BWMX tomorrow is $29.81.
Sell:  29.81 < 29.85 
PCT DIFF:  -0.14 %
|||Chosen Security: NLTX|||
Value Mean Absolute Percentage Error ['9.59', '4.99', '4.70', '4.50', '4.63']
The predicted stock price for NLTX tomorrow is $12.07.
Buy:  12.07 > 12.0 
PCT DIFF:  0.59 