In [1]:
# %pip list
!jupyter nbconvert --to script lr_test.ipynb


nbconvert: command not found


In [2]:
# Basic setup
import numpy as np
import pandas as pd
%matplotlib inline

In [3]:
# %pip install yfinance

In [4]:
ls

README.md		demo.py			nasdaqlisted.txt
[1m[34mbayesian[39;49m[0m/		features.py		someData.pkl
data.py			lr_test.ipynb


## A few reminder examples of plotting and running a script

In [5]:
# %matplotlib inline
# import numpy as np
# import matplotlib.pyplot as plt

# t = np.arange(0, 5, 0.2)
# plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**3, 'g^')
# plt.show()

In [6]:
# %run demo.py

## Load Nasdaq summary of Symbols

In [7]:
nasdaq = pd.read_csv('nasdaqlisted.txt', delimiter='|')
print(nasdaq.head())


  Symbol                                      Security Name Market Category  \
0   AACG  ATA Creativity Global - American Depositary Sh...               G   
1  AACIU                  Armada Acquisition Corp. I - Unit               G   
2   AADI               Aadi Bioscience, Inc. - Common Stock               S   
3   AADR                AdvisorShares Dorsey Wright ADR ETF               G   
4    AAL       American Airlines Group, Inc. - Common Stock               Q   

  Test Issue Financial Status  Round Lot Size ETF NextShares  
0          N                N             100   N          N  
1          N                N             100   N          N  
2          N                N             100   N          N  
3          N                N             100   Y          N  
4          N                N             100   N          N  


## Decide on training and validation sets

In [48]:
Ntotal = 100
Ntrain = 70

symbols = nasdaq['Symbol'].sample(Ntotal)

symbols_train = list(symbols[:Ntrain].values)
symbols_val= list(symbols[Ntrain:].values)

print( "Train symbols: ", symbols_train, '\n')
print( "Val symbols: ", symbols_val)

Train symbols:  ['JYAC', 'XEL', 'LEGAU', 'AFYA', 'IEUS', 'COLL', 'DTOCU', 'BLDEW', 'CAAS', 'BANX', 'EH', 'EVOP', 'BUSE', 'INBKL', 'SCLEW', 'GMDA', 'MRBK', 'CASY', 'RGLS', 'TAOP', 'FIBK', 'HPK', 'ARRWU', 'TRS', 'LDEM', 'APMIU', 'HYZNW', 'INOD', 'TCBC', 'TANH', 'MACUW', 'COMS', 'BSCS', 'TOMZ', 'LIND', 'GCACU', 'VNDA', 'XPDI', 'FTPAU', 'HCARU', 'ZIVOW', 'BSDE', 'FTAG', 'CIGI', 'CKPT', 'EWEB', 'CMMB', 'ALAC', 'VIA', 'CLBS', 'VWTR', 'CFFI', 'UK', 'TER', 'GMTX', 'EBIX', 'EVGO', 'MRSN', 'NGCA', 'MYMD', 'CCMP', 'MCHI', 'CAN', 'HYMCW', 'DOMO', 'CMLT', 'BRCN', 'HCDIP', 'EQ', 'ATHA'] 

Val symbols:  ['OMCL', 'CPLP', 'CHPMU', 'CINF', 'DCBO', 'CASA', 'PMD', 'LNT', 'SRNGW', 'AGCUU', 'SXTC', 'AXTI', 'TANNL', 'EA', 'MARPS', 'UXIN', 'JUGGW', 'GGMC', 'KAIRU', 'AUVIP', 'MIME', 'FGBI', 'TREE', 'SPRX', 'RUTH', 'RXRAW', 'SOCL', 'XNET', 'ACBA', 'VLDRW']


## Download and format historical data
- download the data
- generate features of interest and labels
- concatenate into a large dataframe


In [49]:
import yfinance as yf
# from features import addFeatures

In [50]:



def addFeatures( tickerDf, closingRollAvgInterval_D = 7, 
                 dailyChangeRollAvgInterval_D = 7, delay = -14):  # days):

    #     delay in days, negative value 

    # make Data a value you can work with
    tickerDf.reset_index(inplace=True)

    # --------
    # daily change features
    #   rolling average, rolling std.
    tickerDf['DailyChange'] = tickerDf['Close'] - tickerDf['Open']
    tickerDf['DailyChangeMean'] = tickerDf.rolling(
        f"{dailyChangeRollAvgInterval_D}D", on='Date')['DailyChange'].mean()
    tickerDf['DailyChangeStd'] = tickerDf.rolling(
        f"{dailyChangeRollAvgInterval_D}D", on='Date')['DailyChange'].std()


    # ------
    # closing mean features ( for estimating good/bad at a delay)
    #   rolling average closing mean
    tickerDf['CloseAvg'] = tickerDf.rolling(
        f"{closingRollAvgInterval_D}D", on='Date')['Close'].mean()


    # -------------------------
    # Time delay of moving average closing
    #   For identifying good/bad


    tickerDf['CloseAvgDelayed'] = tickerDf['CloseAvg'].shift(
        periods=delay, fill_value=np.nan)
    tickerDf['CloseFutureChange'] = tickerDf['CloseAvgDelayed'] - tickerDf['Close']

    # If the average is up at the delay, then good (else bad)
    tickerDf['Good'] = 0  # preset all bad
    tickerDf.loc[tickerDf['CloseFutureChange'] > 0, 'Good'] = 1

    # clean for return
    tickerDf.dropna(inplace=True)
    # tickerDf_updated = tickerDf
    return tickerDf


In [51]:
# ticker = yf.Ticker(symbols_train[0])
# tDF = ticker.history(period='1d', start='2010-1-1', end='2020-1-25')
# tDF.reset_index(inplace=True)

In [52]:

# tDF

In [53]:
def getSymbolDF( symbols):
    ticker_df_list = []
    for tickerStr in symbols:
        ticker = yf.Ticker(tickerStr)
        tDF = ticker.history(period='1d', start='2020-1-1', end='2021-8-25')
        #         tDF = ticker.history(period='1d', start='2010-1-1', end='2020-1-25')
        tDF = addFeatures(tDF)
        ticker_df_list.append(addFeatures(tDF) )
    
    df = pd.concat(ticker_df_list)        
    return df

In [54]:
trainDF = getSymbolDF( symbols_train )
valDF = getSymbolDF( symbols_val )

trainDF.to_pickle( 'train.pkl')
valDF.to_pickle( 'val.pkl')

- BLDEW: No data found for this date range, symbol may be delisted
- SCLEW: No data found for this date range, symbol may be delisted
- HYZNW: No data found for this date range, symbol may be delisted
- MACUW: No data found for this date range, symbol may be delisted
- ZIVOW: No data found for this date range, symbol may be delisted
- HYMCW: No data found for this date range, symbol may be delisted
- JUGGW: No data found for this date range, symbol may be delisted
- RXRAW: No data found for this date range, symbol may be delisted


In [57]:
trainDF

Unnamed: 0,index,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DailyChange,DailyChangeMean,DailyChangeStd,CloseAvg,CloseAvgDelayed,CloseFutureChange,Good,Adj Close
1,2,2020-11-23,10.00,10.000,9.90,10.00,49600.0,0.0,0.0,0.00,-0.025000,0.035355,10.0000,10.140,0.140,1,
2,3,2020-11-24,9.91,10.000,9.90,10.00,75500.0,0.0,0.0,0.09,0.013333,0.070946,10.0000,10.180,0.180,1,
3,4,2020-11-25,9.97,10.000,9.92,10.00,114900.0,0.0,0.0,0.03,0.017500,0.058524,10.0000,10.210,0.210,1,
4,5,2020-11-27,10.00,10.000,9.96,10.00,32000.0,0.0,0.0,0.00,0.030000,0.042426,10.0000,10.238,0.238,1,
5,6,2020-11-30,9.98,10.000,9.90,9.91,331300.0,0.0,0.0,-0.07,0.012500,0.066521,9.9775,10.250,0.340,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,202,2021-07-09,10.72,11.050,10.57,10.89,617000.0,0.0,0.0,0.17,0.127500,0.349893,10.7725,10.020,-0.870,0,
202,203,2021-07-12,10.88,11.010,10.43,10.50,479200.0,0.0,0.0,-0.38,0.026000,0.378589,10.7180,9.904,-0.596,0,
203,204,2021-07-13,10.65,11.300,10.46,10.69,853900.0,0.0,0.0,0.04,-0.054000,0.304187,10.6560,9.878,-0.812,0,
204,205,2021-07-14,10.81,10.840,9.95,10.01,1277200.0,0.0,0.0,-0.80,-0.140000,0.444353,10.5320,9.930,-0.080,0,


## Build a simple predictive model

In [58]:
from sklearn.linear_model import LogisticRegression



# clf = LogisticRegression(random_state=0).fit(X, y)
# clf.predict(X[:2, :])
# clf.predict_proba(X[:2, :])
# clf.score(X, y)

In [59]:
feats = ["DailyChangeMean", "DailyChangeStd","CloseFutureChange" ]
target = "Good"

In [64]:
X = trainDF[feats]
y = trainDF[target]

In [65]:
clf = LogisticRegression(random_state=0).fit(X, y)

In [67]:
s = clf.score(X,y)
print( f"Training set accuracy: {s}")

Training set accuracy: 0.9962754125497905


## Check generalization

In [68]:
X_val = valDF[feats]
y_val = valDF[target]

In [70]:
s = clf.score(X_val,y_val)
print( f"Validation set accuracy: {s}")

Validation set accuracy: 0.9965490797546013


## Visualize and Exploratory Data Analysis (EDA)
