In [1]:
# %pip list
#!jupyter nbconvert --to script lr_test.ipynb


In [2]:
# Basic setup
import numpy as np
import pandas as pd
%matplotlib inline

In [3]:
# %pip install yfinance

In [4]:
ls

README.md		features.py		someData.pkl
[1m[34mbayesian[39;49m[0m/		lr_test.ipynb		train.pkl
data.py			lr_test.py		val.pkl
demo.py			nasdaqlisted.txt


## A few reminder examples of plotting and running a script

In [5]:
# %matplotlib inline
# import numpy as np
# import matplotlib.pyplot as plt

# t = np.arange(0, 5, 0.2)
# plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**3, 'g^')
# plt.show()

In [6]:
# %run demo.py

## Load Nasdaq summary of Symbols

In [7]:
nasdaq = pd.read_csv('nasdaqlisted.txt', delimiter='|')
print(nasdaq.head())


  Symbol                                      Security Name Market Category  \
0   AACG  ATA Creativity Global - American Depositary Sh...               G   
1  AACIU                  Armada Acquisition Corp. I - Unit               G   
2   AADI               Aadi Bioscience, Inc. - Common Stock               S   
3   AADR                AdvisorShares Dorsey Wright ADR ETF               G   
4    AAL       American Airlines Group, Inc. - Common Stock               Q   

  Test Issue Financial Status  Round Lot Size ETF NextShares  
0          N                N             100   N          N  
1          N                N             100   N          N  
2          N                N             100   N          N  
3          N                N             100   Y          N  
4          N                N             100   N          N  


## Decide on training and validation sets

In [8]:
Ntotal = 100
Ntrain = 70

symbols = nasdaq['Symbol'].sample(Ntotal)

symbols_train = list(symbols[:Ntrain].values)
symbols_val= list(symbols[Ntrain:].values)

print( "Train symbols: ", symbols_train, '\n')
print( "Val symbols: ", symbols_val)

Train symbols:  ['RXRA', 'LNSR', 'BSMT', 'SMH', 'VCKAU', 'ALT', 'MSON', 'URGN', 'PCSA', 'VIAC', 'PWFL', 'FLMN', 'CPTAG', 'ONVO', 'COMSW', 'MLAB', 'GLAD', 'PAYS', 'KLAC', 'VIHAU', 'ZNTEU', 'OCGN', 'HYRE', 'GLBS', 'BHFAN', 'HOFT', 'QTR', 'GWII', 'PWP', 'FITBP', 'PSCI', 'STFC', 'SVVC', 'CENTA', 'DARE', 'ZVO', 'GCACU', 'CYXTW', 'LFVN', 'METC', 'LAMR', 'ZXYZ.A', 'JANX', 'AEY', 'QUIK', 'ESGRO', 'IMGN', 'PACX', 'NXTC', 'FWP', 'TVACU', 'PBIP', 'EPHYU', 'PFG', 'DLTR', 'AXNX', 'LVRAW', 'MON', 'TCDA', 'SGMA', 'EVBG', 'OTMO', 'WDC', 'TRON', 'PDCE', 'KPTI', 'SFIX', 'LSXMK', 'SKYA', 'CELC'] 

Val symbols:  ['EACPU', 'ESSCW', 'NUWE', 'KLXE', 'ADSK', 'AIH', 'CMBM', 'PBTS', 'EYES', 'OCCIP', 'PLCE', 'IDRA', 'BSAE', 'LMNL', 'GANX', 'WDFC', 'SILK', 'AIRR', 'ESLT', 'SOLOW', 'OCG', 'MRCY', 'BSMN', 'EJFAU', 'SGAM', 'IMXI', 'FAD', 'CUE', 'SLCR', 'ANPC']


## Download and format historical data
- download the data
- generate features of interest and labels
- concatenate into a large dataframe


In [9]:
import yfinance as yf
# from features import addFeatures

In [10]:



def addFeatures( tickerDf, closingRollAvgInterval_D = 7, 
                 dailyChangeRollAvgInterval_D = 7, delay = -14):  # days):

    #     delay in days, negative value 

    # make Data a value you can work with
    tickerDf.reset_index(inplace=True)

    # --------
    # daily change features
    #   rolling average, rolling std.
    tickerDf['DailyChange'] = tickerDf['Close'] - tickerDf['Open']
    tickerDf['DailyChangeMean'] = tickerDf.rolling(
        f"{dailyChangeRollAvgInterval_D}D", on='Date')['DailyChange'].mean()
    tickerDf['DailyChangeStd'] = tickerDf.rolling(
        f"{dailyChangeRollAvgInterval_D}D", on='Date')['DailyChange'].std()


    # ------
    # closing mean features ( for estimating good/bad at a delay)
    #   rolling average closing mean
    tickerDf['CloseAvg'] = tickerDf.rolling(
        f"{closingRollAvgInterval_D}D", on='Date')['Close'].mean()


    # -------------------------
    # Time delay of moving average closing
    #   For identifying good/bad


    tickerDf['CloseAvgDelayed'] = tickerDf['CloseAvg'].shift(
        periods=delay, fill_value=np.nan)
    tickerDf['CloseFutureChange'] = tickerDf['CloseAvgDelayed'] - tickerDf['Close']

    # If the average is up at the delay, then good (else bad)
    tickerDf['Good'] = 0  # preset all bad
    tickerDf.loc[tickerDf['CloseFutureChange'] > 0, 'Good'] = 1

    # clean for return
    tickerDf.dropna(inplace=True)
    # tickerDf_updated = tickerDf
    return tickerDf


In [11]:
# ticker = yf.Ticker(symbols_train[0])
# tDF = ticker.history(period='1d', start='2010-1-1', end='2020-1-25')
# tDF.reset_index(inplace=True)

In [12]:

# tDF

In [13]:
def getSymbolDF( symbols):
    ticker_df_list = []
    for tickerStr in symbols:
        ticker = yf.Ticker(tickerStr)
        tDF = ticker.history(period='1d', start='2020-1-1', end='2021-8-25')
        #         tDF = ticker.history(period='1d', start='2010-1-1', end='2020-1-25')
        tDF = addFeatures(tDF)
        ticker_df_list.append(addFeatures(tDF) )
    
    df = pd.concat(ticker_df_list)        
    return df

In [14]:
trainDF = getSymbolDF( symbols_train )
valDF = getSymbolDF( symbols_val )

trainDF.to_pickle( 'train.pkl')
valDF.to_pickle( 'val.pkl')

- COMSW: No data found for this date range, symbol may be delisted
- QTR: Data doesn't exist for startDate = 1577862000, endDate = 1629871200
- GWII: Data doesn't exist for startDate = 1577862000, endDate = 1629871200
- CYXTW: No data found for this date range, symbol may be delisted
- ZXYZ.A: No data found, symbol may be delisted
- LVRAW: No data found for this date range, symbol may be delisted
- ESSCW: No data found for this date range, symbol may be delisted
- SOLOW: No data found for this date range, symbol may be delisted


In [15]:
trainDF

Unnamed: 0,index,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DailyChange,DailyChangeMean,DailyChangeStd,CloseAvg,CloseAvgDelayed,CloseFutureChange,Good,Adj Close
1,2,2021-05-03,9.740000,9.750000,9.700000,9.700000,13300.0,0.0,0.0,-0.040000,-0.055000,0.021214,9.700000,9.8000,0.100000,1,
2,3,2021-05-04,9.700000,9.799000,9.700000,9.750000,103100.0,0.0,0.0,0.050000,-0.020000,0.062450,9.716667,9.8480,0.098000,1,
3,4,2021-05-05,9.700000,9.760000,9.700000,9.750000,35900.0,0.0,0.0,0.050000,-0.002500,0.061847,9.725000,9.8836,0.133600,1,
4,5,2021-05-06,9.730000,9.730000,9.730000,9.730000,100.0,0.0,0.0,0.000000,-0.002000,0.053573,9.726000,9.8376,0.107600,1,
5,6,2021-05-07,9.730000,9.730000,9.730000,9.730000,0.0,0.0,0.0,0.000000,0.012000,0.038341,9.732000,9.8376,0.107600,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,382,2021-07-09,24.230000,25.540001,24.230000,24.870001,51100.0,0.0,0.0,0.640001,0.377500,0.580367,24.290000,20.8140,-4.056001,0,
382,383,2021-07-12,24.879999,24.945999,23.480000,24.120001,64100.0,0.0,0.0,-0.759998,0.150000,0.715122,24.256001,20.4940,-3.626001,0,
383,384,2021-07-13,23.980000,24.180000,23.105000,23.410000,72800.0,0.0,0.0,-0.570000,-0.007999,0.780109,24.088000,20.4000,-3.010000,0,
384,385,2021-07-14,23.500000,24.285000,23.290001,23.700001,62100.0,0.0,0.0,0.200001,0.102001,0.758300,24.090001,20.4120,-3.288000,0,


## Build a simple predictive model

In [16]:
from sklearn.linear_model import LogisticRegression



# clf = LogisticRegression(random_state=0).fit(X, y)
# clf.predict(X[:2, :])
# clf.predict_proba(X[:2, :])
# clf.score(X, y)

In [27]:
feats = ["DailyChangeMean", "DailyChangeStd" ]
target = "Good"

In [28]:
X = trainDF[feats]
y = trainDF[target]

In [29]:
clf = LogisticRegression(random_state=0).fit(X, y)

In [30]:
s = clf.score(X,y)
print( f"Training set accuracy: {s}")

Training set accuracy: 0.5292419491314518


## Check generalization

In [31]:
X_val = valDF[feats]
y_val = valDF[target]

In [32]:
s = clf.score(X_val,y_val)
print( f"Validation set accuracy: {s}")

Validation set accuracy: 0.5324632230947904


## Visualize and Exploratory Data Analysis (EDA)


In [None]:
trainDF['Good'].value_counts()