In [1]:
import os

dataPath = '/home/endre/Endre_finance_ml_data/'
splittedPath = dataPath + 'generated/WIKI_PRICES_SPLIT/'

numberOfFiles = 0
namePathDict = {}
for subdir, dirs, files in os.walk(splittedPath):
    for file in files:
        filePath = os.path.join(subdir, file)
        namePathDict[file.split('.')[0]] = filePath
        numberOfFiles += 1

print("Total number of files:", numberOfFiles)

Total number of files: 3188


In [2]:
import pandas as pd
# import numpy as np

def readTickerAsPandas(ticker):
    return pd.read_csv(splittedPath+ticker+'.csv')

In [3]:
class BadNumberInArrayError(ValueError):
    '''There was a bad number in the array'''

def percentChange(old, new):
    return ((new-old)/old)*100.0

import math
def checkBadNumbers(rangeName, array):
    for num in array:
        if (math.isnan(num) or not math.isfinite(num)):
            raise BadNumberInArrayError("Got a bad number in this array: {} -> {}".format(rangeName, array))

def featurizeTicker(rangeNames, features, labels, ticker, df):
    selected = df[["date", "adj_close", "adj_volume"]]
    
    # 21 days per month, 12 months = roughly a year.
    numDaysPeriod = 21
    numPeriods = 18
    jumpDays = 3  # Number of days to jump between each feature (reduce number of feature sets)

    rangeDays = numDaysPeriod * numPeriods

    # RANGE: "the entire year" - which spans the feature set
    # PERIOD: "each month"
    
    featureSetDate = None
    
    idx = 0
    for featureSetIdx in range(rangeDays-1, selected.shape[0] - numDaysPeriod, jumpDays):
        # DEBUG::
        idx += 1
        if (idx > rangeDays*43525324532):
            break
        # DEBUG END!

        # The "name" of this feature set
        featureSetDate = selected.iloc[featureSetIdx, 0]

        # The start of this feature set's range
        rangeStartIdx = featureSetIdx-rangeDays+1
        # The end of this feature set's range
        rangeEndIdx = featureSetIdx
        # The start of this feature set's range
        rangeStartDate = selected.iloc[rangeStartIdx, 0]
        
        # The volume average over the range preceeding this feature date
        averageVolumeOverRange = selected.iloc[rangeStartIdx:rangeEndIdx+1, 2].mean()

        rangeStartPrice = selected.iloc[rangeStartIdx, 1]

        ## NAME THIS RANGE
        rangeName = []
        rangeNames.append(rangeName)
        rangeName.append(ticker)
        rangeName.append(featureSetDate)
        
        ## CREATE THE FEATURES INTO 'features'
        rangeFeatures = []
        features.append(rangeFeatures)
        for preceedingPeriodIdx in range(-numPeriods+1, 1):
            periodStartIdx = featureSetIdx + (preceedingPeriodIdx - 1) * numDaysPeriod + 1
            periodEndIdx = featureSetIdx + preceedingPeriodIdx * numDaysPeriod
            periodStartDate = selected.iloc[periodStartIdx, 0]
            periodEndDate = selected.iloc[periodEndIdx, 0]

            periodStartPrice = selected.iloc[periodStartIdx, 1]
            periodEndPrice = selected.iloc[periodEndIdx, 1]

            f_periodPriceChange = percentChange(periodStartPrice, periodEndPrice)
            f_fromStartRangeToPeriodEndPriceChange = percentChange(rangeStartPrice, periodEndPrice)

            averageVolumeOverPeriod = selected.iloc[periodStartIdx:periodEndIdx+1, 2].mean()
            f_averageVolumeRatio = averageVolumeOverPeriod / averageVolumeOverRange

            # print("   #{}: {} - {}, f_periodPriceChange:{}, f_fromStartRangeToPeriodEndPriceChange:{}, f_averageVolumeRatio:{}".\
            #      format(preceedingPeriodIdx, periodStartDate, periodEndDate, f_periodPriceChange, f_fromStartRangeToPeriodEndPriceChange, f_averageVolumeRatio))

            rangeFeatures.append(f_periodPriceChange)
            rangeFeatures.append(f_fromStartRangeToPeriodEndPriceChange)
            rangeFeatures.append(f_averageVolumeRatio)
        
        checkBadNumbers(rangeName, rangeFeatures)
        
        ## CREATE THE LABELS INTO 'labels'
        rangeLabels = []
        labels.append(rangeLabels)
        featureSetEndPrice = selected.iloc[featureSetIdx, 1]
        rangeLabels.append(percentChange(featureSetEndPrice, selected.iloc[featureSetIdx+5, 1]))
        rangeLabels.append(percentChange(featureSetEndPrice, selected.iloc[featureSetIdx+10, 1]))
        rangeLabels.append(percentChange(featureSetEndPrice, selected.iloc[featureSetIdx+15, 1]))
        rangeLabels.append(percentChange(featureSetEndPrice, selected.iloc[featureSetIdx+20, 1]))
        
        checkBadNumbers(rangeName, rangeLabels)
    
    # Return last feature set date
    return featureSetDate

In [4]:
from timeit import default_timer as timer

rangeNames = []
features = []
labels = []

total_start = timer()
idx_ticker = 0
for ticker, path in namePathDict.items():
    # DEBUG::
    idx_ticker += 1
    if (idx_ticker > 10000):
        print ("EARLY BREAK!")
        break
    # DEBUG END!
    start = timer()
    df = readTickerAsPandas(ticker)
    lastFeatureSetDate = featurizeTicker(rangeNames, features, labels, ticker, df)
    millis = (timer()-start) * 1000
    print ("{}: {}, last:{} :: took {} ms - current length: {}".\
           format(ticker, df.shape, lastFeatureSetDate, millis, len(features)))

print("Total time: {}".format((timer() - total_start)*1000))


QTM: (4499, 14), last:2017-05-15 :: took 24358.535848999963 ms - current length: 1367
HTGC: (3027, 14), last:2017-05-17 :: took 16102.092798999365 ms - current length: 2244
ED: (11974, 14), last:2017-05-16 :: took 70350.52233500028 ms - current length: 6103
FIVN: (807, 14), last:2017-05-17 :: took 2512.0386479993613 ms - current length: 6240
CVEO: (778, 14), last:2017-05-16 :: took 2375.6534879994433 ms - current length: 6367
NCI: (5210, 14), last:2017-05-15 :: took 28853.691264999725 ms - current length: 7971
LDOS: (2685, 14), last:2017-05-17 :: took 13644.446127000265 ms - current length: 8734
IGTE: (4665, 14), last:2015-06-02 :: took 25788.426041001003 ms - current length: 10157
AAPL: (9207, 14), last:2017-05-17 :: took 52911.62616600013 ms - current length: 13094
HEOP: (4515, 14), last:2017-03-02 :: took 24599.45044199958 ms - current length: 14467
BRKR: (4243, 14), last:2017-05-16 :: took 24240.21599200023 ms - current length: 15749
PBPB: (932, 14), last:2017-05-15 :: took 3246.07

FRGI: (1294, 14), last:2017-05-16 :: took 5493.324494000262 ms - current length: 130939
HSY: (8058, 14), last:2017-05-17 :: took 46433.557164000376 ms - current length: 133493
ZAGG: (2465, 14), last:2017-05-15 :: took 12326.90154699958 ms - current length: 134182
ARRS: (5981, 14), last:2017-05-15 :: took 33531.89162600029 ms - current length: 136043
BRSS: (1025, 14), last:2017-05-15 :: took 3665.3921060024004 ms - current length: 136252
ADVS: (4942, 14), last:2015-06-04 :: took 27369.84074800057 ms - current length: 137767
CLH: (6862, 14), last:2017-05-16 :: took 38973.88768900055 ms - current length: 139922
MCD: (11974, 14), last:2017-05-16 :: took 71378.51284499993 ms - current length: 143781
GES: (5250, 14), last:2017-05-17 :: took 29375.873494998814 ms - current length: 145399
EBIO: (847, 14), last:2017-05-16 :: took 2778.248505001102 ms - current length: 145549
AMZN: (5055, 14), last:2017-05-17 :: took 27827.448302999983 ms - current length: 147102
NUS: (5175, 14), last:2017-05-17

DWA: (2975, 14), last:2016-07-20 :: took 15100.804961999529 ms - current length: 259520
HCC: (5867, 14), last:2017-05-15 :: took 31764.870311002596 ms - current length: 261343
TRNX: (1173, 14), last:2015-09-01 :: took 4510.479757998837 ms - current length: 261602
RFMD: (4425, 14), last:2014-12-01 :: took 23337.837036000565 ms - current length: 262945
LKFN: (4993, 14), last:2017-05-16 :: took 26772.146699000587 ms - current length: 264477
MTH: (7182, 14), last:2017-05-17 :: took 39608.52923499988 ms - current length: 266739
EDIG: (5514, 14), last:2015-02-04 :: took 29652.62828900086 ms - current length: 268445
VVI: (3235, 14), last:2017-05-16 :: took 16412.705074999394 ms - current length: 269391
SB: (2279, 14), last:2017-05-15 :: took 10937.738832999457 ms - current length: 270018
KMG: (5131, 14), last:2017-05-16 :: took 27528.74263999911 ms - current length: 271596
ARR: (2402, 14), last:2017-05-15 :: took 11652.453602000605 ms - current length: 272264
TSLA: (1755, 14), last:2017-05-17

MTD: (4928, 14), last:2017-05-15 :: took 26369.121476000146 ms - current length: 384855
VSEC: (5496, 14), last:2017-05-17 :: took 29779.759171997284 ms - current length: 386555
BIDU: (2987, 14), last:2017-05-15 :: took 14984.847566000099 ms - current length: 387418
HNR: (6927, 14), last:2017-04-04 :: took 38000.16027399761 ms - current length: 389595
PRFT: (4501, 14), last:2017-05-16 :: took 23883.670405000885 ms - current length: 390963
GRPN: (1412, 14), last:2017-05-15 :: took 5910.424163997959 ms - current length: 391301
DRI: (5567, 14), last:2017-05-15 :: took 30011.14751699788 ms - current length: 393024
NLY: (4955, 14), last:2017-05-15 :: took 26653.95897900089 ms - current length: 394543
RPM: (6862, 14), last:2017-05-16 :: took 37512.453944997105 ms - current length: 396698
TECH: (6176, 14), last:2017-05-15 :: took 33531.86969399758 ms - current length: 398624
NYT: (11132, 14), last:2017-05-15 :: took 62452.03702999788 ms - current length: 402202
MTSI: (1323, 14), last:2017-05-1

TREE: (2228, 14), last:2017-05-15 :: took 10667.766490998474 ms - current length: 516039
BCOR: (4656, 14), last:2017-05-17 :: took 24875.496656000905 ms - current length: 517459
PCTY: (819, 14), last:2017-05-17 :: took 2447.935788997711 ms - current length: 517600
SMTC: (6362, 14), last:2017-05-15 :: took 35057.447635001154 ms - current length: 519588
UFPT: (5917, 14), last:2017-05-16 :: took 32448.739586001466 ms - current length: 521428
MCHX: (3327, 14), last:2017-05-17 :: took 17649.446490999253 ms - current length: 522405
EVR: (2730, 14), last:2017-05-17 :: took 13978.323457002261 ms - current length: 523183
TSYS: (3908, 14), last:2016-01-20 :: took 20681.51919199954 ms - current length: 524353
BIO: (9409, 14), last:2017-05-16 :: took 53250.45934199807 ms - current length: 527357
SAFM: (6927, 14), last:2017-05-17 :: took 38301.14449100074 ms - current length: 529534
MU: (7079, 14), last:2017-05-15 :: took 39278.914759001054 ms - current length: 531761
MKTO: (819, 14), last:2016-07-

BCOV: (1341, 14), last:2017-05-17 :: took 5506.880668999656 ms - current length: 652537
MNKD: (3246, 14), last:2017-05-17 :: took 16601.127434001683 ms - current length: 653487
URG: (2240, 14), last:2017-05-15 :: took 10688.561222999851 ms - current length: 654101
CUB: (7138, 14), last:2017-05-16 :: took 39174.075615999755 ms - current length: 656348
DRL: (5797, 14), last:2015-01-27 :: took 31523.3947859997 ms - current length: 658148
CSFL: (4121, 14), last:2017-05-15 :: took 21737.84968399923 ms - current length: 659389
PCAR: (7801, 14), last:2017-05-16 :: took 42991.619222000736 ms - current length: 661857
NPO: (3793, 14), last:2017-05-16 :: took 19761.52376399841 ms - current length: 662989
IMI: (1402, 14), last:2017-05-16 :: took 5835.789892000321 ms - current length: 663324
NBS: (4094, 14), last:2015-05-04 :: took 21417.177309998806 ms - current length: 664556
BXC: (3149, 14), last:2017-05-15 :: took 15945.208631001151 ms - current length: 665473
FL: (11974, 14), last:2017-05-16 :

SKT: (6058, 14), last:2017-05-16 :: took 33085.3044889991 ms - current length: 788738
RJF: (7427, 14), last:2017-05-15 :: took 40847.39835200162 ms - current length: 791081
MN: (1402, 14), last:2017-05-16 :: took 5856.880287999957 ms - current length: 791416
WDFC: (6862, 14), last:2017-05-16 :: took 37647.913679997146 ms - current length: 793571
NM: (3133, 14), last:2017-05-16 :: took 16109.878740997374 ms - current length: 794483
CFG: (688, 14), last:2017-05-16 :: took 1765.1755029983178 ms - current length: 794580
AEGR: (1537, 14), last:2016-10-27 :: took 6736.151826000423 ms - current length: 794960
AGYS: (6862, 14), last:2017-05-16 :: took 38192.36638000075 ms - current length: 797115
SUBK: (5996, 14), last:2015-11-13 :: took 32965.879257997585 ms - current length: 798981
SCHN: (5939, 14), last:2017-05-15 :: took 32145.564109003317 ms - current length: 800828
SKUL: (1311, 14), last:2016-09-01 :: took 5343.943700001546 ms - current length: 801133
DPS: (2295, 14), last:2017-05-17 :: 

UBSH: (5496, 14), last:2017-05-17 :: took 29726.931018998584 ms - current length: 926764
ROIAK: (4255, 14), last:2017-04-04 :: took 22595.416283002123 ms - current length: 928050
HSII: (4566, 14), last:2017-05-17 :: took 24300.8603140006 ms - current length: 929440
KCG: (1000, 14), last:2017-05-16 :: took 3497.5625689985463 ms - current length: 929641
DECK: (5961, 14), last:2017-05-17 :: took 32515.436500998476 ms - current length: 931496
WTM: (7842, 14), last:2017-05-17 :: took 43190.892192997126 ms - current length: 933978
EXAC: (5299, 14), last:2017-05-16 :: took 28667.282562000764 ms - current length: 935612
ORBC: (2671, 14), last:2017-05-16 :: took 13239.881546000106 ms - current length: 936370
WTSL: (5769, 14), last:2014-12-23 :: took 31404.519947001972 ms - current length: 938161
AROW: (8264, 14), last:2017-05-15 :: took 45816.85603600272 ms - current length: 940783
WFM: (6399, 14), last:2017-05-17 :: took 34827.895123002236 ms - current length: 942784
PRA: (6497, 14), last:2017

BadNumberInArrayError: Got a bad number in this array: ['GMT', '2001-08-27'] -> [-2.1489255372311682, nan, -23.913043478261294, -15.942028985506226]

In [6]:
result = {'rangeNames': rangeNames, 'features': features, 'labels': labels}

In [7]:
import pickle

pickle.dump(result, open("RangeNamesFeaturesAndLabels.pickle", "wb"))

In [5]:
len(rangeNames)

1008922