In [59]:
import yfinance as yf
import pandas as pd
import numpy as np

In [60]:
def applyVolumeNormalizedVectorized(row, avgVol):
    """Store a row's volume as its fractional deviation from ``avgVol``.

    Parameters
    ----------
    row : mapping-like (e.g. one row of a price DataFrame)
        Must expose a ``'Volume'`` entry. It is modified in place.
    avgVol : number
        Reference volume; it is subtracted from the row's volume and the
        difference is then divided by it.

    Returns
    -------
    The same ``row`` object, with ``'VolumeNormalized'`` set.
    """
    deviation = row['Volume'] - avgVol
    row['VolumeNormalized'] = deviation / avgVol
    return row

def applyMovementNormalized(row, avgMovement):
    """Store a row's open-to-close move as a multiple of ``avgMovement``.

    Parameters
    ----------
    row : mapping-like (e.g. one row of a price DataFrame)
        Must expose ``'Open'`` and ``'Close'`` entries. It is modified in place.
    avgMovement : number
        Divisor applied to ``Close - Open``.

    Returns
    -------
    The same ``row`` object, with ``'MovemementNormalize'`` set. The key is
    spelled exactly this way because other code in this file selects the
    column by that name.
    """
    move = row['Close'] - row['Open']
    row['MovemementNormalize'] = move / avgMovement
    return row


def getData_vectorized(ticker='SPY', interval='1d', period='10y'):
    """Download price history and append normalized volume/movement columns.

    The normalization is done with whole-column arithmetic instead of a
    row-by-row ``DataFrame.apply``. This gives the same values, keeps the
    original column dtypes, and always creates both derived columns, even
    when the downloaded history is empty.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.
    interval : str
        Bar interval passed to ``Ticker.history``.
    period : str
        Look-back period passed to ``Ticker.history`` (default ``'10y'``).

    Returns
    -------
    pandas.DataFrame
        The downloaded history plus two columns:

        * ``VolumeNormalized``    = (Volume - mean(Volume)) / mean(Volume)
        * ``MovemementNormalize`` = (Close - Open) / mean(Close - Open)
    """
    # gather the data
    history = yf.Ticker(ticker).history(period=period, interval=interval)

    # normalize the volume against its mean over the whole period
    avgVol = history['Volume'].mean()

    # normalize the open-to-close movement against its mean
    # NOTE(review): this is the mean of the *signed* movement, which can be
    # close to zero (inflating the scale) or negative (flipping signs).
    # A mean absolute movement may be what is intended -- confirm.
    movement = history['Close'] - history['Open']
    avgMovement = movement.mean()

    return history.assign(
        VolumeNormalized=(history['Volume'] - avgVol) / avgVol,
        MovemementNormalize=movement / avgMovement,
    )

In [61]:
def createSlidingWindows(data, sliding_window=5):
    """Flatten every run of ``sliding_window`` consecutive rows into one vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'VolumeNormalized'`` and
        ``'MovemementNormalize'``. Its index labels become ``startingDate``.
    sliding_window : int
        Number of consecutive rows per window (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per window, indexed 0..n-1, with columns
        ``startingDate, vol1, mov1, ..., vol<k>, mov<k>`` where
        ``k == sliding_window``. ``startingDate`` is the index label of the
        window's first row. The frame is empty (columns only) when ``data``
        has fewer than ``sliding_window`` rows.

    Raises
    ------
    ValueError
        If ``sliding_window`` is smaller than 1.
    """
    if sliding_window < 1:
        raise ValueError("sliding_window must be a positive integer")

    # Column names follow the window size instead of being fixed at five bars.
    feature_columns = []
    for k in range(1, sliding_window + 1):
        feature_columns.extend([f'vol{k}', f'mov{k}'])
    all_columns = ['startingDate'] + feature_columns

    cleanDB = data[['VolumeNormalized', 'MovemementNormalize']]
    n_windows = len(cleanDB) - sliding_window + 1
    if n_windows <= 0:
        return pd.DataFrame(columns=all_columns)

    values = cleanDB.to_numpy(dtype=float)
    # ravel() flattens row-major: vol_i, mov_i, vol_{i+1}, mov_{i+1}, ...
    rows = [values[i:i + sliding_window].ravel() for i in range(n_windows)]

    # Build the frame once rather than appending with .loc inside the loop.
    megaDb = pd.DataFrame(rows, columns=feature_columns)
    megaDb.insert(0, 'startingDate', cleanDB.index[:n_windows])
    return megaDb

In [84]:
def window_1(startDate, ticker='SPY', interval='1d', sliding_window=5):
    """Build the single feature vector for the window starting at ``startDate``.

    The history is downloaded and normalized with ``getData_vectorized``.
    The first ``sliding_window`` rows on or after ``startDate`` are then
    flattened with ``createSlidingWindows``.

    Parameters
    ----------
    startDate : str or datetime-like
        Earliest date to include. A timezone-naive value is interpreted in
        the timezone of the downloaded index rather than a fixed
        ``America/New_York``, so midnight means midnight on the exchange's
        own calendar. A timezone-aware value is converted to the index
        timezone.
    ticker, interval :
        Forwarded to ``getData_vectorized``.
    sliding_window : int
        Number of rows in the window.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame of ``vol*/mov*`` features without ``startingDate``,
        or ``None`` (after printing ``"Not enough data"``) when fewer than
        ``sliding_window`` rows exist on or after ``startDate``.
    """
    data = getData_vectorized(ticker, interval)

    # Align the start timestamp with the index so the >= comparison is valid
    # for both tz-aware and tz-naive indexes.
    start = pd.Timestamp(startDate)
    index_tz = getattr(data.index, 'tz', None)
    if index_tz is not None:
        if start.tzinfo is None:
            start = start.tz_localize(index_tz)
        else:
            start = start.tz_convert(index_tz)
    elif start.tzinfo is not None:
        start = start.tz_localize(None)

    window = data[data.index >= start].iloc[:sliding_window]
    if len(window) < sliding_window:
        print("Not enough data")
        return None

    db_row = createSlidingWindows(window, sliding_window)
    return db_row.drop(columns=['startingDate'])

In [None]:
def findSimilar(megaDb, search_vector):
    """Rank every window in ``megaDb`` by Euclidean distance to ``search_vector``.

    The input frame is left untouched: the score is added to a copy. The
    function can therefore be called repeatedly on the same ``megaDb``, or
    on a frame it returned earlier, without the score column leaking into
    the feature set.

    Parameters
    ----------
    megaDb : pandas.DataFrame
        Window database. All columns other than ``'startingDate'`` and
        ``'DIFF_SCORE'`` are treated as numeric features.
    search_vector : pandas.DataFrame, pandas.Series or array-like
        A single feature vector with one value per feature column, in the
        same order as those columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``megaDb`` with a ``'DIFF_SCORE'`` column, sorted ascending
        by that score (most similar first).

    Raises
    ------
    ValueError
        If ``search_vector`` is ``None`` or its length does not match the
        number of feature columns.
    """
    if search_vector is None:
        raise ValueError("search_vector is None; no window was available to compare against")

    # errors='ignore' lets a previously scored frame be passed in again.
    features = megaDb.drop(columns=['startingDate', 'DIFF_SCORE'], errors='ignore')

    target = np.asarray(search_vector, dtype=float).reshape(1, -1)
    if target.shape[1] != features.shape[1]:
        raise ValueError(
            f"search_vector has {target.shape[1]} values but megaDb has "
            f"{features.shape[1]} feature columns"
        )

    # One vectorized norm over all rows instead of a per-row apply.
    distances = np.linalg.norm(features.to_numpy(dtype=float) - target, axis=1)

    return megaDb.assign(DIFF_SCORE=distances).sort_values('DIFF_SCORE')

In [None]:
# Build the searchable database of sliding windows over the default history.
data = getData_vectorized()
slidingWindows_megaDB = createSlidingWindows(data)

# Feature vector for the window that starts on (or just after) this date.
compareVect = window_1('2024-09-21')

# window_1 returns None (after printing a message) when there are not enough
# bars after the start date; skip the search instead of crashing on it.
if compareVect is None:
    sortedSimilar = None
else:
    sortedSimilar = findSimilar(slidingWindows_megaDB, compareVect)
    # Show only the closest matches rather than the whole table.
    print(sortedSimilar.head(10))

Unnamed: 0,startingDate,vol1,mov1,vol2,mov2,vol3,mov3,vol4,mov4,vol5,mov5
0,2014-11-24 00:00:00-05:00,-0.253343,1.508151,-0.103430,-7.205780,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057
1,2014-11-25 00:00:00-05:00,-0.103430,-7.205780,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116
2,2014-11-26 00:00:00-05:00,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107
3,2014-11-28 00:00:00-05:00,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107,0.034932,2.011124
4,2014-12-01 00:00:00-05:00,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107,0.034932,2.011124,0.031633,2.178610
...,...,...,...,...,...,...,...,...,...,...,...
2508,2024-11-12 00:00:00-05:00,-0.512592,-35.579871,-0.462924,-3.597882,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936
2509,2024-11-13 00:00:00-05:00,-0.462924,-3.597882,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095
2510,2024-11-14 00:00:00-05:00,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095,-0.432958,2.398588
2511,2024-11-15 00:00:00-05:00,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095,-0.432958,2.398588,-0.470158,5.396213
