In [59]:
import yfinance as yf
import pandas as pd
import numpy as np

In [60]:
def applyVolumeNormalizedVectorized(row, avgVol):
    row['VolumeNormalized'] = (row['Volume'] - avgVol) / avgVol
    return row

def applyMovementNormalized(row, avgMovement):
    row['MovemementNormalize'] = (row['Close'] - row['Open']) / avgMovement
    return row


def getData_vectorized(ticker='SPY', interval='1d'):
    #gaterh the data
    dat = yf.Ticker(ticker)
    data = dat.history( period="10y", interval=interval)

    #vectorize the volume
    avgVol = data['Volume'].mean()
    #print(avgVol)
    data = data.apply(applyVolumeNormalizedVectorized, avgVol=avgVol,  axis=1)

    #vecotorize the movement
    avgMovement = (data['Close']-data['Open']).mean()
    data = data.apply(applyMovementNormalized, avgMovement=avgMovement, axis=1)
    
    return data

In [61]:
def createSlidingWindows(data, sliding_window=5):
    
    megaDb = pd.DataFrame(columns=['startingDate', 'vol1', 'mov1', 'vol2', 'mov2', 'vol3', 'mov3', 'vol4', 'mov4', 'vol5', 'mov5']) # new db for the vectors
    cleanDB = data[['VolumeNormalized', 'MovemementNormalize']]

    #print(cleanDB)
        
    for i in range(len(cleanDB) - (sliding_window-1)):
        ar = []
        startingDate = cleanDB.iloc[i].name
        ar.append(startingDate)
    

        for j in range(sliding_window):

            ar.append(cleanDB.iloc[i+j].VolumeNormalized)
            ar.append(cleanDB.iloc[i+j].MovemementNormalize)

        
        megaDb.loc[i] = ar
        #print(ar)

    return megaDb

In [66]:
def findSimilar(megaDb, search_vector):
    copy_db = megaDb.copy()
    copy_db.drop(columns=['startingDate'], inplace=True)

    diff_df = copy_db - search_vector.values
    norm_df = diff_df.apply(np.linalg.norm, axis=1)
    print(norm_df)


    #sort the megaDb by the norm_df
    megaDb['DIFF_SCORE'] = norm_df
    megaDb = megaDb.sort_values('DIFF_SCORE')
    print(megaDb)

    return megaDb

    
    

In [63]:
data = getData_vectorized()
slidingWindows_megaDB = createSlidingWindows(data)

slidingWindows_megaDB

Unnamed: 0,startingDate,vol1,mov1,vol2,mov2,vol3,mov3,vol4,mov4,vol5,mov5
0,2014-11-24 00:00:00-05:00,-0.253343,1.508151,-0.103430,-7.205780,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057
1,2014-11-25 00:00:00-05:00,-0.103430,-7.205780,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116
2,2014-11-26 00:00:00-05:00,-0.295424,5.865374,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107
3,2014-11-28 00:00:00-05:00,-0.343905,-4.859939,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107,0.034932,2.011124
4,2014-12-01 00:00:00-05:00,0.178321,-10.725057,-0.155576,21.450116,-0.218536,9.887107,0.034932,2.011124,0.031633,2.178610
...,...,...,...,...,...,...,...,...,...,...,...
2508,2024-11-12 00:00:00-05:00,-0.512592,-35.579871,-0.462924,-3.597882,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936
2509,2024-11-13 00:00:00-05:00,-0.462924,-3.597882,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095
2510,2024-11-14 00:00:00-05:00,-0.559082,-79.357155,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095,-0.432958,2.398588
2511,2024-11-15 00:00:00-05:00,-0.138785,-79.355935,-0.579709,38.579936,-0.439991,111.738095,-0.432958,2.398588,-0.470158,5.396213


In [67]:
compareVect = pd.DataFrame(columns=[ 'vol1', 'mov1', 'vol2', 'mov2', 'vol3', 'mov3', 'vol4', 'mov4', 'vol5', 'mov5']) # new db for the vectors
compareVect.loc[0] = [-0, 38, -.43, 111, -.43, 2.39, -.47, 5, -.5, 36.0]


sortedSimilar = findSimilar(slidingWindows_megaDB, compareVect)

0       132.655396
1       116.657471
2       124.825064
3       134.897340
4       107.717355
           ...    
2508    179.867300
2509    226.964382
2510    252.666373
2511    178.654667
2512      1.529812
Length: 2513, dtype: float64
                  startingDate      vol1        mov1      vol2        mov2  \
2512 2024-11-18 00:00:00-05:00 -0.579709   38.579936 -0.439991  111.738095   
2277 2023-12-12 00:00:00-05:00 -0.225612   48.714229  0.057162  118.531534   
2396 2024-06-04 00:00:00-04:00 -0.607492   38.337065 -0.460410   77.468252   
1468 2020-09-24 00:00:00-04:00 -0.130936   43.004335 -0.194538  115.999030   
865  2018-05-03 00:00:00-04:00  0.544880    6.472786  0.033861   80.913122   
...                        ...       ...         ...       ...         ...   
1822 2022-02-18 00:00:00-05:00  0.503302  -59.502333  0.409789  -44.531273   
1985 2022-10-13 00:00:00-04:00  0.668902  325.423882  0.402367 -212.029937   
1864 2022-04-21 00:00:00-04:00 -0.031927 -201.783019  0.5013

In [68]:
sortedSimilar

Unnamed: 0,startingDate,vol1,mov1,vol2,mov2,vol3,mov3,vol4,mov4,vol5,mov5,DIFF_SCORE
2512,2024-11-18 00:00:00-05:00,-0.579709,38.579936,-0.439991,111.738095,-0.432958,2.398588,-0.470158,5.396213,-0.568277,36.980471,1.529812
2277,2023-12-12 00:00:00-05:00,-0.225612,48.714229,0.057162,118.531534,0.348976,-9.663760,0.601636,-3.168443,-0.202405,19.604097,25.583059
2396,2024-06-04 00:00:00-04:00,-0.607492,38.337065,-0.460410,77.468252,-0.650833,-6.356575,-0.510117,6.953072,-0.595064,49.261935,37.163384
1468,2020-09-24 00:00:00-04:00,-0.130936,43.004335,-0.194538,115.999030,-0.268033,18.295737,-0.418549,-30.178604,0.179598,33.951148,39.310577
865,2018-05-03 00:00:00-04:00,0.544880,6.472786,0.033861,80.913122,-0.373205,0.539399,-0.235001,7.552135,-0.323777,32.725000,43.822948
...,...,...,...,...,...,...,...,...,...,...,...,...
1822,2022-02-18 00:00:00-05:00,0.503302,-59.502333,0.409789,-44.531273,0.502567,-205.572438,1.424712,331.680100,0.380466,156.243133,445.117455
1985,2022-10-13 00:00:00-04:00,0.668902,325.423882,0.402367,-212.029937,0.055917,54.560853,0.101191,-77.666762,-0.096192,-9.513989,445.634164
1864,2022-04-21 00:00:00-04:00,-0.031927,-201.783019,0.501363,-209.291807,0.356022,93.189699,0.178637,-187.341869,0.383021,0.577599,454.510892
1863,2022-04-20 00:00:00-04:00,-0.260782,-42.551944,-0.031927,-201.783019,0.501363,-209.291807,0.356022,93.189699,0.178637,-187.341869,454.744182
