In [125]:
import yfinance as yf
import pandas as pd
import numpy as np

In [126]:
def applyVolumeNormalizedVectorized(row, avgVol):
    row['VolumeNormalized'] = (row['Volume'] - avgVol) / avgVol
    return row

def applyMovementNormalized(row, avgMovement):
    row['MovemementNormalize'] = (row['Close'] - row['Open']) / avgMovement
    return row


def getData_vectorized(ticker, interval):
    #gaterh the data
    dat = yf.Ticker(ticker)
    data = dat.history( period="10y", interval=interval)

    #vectorize the volume
    avgVol = data['Volume'].mean()
    #print(avgVol)
    data = data.apply(applyVolumeNormalizedVectorized, avgVol=avgVol,  axis=1)

    #vecotorize the movement
    avgMovement = (data['Close']-data['Open']).mean()
    data = data.apply(applyMovementNormalized, avgMovement=avgMovement, axis=1)
    
    return data

In [127]:
def createSlidingWindows(data, sliding_window):
    
    col_titles= []
    col_titles.append('startingDate')
    for i in range(sliding_window):
        col_titles.append('vol'+str(i+1))
        col_titles.append('mov'+str(i+1))


    megaDb = pd.DataFrame(columns=col_titles) # new db for the vectors
    cleanDB = data[['VolumeNormalized', 'MovemementNormalize']]

    #print(cleanDB)
        
    for i in range(len(cleanDB) - (sliding_window-1)):
        ar = []
        startingDate = cleanDB.iloc[i].name
        ar.append(startingDate)
    

        for j in range(sliding_window):

            ar.append(cleanDB.iloc[i+j].VolumeNormalized)
            ar.append(cleanDB.iloc[i+j].MovemementNormalize)
       

        megaDb.loc[i] = ar
        print(ar)

        

    return megaDb

In [128]:
def window_1(startDate, ticker, interval, sliding_window):
    data = getData_vectorized(ticker, interval)
    filtered_df = data[data.index >= pd.Timestamp(startDate, tz='America/New_York')]


    if(len(filtered_df) < sliding_window):
        print("Not enough data")
        return

    filtered_df = filtered_df[0:sliding_window]

    db_row = createSlidingWindows(filtered_df, sliding_window)
    db_row.drop(columns=['startingDate'], inplace=True)
    return db_row    

In [129]:
def findSimilar(megaDb, search_vector):
    copy_db = megaDb.copy()
    copy_db.drop(columns=['startingDate'], inplace=True)

    diff_df = copy_db - search_vector.values
    norm_df = diff_df.apply(np.linalg.norm, axis=1)
    print(norm_df)


    #sort the megaDb by the norm_df
    megaDb['DIFF_SCORE'] = norm_df
    megaDb = megaDb.sort_values('DIFF_SCORE')
    print(megaDb)

    return megaDb

In [134]:
data = getData_vectorized(interval='1d', ticker='SPY')
slidingWindows_megaDB = createSlidingWindows(data, sliding_window=10)

compareVect=window_1('2024-09-03', ticker='SPY', interval='1d', sliding_window=10)

sortedSimilar = findSimilar(slidingWindows_megaDB, compareVect)

[Timestamp('2014-11-24 00:00:00-0500', tz='America/New_York'), -0.2533430447953147, 1.5081509728250535, -0.1034297942736153, -7.205780058702633, -0.29542415605496847, 5.8653728150974365, -0.3439052682648852, -4.859937841591013, 0.1783209824638256, -10.725057820145341, -0.1555761452073058, 21.45011084274644, -0.21853574371784404, 9.887106700591472, 0.034932400876191, 2.011123056967358, 0.03163323268667169, 2.178610822099877, 0.2306792689699792, -15.249761577959479]
[Timestamp('2014-11-25 00:00:00-0500', tz='America/New_York'), -0.1034297942736153, -7.205780058702633, -0.29542415605496847, 5.8653728150974365, -0.3439052682648852, -4.859937841591013, 0.1783209824638256, -10.725057820145341, -0.1555761452073058, 21.45011084274644, -0.21853574371784404, 9.887106700591472, 0.034932400876191, 2.011123056967358, 0.03163323268667169, 2.178610822099877, 0.2306792689699792, -15.249761577959479, 0.41872278901012167, 35.191724057992424]
[Timestamp('2014-11-26 00:00:00-0500', tz='America/New_York'),

In [135]:
sortedSimilar

Unnamed: 0,startingDate,vol1,mov1,vol2,mov2,vol3,mov3,vol4,mov4,vol5,...,mov6,vol7,mov7,vol8,mov8,vol9,mov9,vol10,mov10,DIFF_SCORE
2458,2024-09-03 00:00:00-04:00,-0.313192,-167.194332,-0.464779,14.945941,-0.498333,-25.508323,-0.223729,-190.909841,-0.541609,...,8.568860,-0.147174,113.987127,-0.411877,81.306260,-0.554476,45.833977,-0.584560,21.921443,0.000000
2497,2024-10-28 00:00:00-04:00,-0.658016,-34.980831,-0.513798,38.379848,-0.530389,-25.585345,-0.317924,-138.323860,-0.482430,...,-27.384896,-0.552575,119.135156,-0.227263,36.779162,-0.464685,50.571653,-0.473619,40.378264,161.598762
804,2018-02-05 00:00:00-05:00,2.339762,-170.493746,3.023680,164.583455,0.896949,-14.864204,1.793124,-185.895146,2.213774,...,27.042709,-0.079456,36.355180,0.368352,94.559309,0.260283,26.146936,0.818114,14.147691,173.429426
1401,2020-06-19 00:00:00-04:00,0.536245,-103.888533,-0.153965,49.408210,-0.223985,-27.052398,0.505236,-108.021559,0.013981,...,-114.784944,-0.095893,57.298166,0.285155,82.096306,-0.179498,17.846695,-0.214091,-37.760188,190.466767
1534,2020-12-29 00:00:00-05:00,-0.391615,-44.513722,-0.439501,-6.629801,-0.110089,39.778231,0.249069,-123.501361,-0.247162,...,72.737041,-0.220635,56.825875,-0.187650,12.691367,-0.421601,15.911179,-0.404453,-2.273521,196.415341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,2022-02-18 00:00:00-05:00,0.503302,-59.502332,0.409789,-44.531272,0.502567,-205.572422,1.424712,331.680123,0.380466,...,88.294591,0.561590,-97.123873,0.334248,105.953753,0.195699,-91.365784,0.291767,8.061927,637.101401
1859,2022-04-13 00:00:00-04:00,-0.160527,101.661524,0.109199,-110.903109,-0.251964,22.334806,-0.118019,138.244761,-0.260782,...,-201.782989,0.501363,-209.291807,0.356022,93.189685,0.178637,-187.341841,0.383021,0.577599,639.145422
1982,2022-10-10 00:00:00-04:00,-0.138172,-76.501795,0.048149,-9.708345,-0.127417,-31.261179,0.668902,325.423857,0.402367,...,54.560844,0.101191,-77.666763,-0.096192,-9.513987,0.000552,-50.871631,0.485118,178.051283,646.028944
1823,2022-02-22 00:00:00-05:00,0.409789,-44.531272,0.502567,-205.572422,1.424712,331.680123,0.380466,156.243121,0.650321,...,-97.123873,0.334248,105.953753,0.195699,-91.365784,0.291767,8.061927,0.562845,-232.636661,649.534100
