In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
!printf feast_speed_testing/**\\ndata/**\\n.*\\n*.ipynb\\nregistry.db > .feastignore

In [None]:
# Install Feast with its `postgres` and `redis` extras, plus psycopg2, using
# %pip so the packages land in the environment this kernel runs in.
%pip install 'feast[postgres, redis]' psycopg2

In [None]:
# Run `feast apply` from the notebook's working directory.
# NOTE(review): presumably this registers the definitions in property_repo — confirm.
!feast apply

In [None]:
from numpy import arange, random
from datetime import datetime

# Number of synthetic address rows; every column below is sized from this value.
NUM_ADDRESSES = 10000

# One row per address id in [0, NUM_ADDRESSES).
addresses = arange(0, NUM_ADDRESSES)
# Integer bed counts in [1, 10) - numpy's upper bound is exclusive.
num_beds = random.randint(1, 10, NUM_ADDRESSES)
# Uniform floats in [0, 1.1).
norm_basement_sq_ft = random.rand(NUM_ADDRESSES) * 1.1
# Each row is stamped with the naive local wall-clock time at generation.
timestamps = [datetime.now() for _ in range(NUM_ADDRESSES)]

In [None]:
from pandas import DataFrame

# Assemble the generated columns into one frame. `event_timestamp` and
# `created` are both filled from the same `timestamps` list.
data = DataFrame(
    {
        "address_id": addresses,
        "num_beds": num_beds,
        "norm_basement_sq_ft": norm_basement_sq_ft,
        "event_timestamp": timestamps,
        "created": timestamps,
    }
)

In [None]:
from os import getenv
from sqlalchemy import create_engine, engine

# Connection settings are read from the environment, so no credentials are
# written into the notebook.
connection_string = engine.URL.create(
    "postgresql",
    username=getenv("DB_USERNAME"),
    password=getenv("DB_PASSWORD"),
    host=getenv("DB_HOST"),
    database=getenv("DB_NAME"),
)
this_engine = create_engine(connection_string)

# Append the generated rows to feast.address_values; the frame's index is not
# written as a column.
data.to_sql(
    name="address_values",
    con=this_engine,
    schema="feast",
    if_exists="append",
    index=False,
)

In [None]:
from psycopg2 import connect

# Sanity check: connect directly with psycopg2 and count the rows in
# feast.address_values whose address_id is below 1000.
try:
    conn = connect(
        dbname=getenv("DB_NAME"),
        user=getenv("DB_USERNAME"),
        host=getenv("DB_HOST"),
        password=getenv("DB_PASSWORD"),
    )
except Exception:
    # Keep the friendly hint, but re-raise: continuing would only fail a line
    # later with a NameError on `conn` and hide the real cause.
    print("I am unable to connect to the database")
    raise

try:
    with conn.cursor() as curs:
        curs.execute("SELECT count(1) from feast.address_values where address_id < 1000")
        single_row = curs.fetchone()
        print(single_row)
finally:
    # Release the connection even when the query itself fails.
    conn.close()

In [None]:
!feast materialize 2024-09-01T00:00:00 2026-01-01T00:00:00

In [None]:
from feast import FeatureStore

# Open the feature repository located in the notebook's working directory.
store = FeatureStore(repo_path=".")

In [None]:
from pandas import DataFrame
from datetime import datetime

# Entity rows for the historical lookup: four fixed timestamps from
# 2021-04-12 plus one row for address 1001 stamped with the current time.
lookup_times = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
    datetime(2021, 4, 12, 15, 1, 12),
    datetime.now(),
]
entity_df = DataFrame(
    {
        "address_id": [1001, 1002, 1003, 1004, 1001],
        "event_timestamp": lookup_times,
    }
)

# Ask the store for the two `address_data` features and materialize the
# result as a DataFrame.
historical_job = store.get_historical_features(
    entity_df=entity_df,
    features=["address_data:num_beds", "address_data:norm_basement_sq_ft"],
)
training_df = historical_job.to_df()
print(training_df.head())

In [None]:
from property_repo import address_v1

# Online lookup of `address_v1` for a single row: the address id plus a
# `home_sq_ft` value supplied with the request.
online_response = store.get_online_features(
    features=address_v1,
    entity_rows=[{"address_id": 1001, "home_sq_ft": 1500}],
)
features = online_response.to_dict()

def print_online_features(features):
    """Print every entry of `features` as `key  :  value`, ordered by key.

    `features` is a mapping, here the dict returned by
    `get_online_features(...).to_dict()`. Nothing is returned.
    """
    for name in sorted(features):
        print(name, features[name], sep="  :  ")

# Show the values fetched above, one feature per line.
print_online_features(features)

In [None]:
# Fetch a single stored feature, `address_data:norm_basement_sq_ft`, from the
# online store for address 1001.
features = store.get_online_features(
    features=[
        "address_data:norm_basement_sq_ft",
    ],
    entity_rows=[
        {
            "address_id": 1001,
        }
    ],
).to_dict()

# print_online_features is already defined in the previous cell; reuse it
# instead of shadowing it with an identical second definition.
print_online_features(features)

## Everything below this point is used to test the speed once you have validated that everything else is functioning correctly.

In [None]:
# These names are imported again here even though earlier cells already did so.
# NOTE(review): presumably so the speed test can be run on its own after a
# kernel restart — confirm.
from feast import FeatureStore
from numpy import random
from property_repo import address_v1
from pandas import DataFrame

# Open the feature repository located in the notebook's working directory.
store = FeatureStore(repo_path=".")

In [None]:
def run():
    """Do one online lookup of `address_v1` for a random address id.

    The id is drawn from [0, 10000) and sent together with fixed
    `home_sq_ft`, `num_beds` and `basement_sq_ft` values. The response is
    converted to a DataFrame and discarded; the function returns None and
    exists only to be timed.
    """
    sampled_id = random.randint(0, 10000, 1)[0]
    request_row = {
        "address_id": sampled_id,
        "home_sq_ft": 1500,
        "num_beds": 1,
        "basement_sq_ft": 500,
    }

    response = store.get_online_features(
        features=address_v1,
        entity_rows=[request_row],
    )
    response.to_df()
    
def run_straight_retrieval():
    """Do one online lookup of `address_data:norm_basement_sq_ft` for a random address id.

    The request row has the same shape as in `run` (a random id from
    [0, 10000) plus fixed `home_sq_ft`, `num_beds` and `basement_sq_ft`).
    The response is converted to a DataFrame and discarded; returns None.
    """
    sampled_id = random.randint(0, 10000, 1)[0]
    request_row = {
        "address_id": sampled_id,
        "home_sq_ft": 1500,
        "num_beds": 1,
        "basement_sq_ft": 500,
    }

    response = store.get_online_features(
        features=["address_data:norm_basement_sq_ft"],
        entity_rows=[request_row],
    )
    response.to_df()


def run_in_memory():
    """Compute the three normalized columns with pandas only, without touching the store.

    Builds a one-row frame from a random address id in [0, 10000) and fixed
    `home_sq_ft`, `num_beds` and `basement_sq_ft` values, then divides those
    three columns by 3500, 10 and 1500 respectively. The result is discarded;
    returns None.
    """
    sampled_id = random.randint(0, 10000, 1)[0]

    request_frame = DataFrame(
        [
            {
                "address_id": sampled_id,
                "home_sq_ft": 1500,
                "num_beds": 1,
                "basement_sq_ft": 500,
            }
        ]
    )

    # Columns are assigned one at a time, exactly as before, so the timed work
    # stays the same.
    normalized = DataFrame()
    normalized["norm_home_sq_ft"] = request_frame["home_sq_ft"] / 3500
    normalized["norm_num_beds"] = request_frame["num_beds"] / 10
    normalized["norm_basement_sq_ft"] = request_frame["basement_sq_ft"] / 1500

    
def run_straight_retrieval_in_memory():
    """Compute only `norm_basement_sq_ft` with pandas, without touching the store.

    Builds the same one-row frame as `run_in_memory` (random address id in
    [0, 10000) plus fixed values) and divides `basement_sq_ft` by 1500. The
    result is discarded; returns None.
    """
    sampled_id = random.randint(0, 10000, 1)[0]

    request_frame = DataFrame(
        [
            {
                "address_id": sampled_id,
                "home_sq_ft": 1500,
                "num_beds": 1,
                "basement_sq_ft": 500,
            }
        ]
    )

    normalized = DataFrame()
    normalized["norm_basement_sq_ft"] = request_frame["basement_sq_ft"] / 1500

In [None]:
from timeit import timeit
from numpy.random import seed

# Fix numpy's global seed so each benchmark draws the same sequence of
# address ids.
seed(1235124)

# Function under test. Swap in run, run_straight_retrieval, run_in_memory or
# run_straight_retrieval_in_memory to time each of the four variants.
BENCHMARK = run
# Don't go higher than 10,000 - it really starts to struggle.
NUM_CALLS = 10000

# Total seconds taken by NUM_CALLS calls. The function is passed directly so
# no extra lambda frame is timed on every call.
timeit(BENCHMARK, number=NUM_CALLS)