In [1]:
%%bash
# Register the project's virtual environment as a selectable Jupyter kernel.
# Activate the virtual environment located in ./.env for the commands below.
source .env/bin/activate 
# Install ipykernel with uv; -q keeps the install output quiet.
uv pip install ipykernel -q

# Install a user-level kernelspec named "mlops", displayed as "Python (practice)".
python -m ipykernel install --user --name=mlops --display-name="Python (practice)"

Installed kernelspec mlops in /home/jupyter/.local/share/jupyter/kernels/mlops


In [1]:
import os
import warnings

# Setup environment path for consistent package management
os.environ['PATH'] = os.path.abspath('.env/bin') + ':' + os.environ.get('PATH', '')

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
%env PYTHONWARNINGS=ignore
%env JUPYTER_PLATFORM_DIRS=1

env: JUPYTER_PLATFORM_DIRS=1


In [2]:
# Import required libraries
import pandas as pd
from datetime import datetime, timedelta

# Load the housing dataset
data = pd.read_csv('data/housing.csv')
print(f"Original dataset shape: {data.shape}")

# Add timestamps for point-in-time feature serving.
# Records are spaced 5 minutes apart and the series ENDS at the current time:
# starting at "now" and counting forward would push almost every event
# timestamp into the future (len(data) * 5 minutes ahead of the run time).
end_date = datetime.now()
start_date = end_date - timedelta(minutes=5 * max(len(data) - 1, 0))
# Vectorised equivalent of building one datetime per row in a Python loop
timestamps = pd.date_range(start=start_date, periods=len(data), freq="5min")
data['event_timestamp'] = timestamps

print("Dataset with timestamps:")
data.head(10)

Original dataset shape: (20640, 10)
Dataset with timestamps:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,event_timestamp
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,2025-07-18 15:21:36.889108
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,2025-07-18 15:26:36.889108
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2025-07-18 15:31:36.889108
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,2025-07-18 15:36:36.889108
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,2025-07-18 15:41:36.889108
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,2025-07-18 15:46:36.889108
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,2025-07-18 15:51:36.889108
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY,2025-07-18 15:56:36.889108
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,2025-07-18 16:01:36.889108
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,2025-07-18 16:06:36.889108


In [4]:
# Use the DataFrame's row index as the entity key column for each record.
data["entity_id"]=data.index

In [6]:
# Inspect column dtypes and non-null counts after adding event_timestamp and entity_id.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   longitude           20640 non-null  float64       
 1   latitude            20640 non-null  float64       
 2   housing_median_age  20640 non-null  float64       
 3   total_rooms         20640 non-null  float64       
 4   total_bedrooms      20433 non-null  float64       
 5   population          20640 non-null  float64       
 6   households          20640 non-null  float64       
 7   median_income       20640 non-null  float64       
 8   median_house_value  20640 non-null  float64       
 9   ocean_proximity     20640 non-null  object        
 10  event_timestamp     20640 non-null  datetime64[ns]
 11  entity_id           20640 non-null  int64         
dtypes: datetime64[ns](1), float64(9), int64(1), object(1)
memory usage: 1.9+ MB


In [8]:
# Sanity-check the shape of the first 10,000 rows before building the entity dataframe.
data.iloc[:10000].shape

(10000, 12)

In [9]:
# Restrict the entity dataframe to the first 10,000 housing records.
filtered_data = data.iloc[:10000]

# Entity dataframe: one (entity_id, event_timestamp) pair per record.
# This will be used for getting historical features.
entity_columns = ['entity_id', 'event_timestamp']
result = filtered_data.loc[:, entity_columns]

# Persist the entity dataframe (without the index) for later use.
result.to_csv("data/entity.csv", index=False)
print(f"Entity dataframe created with {len(result)} records")
print("Entity dataframe preview:")
result.head()

Entity dataframe created with 10000 records
Entity dataframe preview:


Unnamed: 0,entity_id,event_timestamp
0,0,2025-07-18 15:21:36.889108
1,1,2025-07-18 15:26:36.889108
2,2,2025-07-18 15:31:36.889108
3,3,2025-07-18 15:36:36.889108
4,4,2025-07-18 15:41:36.889108


In [14]:
# Re-check the full DataFrame's columns and dtypes before uploading it to BigQuery.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   longitude           20640 non-null  float64       
 1   latitude            20640 non-null  float64       
 2   housing_median_age  20640 non-null  float64       
 3   total_rooms         20640 non-null  float64       
 4   total_bedrooms      20433 non-null  float64       
 5   population          20640 non-null  float64       
 6   households          20640 non-null  float64       
 7   median_income       20640 non-null  float64       
 8   median_house_value  20640 non-null  float64       
 9   ocean_proximity     20640 non-null  object        
 10  event_timestamp     20640 non-null  datetime64[ns]
 11  entity_id           20640 non-null  int64         
dtypes: datetime64[ns](1), float64(9), int64(1), object(1)
memory usage: 1.9+ MB


In [19]:
import pandas_gbq

# Explicit BigQuery schema, listed in the same order as the DataFrame columns.
# All numeric housing attributes are uploaded as FLOAT.
float_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
table_schema = [{'name': column, 'type': 'FLOAT'} for column in float_columns]
table_schema += [
    {'name': 'ocean_proximity', 'type': 'STRING'},
    {'name': 'event_timestamp', 'type': 'TIMESTAMP'},
    {'name': 'entity_id', 'type': 'INTEGER'},
]

# Upload the dataframe to BigQuery.
# if_exists="replace" overwrites the table when it already exists.
upload_options = {
    'project_id': "arcane-rigging-461217-m1",
    'if_exists': "replace",
    'table_schema': table_schema,
}
pandas_gbq.to_gbq(data, "PRACTICE.housing", **upload_options)

print(f"Data successfully uploaded to BigQuery table: {'PRACTICE.housing'}")
print(f"Table contains {len(data)} records")

100%|██████████| 1/1 [00:00<00:00, 9425.40it/s]

Data successfully uploaded to BigQuery table: PRACTICE.housing
Table contains 20640 records





In [20]:
%%bash
# Initialize a new Feast repository named "Feast" in the current directory,
# passing the GCP template (-t gcp).
# NOTE(review): -m presumably requests a minimal repository; confirm how it
# interacts with -t gcp and whether re-running this cell on an existing
# Feast/ directory is safe.
feast init -m Feast -t gcp

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.



Creating a new Feast repository in /home/jupyter/Feast.



In [21]:
# Switch the kernel's working directory to the feature repo so the files written
# below and the feast commands run inside it. The path is relative, so this cell
# is meant to be run once from the notebook's starting directory.
%cd Feast/feature_repo

/home/jupyter/Feast/feature_repo


In [22]:
# Create feature store configuration
# This configures Feast to use BigQuery as offline store and Datastore as online store.
# The deployment-specific values are named constants instead of string literals
# buried inside f-string placeholders, so they can be changed in one place.
GCP_PROJECT_ID = "arcane-rigging-461217-m1"
REGISTRY_PATH = "gs://practice-oppe-arcane-rigging-461217-m1/feast/registry.db"
OFFLINE_STORE_DATASET = "PRACTICE_OPPE"
ONLINE_STORE_NAMESPACE = "PRACTICE_OPPE_ONLINE"

feature_store = f"""project: Feast
registry: {REGISTRY_PATH}
provider: gcp
entity_key_serialization_version: 2

offline_store:
  type: bigquery
  dataset: {OFFLINE_STORE_DATASET}

online_store:
  type: datastore
  project_id: {GCP_PROJECT_ID}
  namespace: {ONLINE_STORE_NAMESPACE}
"""

# Write configuration to feature_store.yaml (overwrites any existing file)
with open('feature_store.yaml', "w", encoding="utf-8") as feature_store_file:
    feature_store_file.write(feature_store)

print("Feature store configuration created successfully!")

Feature store configuration created successfully!


In [25]:
# Source code for feature_repo.py: definitions of the entity, feature view,
# and feature service that `feast apply` will register.
# This is a plain (non-f) string: nothing is interpolated, so braces in the
# generated code are written literally instead of being escaped as {{ }}.
# The "flower_*" identifiers and the registered names are kept unchanged so
# anything that already refers to them keeps working.
flower_features = """
from datetime import timedelta
from feast import BigQuerySource, FeatureView, FeatureService, Entity, ValueType

# Entity: one housing record, keyed by the entity_id column
flower_entity = Entity(
    name="entity_id",
    description="Unique identifier of a housing record",
    value_type=ValueType.INT64
)

# Feature view over the housing table in BigQuery
flower_features = FeatureView(
    name="flower_features",
    entities=[flower_entity],
    ttl=timedelta(weeks=52),  # Time-to-live for features
    source=BigQuerySource(
        table="PRACTICE.housing",
        timestamp_field="event_timestamp"
    ),
    tags={"assignment":"week_3"}
)

# Create feature service for one model version
# FeatureService groups features for specific use cases
model_v1 = FeatureService(
    name="feast_model_v1",
    features=[flower_features]
)
"""

# Write feature definitions to feature_repo.py (overwrites any existing file)
with open('feature_repo.py', "w") as feature_repo_file:
    feature_repo_file.write(flower_features)

print("Feature repository definitions created successfully!")

Feature repository definitions created successfully!


In [26]:
# Apply the definitions in this feature repo to the Feast project configured in
# feature_store.yaml (run from inside Feast/feature_repo).
!feast apply

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
No project found in the repository. Using project name Feast defined in feature_store.yaml
Applying changes for project Feast
Deploying infrastructure for [1m[32mflower_features[0m


In [None]:
# Materialize features for the window 2025-06-20 .. 2025-07-20.
# NOTE(review): the window is hard-coded; presumably only rows whose
# event_timestamp falls inside it are loaded into the online store — confirm it
# matches the timestamps generated earlier in this notebook.
!feast materialize 2025-06-20 2025-07-20