In [94]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [96]:
from src.inference import get_feature_store

In [97]:
from datetime import datetime, timedelta
import pandas as pd  

# Get the current datetime64[us, Etc/UTC]  
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

# read time-series data from the feature store
fetch_data_to = current_date - timedelta(hours=1)
fetch_data_from = current_date - timedelta(days=1*29)
print(f"Fetching data from {fetch_data_from} to {fetch_data_to}")
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME, version=config.FEATURE_VIEW_VERSION
)

ts_data = feature_view.get_batch_data(
    start_time=(fetch_data_from - timedelta(days=1)),
    end_time=(fetch_data_to + timedelta(days=1)),
)
ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)]

2025-03-04 11:05:42,452 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 11:05:42,452 INFO: Initializing external client
2025-03-04 11:05:42,452 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 11:05:43,047 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215673
Fetching data from 2025-02-03 16:05:42.452369+00:00 to 2025-03-04 15:05:42.452369+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.95s) 


In [98]:
ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-04 04:00:00+00:00,2,0
1,2025-02-04 05:00:00+00:00,2,0
2,2025-02-04 06:00:00+00:00,2,0
3,2025-02-04 07:00:00+00:00,2,0
4,2025-02-04 08:00:00+00:00,2,0
...,...,...,...
171679,2025-03-04 11:00:00+00:00,263,98
171680,2025-03-04 12:00:00+00:00,263,100
171681,2025-03-04 13:00:00+00:00,263,100
171682,2025-03-04 14:00:00+00:00,263,146


In [99]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171684 entries, 0 to 171683
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype                  
---  ------              --------------   -----                  
 0   pickup_hour         171684 non-null  datetime64[us, Etc/UTC]
 1   pickup_location_id  171684 non-null  int32                  
 2   rides               171684 non-null  int32                  
dtypes: datetime64[us, Etc/UTC](1), int32(2)
memory usage: 2.6 MB


In [100]:
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)

In [101]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171684 entries, 0 to 171683
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   pickup_hour         171684 non-null  datetime64[us]
 1   pickup_location_id  171684 non-null  int32         
 2   rides               171684 non-null  int32         
dtypes: datetime64[us](1), int32(2)
memory usage: 2.6 MB


In [102]:
import pandas as pd
from src.data_utils import transform_ts_data_info_features_and_target

# Assuming ts_data is already defined and loaded
# First, let's analyze data availability per location
location_data_counts = ts_data.groupby('pickup_location_id').size()
print(f"Locations with data: {len(location_data_counts)}")

# Calculate optimal window size based on data availability
min_records = location_data_counts.min()
median_records = location_data_counts.median()
print(f"Minimum records per location: {min_records}")
print(f"Median records per location: {median_records}")

# Dynamically adjust window size
optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps

print(f"\nUsing optimized parameters:")
print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
print(f"Step size: {optimal_step} hours")

# Filter locations with sufficient data
sufficient_locations = location_data_counts[location_data_counts >= optimal_window].index
ts_data_filtered = ts_data[ts_data.pickup_location_id.isin(sufficient_locations)]

# Transform with optimized parameters
features, targets = transform_ts_data_info_features_and_target(
    ts_data_filtered,
    window_size=optimal_window,
    step_size=optimal_step
)

print(f"\nFeatures generated: {features.shape}")
print(f"Locations included: {features['pickup_location_id'].nunique()}")

Locations with data: 251
Minimum records per location: 684
Median records per location: 684.0

Using optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours

Features generated: (18574, 170)
Locations included: 251


In [103]:
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,170,82,74,41,110,13,39,38,39,115,...,48,85,16,642,49,79,124,118,2025-03-02 19:00:00,79
1,38,39,115,79,113,201,98,54,6,48,...,118,116,100,35,49,197,94,11,2025-02-13 15:00:00,79
2,54,6,48,5,89,62,155,89,28,17,...,11,70,29,53,55,30,15,7,2025-02-07 03:00:00,79
3,89,28,17,18,68,44,109,93,490,68,...,7,59,177,239,69,91,18,45,2025-02-04 19:00:00,79
4,93,490,68,219,81,136,58,109,90,27,...,45,84,94,49,87,40,75,18,2025-02-16 04:00:00,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18569,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,2025-02-07 06:00:00,259
18570,0,0,0,1,0,0,0,2,0,0,...,0,1,0,0,0,0,0,0,2025-02-10 06:00:00,259
18571,2,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,2025-02-25 17:00:00,259
18572,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2025-02-08 03:00:00,259


In [104]:
from src.inference import load_batch_of_features_from_store, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config

# Get current date and feature store
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

try:
    # Step 1: Get raw data
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = current_date - timedelta(days=14)  # Reduced to 14 days
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    # Step 2: Fetch and prepare data
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    # Step 3: Analyze data availability
    location_data_counts = ts_data.groupby('pickup_location_id').size()
    min_records = location_data_counts.min()
    print(f"Minimum records per location: {min_records}")
    
    # Step 4: Calculate optimal window size
    optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
    optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps
    
    print(f"\nOptimized parameters:")
    print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
    print(f"Step size: {optimal_step} hours")
    
    # Step 5: Transform with optimized parameters
    from src.data_utils import transform_ts_data_info_features
    features = transform_ts_data_info_features(
        ts_data,
        window_size=optimal_window,
        step_size=optimal_step
    )
    
    print(f"\nFeatures generated successfully:")
    print(f"Shape: {features.shape}")
    print(f"Locations: {features['pickup_location_id'].nunique()}")
    
except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Information:")
    if 'ts_data' in locals():
        print(f"Data shape: {ts_data.shape}")
        print(f"Date range: {ts_data.pickup_hour.min()} to {ts_data.pickup_hour.max()}")
    features = None

# Return the features
features


2025-03-04 11:05:55,609 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 11:05:55,625 INFO: Initializing external client
2025-03-04 11:05:55,625 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 11:05:56,150 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215673
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.31s) 
Minimum records per location: 335

Optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours
Error: cannot import name 'transform_ts_data_info_features' from 'src.data_utils' (d:\EAS-500\sp25_taxi-main\src\data_utils.py)

Debug Information:
Data shape: (84085, 3)
Date range: 2025-02-18 17:00:00+00:00 to 2025-03-04 15:00:00+00:00


In [105]:
current_date

Timestamp('2025-03-04 16:05:55.609423+0000', tz='Etc/UTC')

In [106]:
features

In [107]:
from src.inference import load_model_from_registry

model = load_model_from_registry()

2025-03-04 11:06:03,507 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 11:06:03,527 INFO: Initializing external client
2025-03-04 11:06:03,527 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 11:06:04,043 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215673
Downloading model artifact (0 dirs, 1 files)... DONE

In [108]:
from src.inference import load_model_from_registry, get_model_predictions, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config
import lightgbm as lgb

try:
    # Step 1: Get feature store and data
    feature_store = get_feature_store()
    current_date = pd.Timestamp.now(tz='Etc/UTC')
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = fetch_data_to - timedelta(days=25)
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    ts_data['pickup_hour'] = ts_data['pickup_hour'].dt.tz_localize(None)
    ts_data = ts_data.sort_values(['pickup_location_id', 'pickup_hour'])
    
    print(f"Data loaded: {len(ts_data)} records")
    
    # Step 2: Generate features
    from src.data_utils import transform_ts_data_info_features_and_target
    features, targets = transform_ts_data_info_features_and_target(
        ts_data,
        window_size=504,
        step_size=24
    )
    
    # Step 3: Add missing required columns with zeros
    features['rides_t-672'] = 0
    
    # Step 4: Load model and modify its parameters
    model = load_model_from_registry()
    if isinstance(model, lgb.Booster):
        model.params['predict_disable_shape_check'] = True
    elif hasattr(model, 'steps') and isinstance(model.steps[-1][1], lgb.LGBMRegressor):
        model.steps[-1][1].set_params(predict_disable_shape_check=True)
    
    # Step 5: Generate predictions
    predictions = get_model_predictions(model, features)
    
    if predictions is not None and not predictions.empty:
        results = predictions.sort_values("predicted_demand", ascending=False)
        print("\nTop 10 locations by predicted demand:")
        print(results[["pickup_location_id", "predicted_demand"]].head(10))
        print(f"\nTotal predictions: {len(predictions)}")
        
        # Save predictions
        results.to_csv('/tmp/predictions.csv', index=False)
        print("\nPredictions saved to /tmp/predictions.csv")

except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Info:")
    if 'features' in locals():
        print(f"Available features shape: {features.shape}")
        print(f"Available columns: {features.columns.tolist()[:5]}")
    predictions = None

# Display predictions
predictions

2025-03-04 11:06:06,746 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 11:06:06,760 INFO: Initializing external client
2025-03-04 11:06:06,761 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 11:06:07,336 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215673
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.87s) 
Data loaded: 150600 records
2025-03-04 11:06:16,895 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 11:06:16,903 INFO: Initializing external client
2025-03-04 11:06:16,903 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 11:06:17,493 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215673
Downloading model artifact (0 dirs, 1 files)... DONE
Top 10 locations by predicted demand:
     pickup_location_id  predicted_demand
491          

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
999,262,24.0
1000,263,72.0
1001,263,37.0
1002,263,31.0


In [109]:
predictions

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
999,262,24.0
1000,263,72.0
1001,263,37.0
1002,263,31.0


In [110]:
predictions.sort_values("predicted_demand", ascending=False).head(10)["pickup_location_id"].values

array([132, 237, 132, 161, 132, 239, 186, 132, 236, 230])