In [17]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [2]:
from src.inference import get_feature_store

In [3]:
from datetime import datetime, timedelta
import pandas as pd  

# Get the current datetime64[us, Etc/UTC]  
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

# read time-series data from the feature store
fetch_data_to = current_date - timedelta(hours=1)
fetch_data_from = current_date - timedelta(days=1*29)
print(f"Fetching data from {fetch_data_from} to {fetch_data_to}")
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME, version=config.FEATURE_VIEW_VERSION
)

ts_data = feature_view.get_batch_data(
    start_time=(fetch_data_from - timedelta(days=1)),
    end_time=(fetch_data_to + timedelta(days=1)),
)
ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)]

2025-03-06 11:15:08,906 INFO: Initializing external client
2025-03-06 11:15:08,908 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-06 11:15:09,800 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214708
Fetching data from 2025-02-05 16:15:08.898245+00:00 to 2025-03-06 15:15:08.898245+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.69s) 


In [4]:
ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-06 06:00:00+00:00,2,0
1,2025-02-06 07:00:00+00:00,2,0
2,2025-02-06 08:00:00+00:00,2,0
3,2025-02-06 09:00:00+00:00,2,0
4,2025-02-06 10:00:00+00:00,2,0
...,...,...,...
169843,2025-03-06 03:00:00+00:00,263,4
169844,2025-03-06 04:00:00+00:00,263,6
169845,2025-03-06 05:00:00+00:00,263,39
169846,2025-03-06 06:00:00+00:00,263,73


In [5]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169848 entries, 0 to 169847
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype                  
---  ------              --------------   -----                  
 0   pickup_hour         169848 non-null  datetime64[us, Etc/UTC]
 1   pickup_location_id  169848 non-null  int32                  
 2   rides               169848 non-null  int32                  
dtypes: datetime64[us, Etc/UTC](1), int32(2)
memory usage: 2.6 MB


In [6]:
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)

In [7]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169848 entries, 0 to 169847
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   pickup_hour         169848 non-null  datetime64[us]
 1   pickup_location_id  169848 non-null  int32         
 2   rides               169848 non-null  int32         
dtypes: datetime64[us](1), int32(2)
memory usage: 2.6 MB


In [8]:
# First, let's analyze data availability per location
location_data_counts = ts_data.groupby('pickup_location_id').size()
print(f"Locations with data: {len(location_data_counts)}")

# Calculate optimal window size based on data availability
min_records = location_data_counts.min()
median_records = location_data_counts.median()
print(f"Minimum records per location: {min_records}")
print(f"Median records per location: {median_records}")

# Dynamically adjust window size
optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps

print(f"\nUsing optimized parameters:")
print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
print(f"Step size: {optimal_step} hours")

# Filter locations with sufficient data
sufficient_locations = location_data_counts[location_data_counts >= optimal_window].index
ts_data_filtered = ts_data[ts_data.pickup_location_id.isin(sufficient_locations)]
from src.data_utils import transform_ts_data_info_features_and_target

# Transform with optimized parameters
features, targets = transform_ts_data_info_features_and_target(
    ts_data_filtered,
    window_size=optimal_window,
    step_size=optimal_step
)

print(f"\nFeatures generated: {features.shape}")
print(f"Locations included: {features['pickup_location_id'].nunique()}")

Locations with data: 252
Minimum records per location: 674
Median records per location: 674.0

Using optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours

Features generated: (18396, 170)
Locations included: 252


In [9]:
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,50,34,13,13,4,38,55,35,0,...,15,32,12,4,1,25,45,40,2025-02-06 16:00:00,261
1,55,35,0,13,5,58,16,29,0,4,...,40,31,58,32,19,26,50,9,2025-02-18 13:00:00,261
2,29,0,4,49,3,11,1,2,0,41,...,9,19,43,1,28,42,23,27,2025-02-22 00:00:00,261
3,2,0,41,43,0,20,22,12,0,41,...,27,5,0,32,15,3,37,14,2025-02-15 01:00:00,261
4,12,0,41,1,16,0,2,1,23,6,...,14,9,25,5,13,14,78,1,2025-02-28 22:00:00,261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18391,0,0,0,0,2,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2025-02-12 04:00:00,150
18392,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,2025-03-03 03:00:00,150
18393,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,2025-02-09 05:00:00,150
18394,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,2,0,0,0,2025-03-01 22:00:00,150


In [10]:
from src.inference import load_batch_of_features_from_store, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config

# Get current date and feature store
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

try:
    # Step 1: Get raw data
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = current_date - timedelta(days=14)  # Reduced to 14 days
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    # Step 2: Fetch and prepare data
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    # Step 3: Analyze data availability
    location_data_counts = ts_data.groupby('pickup_location_id').size()
    min_records = location_data_counts.min()
    print(f"Minimum records per location: {min_records}")
    
    # Step 4: Calculate optimal window size
    optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
    optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps
    
    print(f"\nOptimized parameters:")
    print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
    print(f"Step size: {optimal_step} hours")
    
    # Step 5: Transform with optimized parameters
    from src.data_utils import transform_ts_data_info_features
    features = transform_ts_data_info_features(
        ts_data,
        window_size=optimal_window,
        step_size=optimal_step
    )
    
    print(f"\nFeatures generated successfully:")
    print(f"Shape: {features.shape}")
    print(f"Locations: {features['pickup_location_id'].nunique()}")
    
except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Information:")
    if 'ts_data' in locals():
        print(f"Data shape: {ts_data.shape}")
        print(f"Date range: {ts_data.pickup_hour.min()} to {ts_data.pickup_hour.max()}")
    features = None

# Return the features
features


2025-03-06 11:15:19,918 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-06 11:15:19,926 INFO: Initializing external client
2025-03-06 11:15:19,929 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-06 11:15:20,661 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214708
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.04s) 
Minimum records per location: 327

Optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours

Features generated successfully:
Shape: (5796, 170)
Locations: 252


Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,50,34,38,55,13,58,16,4,3,...,21,11,0,11,37,2,2,33,2025-02-24 20:00:00,261
1,16,4,3,1,2,41,0,22,1,16,...,33,14,11,50,25,8,0,2,2025-03-04 01:00:00,261
2,22,1,16,2,23,6,4,2,49,25,...,2,0,13,0,0,31,26,45,2025-03-02 00:00:00,261
3,2,49,25,16,2,45,13,13,42,46,...,45,7,35,5,15,7,43,10,2025-03-02 05:00:00,261
4,13,42,46,24,18,5,19,62,5,0,...,10,1,32,1,12,0,48,14,2025-02-23 05:00:00,261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,183,242,105,73,2,263,283,186,130,19,...,2,116,82,61,378,276,182,2,2025-03-02 12:00:00,234
5792,186,130,19,190,152,6,239,7,136,237,...,2,154,29,13,5,101,134,0,2025-02-22 02:00:00,234
5793,7,136,237,8,20,21,211,209,29,43,...,0,53,115,14,113,41,130,12,2025-02-27 15:00:00,234
5794,209,29,43,231,134,10,61,126,199,107,...,12,189,152,107,204,1,96,67,2025-03-01 08:00:00,234


In [11]:
current_date

Timestamp('2025-03-06 16:15:19.918369+0000', tz='Etc/UTC')

In [12]:
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,50,34,38,55,13,58,16,4,3,...,21,11,0,11,37,2,2,33,2025-02-24 20:00:00,261
1,16,4,3,1,2,41,0,22,1,16,...,33,14,11,50,25,8,0,2,2025-03-04 01:00:00,261
2,22,1,16,2,23,6,4,2,49,25,...,2,0,13,0,0,31,26,45,2025-03-02 00:00:00,261
3,2,49,25,16,2,45,13,13,42,46,...,45,7,35,5,15,7,43,10,2025-03-02 05:00:00,261
4,13,42,46,24,18,5,19,62,5,0,...,10,1,32,1,12,0,48,14,2025-02-23 05:00:00,261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,183,242,105,73,2,263,283,186,130,19,...,2,116,82,61,378,276,182,2,2025-03-02 12:00:00,234
5792,186,130,19,190,152,6,239,7,136,237,...,2,154,29,13,5,101,134,0,2025-02-22 02:00:00,234
5793,7,136,237,8,20,21,211,209,29,43,...,0,53,115,14,113,41,130,12,2025-02-27 15:00:00,234
5794,209,29,43,231,134,10,61,126,199,107,...,12,189,152,107,204,1,96,67,2025-03-01 08:00:00,234


In [13]:
from src.inference import load_model_from_registry

model = load_model_from_registry()

2025-03-06 11:15:32,343 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-06 11:15:32,351 INFO: Initializing external client
2025-03-06 11:15:32,351 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-06 11:15:33,089 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214708
Downloading model artifact (0 dirs, 1 files)... DONE

In [14]:
from src.inference import load_model_from_registry, get_model_predictions, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config
import lightgbm as lgb

try:
    # Step 1: Get feature store and data
    feature_store = get_feature_store()
    current_date = pd.Timestamp.now(tz='Etc/UTC')
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = fetch_data_to - timedelta(days=25)
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    ts_data['pickup_hour'] = ts_data['pickup_hour'].dt.tz_localize(None)
    ts_data = ts_data.sort_values(['pickup_location_id', 'pickup_hour'])
    
    print(f"Data loaded: {len(ts_data)} records")
    
    # Step 2: Generate features
    from src.data_utils import transform_ts_data_info_features_and_target
    features, targets = transform_ts_data_info_features_and_target(
        ts_data,
        window_size=504,
        step_size=24
    )
    
    # Step 3: Add missing required columns with zeros
    features['rides_t-672'] = 0
    
    # Step 4: Load model and modify its parameters
    model = load_model_from_registry()
    if isinstance(model, lgb.Booster):
        model.params['predict_disable_shape_check'] = True
    elif hasattr(model, 'steps') and isinstance(model.steps[-1][1], lgb.LGBMRegressor):
        model.steps[-1][1].set_params(predict_disable_shape_check=True)
    
    # Step 5: Generate predictions
    predictions = get_model_predictions(model, features)
    
    if predictions is not None and not predictions.empty:
        results = predictions.sort_values("predicted_demand", ascending=False)
        print("\nTop 10 locations by predicted demand:")
        print(results[["pickup_location_id", "predicted_demand"]].head(10))
        print(f"\nTotal predictions: {len(predictions)}")
        
        # Save predictions
        results.to_csv('/tmp/predictions.csv', index=False)
        print("\nPredictions saved to /tmp/predictions.csv")

except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Info:")
    if 'features' in locals():
        print(f"Available features shape: {features.shape}")
        print(f"Available columns: {features.columns.tolist()[:5]}")
    predictions = None

# Display predictions
predictions

2025-03-06 11:15:35,188 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-06 11:15:35,199 INFO: Initializing external client
2025-03-06 11:15:35,201 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-06 11:15:35,957 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214708
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.88s) 
Data loaded: 149184 records
2025-03-06 11:15:46,037 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-06 11:15:46,045 INFO: Initializing external client
2025-03-06 11:15:46,047 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-06 11:15:46,733 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214708
Downloading model artifact (0 dirs, 1 files)... DONE
Top 10 locations by predicted demand:
     pickup_location_id  predicted_demand
901          

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
1003,262,32.0
1004,263,29.0
1005,263,50.0
1006,263,43.0


In [15]:
predictions

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
1003,262,32.0
1004,263,29.0
1005,263,50.0
1006,263,43.0


In [16]:
predictions.sort_values("predicted_demand", ascending=False).head(10)["pickup_location_id"].values

array([237, 236, 237, 161, 236, 161, 237, 162, 237, 161])