In [None]:
!pip install colorama lightgbm xgboost

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle
import gzip
import os
import glob
import datetime
import warnings
warnings.filterwarnings('ignore')

np.random.seed(7)
from colorama import Fore
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error

In [None]:
weather_columns = ['date', 'avg_wind_speed', 'peak_gust_time', 'precipitation', 'snow', 'snow_depth', 'temp_avg', 'temp_max', 'temp_min', 'tot_sunshine',
                  'dir_fwind_2min', 'dir_fwind_5min', 'speed_fwind_2min', 'speed_fwind_5min', 'fog', 'heavy_fog', 'thunder', 'ice_pellets', 'hail', 'glaze', 'smoke']
traffic_columns = ['PUZone', 'Count', 'PUTime']

In [None]:
from datetime import datetime, timedelta

def datetime_range(end, delta, count):

        current = datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
        for i in range(count):
            yield current
            current -= delta

In [None]:
from sklearn.model_selection import cross_validate
def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        "Cross validation Mean Absolute Error:",mae
    )


In [None]:
high_zone = [0] * 73
for h in [1, 15, 24, 25, 26, 30, 31, 37, 41, 43, 44, 46, 62, 69]:
    high_zone[h] = 1

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

class Model():

    def __init__(self):
        self.random_forest = RandomForestRegressor(n_estimators=100)
        self.decision_tree = DecisionTreeRegressor()
        self.mlp = MLPRegressor(hidden_layer_sizes=(300,150), max_iter=200,activation ='relu',solver='adam',random_state=1)
        self.xgb = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
        self.lgbm = LGBMRegressor(learning_rate = 0.01, num_iterations = 1000)
        self.svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
#         self.ensemble = VotingRegressor([('lgbm', self.lgbm), ('xgb', self.xgb), ('svr', self.svr)])
        self.ensemble = VotingRegressor([('lgbm', self.lgbm), ('xgb', self.xgb), ('rf', self.random_forest), ('mlp', self.mlp)])
        self.count = 0
        self.to_keep = []
        self.weather_to_keep = []
        self.train_drop_list = ['Count(0)', 'date']
        # self.model = self.ensemble
        self.model = self.ensemble
        return

    def train_model(self, traffic, weather):
        df = self.preprocessing(traffic, weather)
        # Create the 'data' directory if it doesn't exist
        if not os.path.exists('data'):
            os.makedirs('data')
        df.to_csv('data/new_merged.csv', index=False)

        train = df.loc[df.date < '2017-05-01']
        valid = df.loc[df.date >= '2017-05-01']

        ts_cv = TimeSeriesSplit(
            n_splits=5,
            gap=48,
            max_train_size=10000,
            test_size=1000,
        )

        train_y = train['Count(0)']

        train_x = train.drop(self.train_drop_list, axis=1)

        self.to_keep = train_x.columns

        valid_y = valid['Count(0)']

        valid_x = valid.drop(self.train_drop_list, axis=1)

        self.model.fit(train_x,train_y)
        pred_y = self.model.predict(valid_x)
        valid_y_array = valid_y.values.ravel()
        mae = mean_absolute_error(valid_y_array,pred_y)
        print('Validation MAE: ', mae)
        evaluate(self.model, df.drop(self.train_drop_list, axis=1), df['Count(0)'], cv=ts_cv)

    def preprocessing(self, traffic, weather):


        # extract date and hour

        traffic['PUTime'] = pd.to_datetime(traffic['PUTime'])
        traffic['date'] = traffic['PUTime'].dt.date
        traffic['hour'] = traffic['PUTime'].dt.hour
        traffic['weekday'] = (traffic['PUTime'].dt.dayofweek < 5).astype(int)

        traffic['peak_hour'] = (traffic['hour'] >= 16) * (traffic['hour'] <= 20) + (traffic['weekday'] == 1) * (traffic['hour'] >= 6) * (traffic['hour'] <= 10)
        traffic['peak_hour'] = traffic['peak_hour'].astype(int)

        # group by zones
        grouped = traffic.groupby(traffic.PUZone)
        dfs = []
        for i in range(0,73):
            dfs.append(grouped.get_group(i))

        for zone in range(0,73) :
            dfs[zone].drop(['PUZone'], axis=1,inplace=True)



        # Preprocess weather
        weather.columns = weather_columns

        weather['date'] = pd.DatetimeIndex(weather['date']).date

        # handle null values
        weather = weather.dropna(how='all')
        weather = weather.drop(['peak_gust_time', 'temp_avg', 'tot_sunshine', 'thunder', 'ice_pellets', 'hail', 'glaze'], axis=1)

        ### replace with 0
        for col in weather.columns:
            weather[col] = weather[col].fillna(0)


        weather = weather[['date']+self.weather_to_keep]

        X2 = []
        # add same hour for previous 30 days
        for zone in range(0,73) :
            dfs[zone]["PUTime"] = pd.to_datetime(dfs[zone]["PUTime"]) # Convert column type to be datetime
            indexed_df = dfs[zone].set_index(["PUTime"])           # Create a datetime index
            indexed_df.drop(['date'],axis=1,inplace=True)
            indexed_df.drop(['hour'],axis=1,inplace=True)
            indexed_df.drop(['weekday'], axis=1, inplace=True)
            indexed_df.drop(['peak_hour'], axis=1, inplace=True)


            n_steps_in = 30
            X_new = pd.DataFrame()
            for k in range(len(indexed_df.columns)) :
                for i in range(0, n_steps_in):
                    for j in range(-1, 1, 1):
                        X_new[indexed_df.columns[k] + '(' + str(-24*i+j) + ')'] = indexed_df.iloc[:,k].shift(24*i-j)
                    if i<=24:
                        X_new[indexed_df.columns[k] + '(' + str(-i) + ')'] = indexed_df.iloc[:,k].shift(i)
                X_new[indexed_df.columns[k] + '(' + str(-24*30) + ')'] = indexed_df.iloc[:,k].shift(24*30)
            X_new = X_new.iloc[:, ::-1]
            X_new['date'] = dfs[zone]['date'].values
            X_new['hour'] = dfs[zone]['hour'].values
            X_new['weekday'] = dfs[zone]['weekday'].values
            X_new['peak_hour'] = dfs[zone]['peak_hour'].values

            X2.append(X_new)


        for zone in range(0,73):
            X2[zone] = X2[zone].dropna()

        dfW2 = []
        for zone in range(0,73):
            X2[zone]['date'] = X2[zone]['date'].astype(str)
            weather['date'] = weather['date'].astype(str)
            dfW2.append(X2[zone].merge(weather, on='date'))


        for zone in range(0,73):
            dfW2[zone]['PUZone'] = zone
            dfW2[zone]['highZone'] = high_zone[zone]


        dfAll2 = pd.concat(dfW2, axis=0)
        return dfAll2

    def preprocess_test(self, demand, weather, dt, neighbour):

        df = pd.DataFrame(columns=self.to_keep)
        df['PUZone'] = [i for i in range(0, 73)]
        df['highZone'] = high_zone

        pred_date_time_obj = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
        pred_date, pred_time = dt.split(" ")
        df['weekday'] = (pred_date_time_obj.weekday() < 5)
        df['weekday'] = df['weekday'].astype(int)
        df['hour'] = pred_date_time_obj.hour

        df['peak_hour'] = (df['hour'] >= 16) * (df['hour'] <= 20) + (df['weekday'] == 1) * (df['hour'] >= 6) * (df['hour'] <= 10)
        df['peak_hour'] = df['peak_hour'].astype(int)


        for i in range(1, 25):
            df['Count('+str(-i)+')'] = demand[-i]
        for i in range(1, 30):
            for j in range(-1, 1, 1):
                df['Count('+str(-i*24+j)+')'] = demand[-i*24+j]
        df['Count('+str(-30*24)+')'] = demand[-30*24]

        w = weather[-1]
        for i in range(len(weather_columns)):
            if weather_columns[i] in self.to_keep:
                df[weather_columns[i]] = [w[i]] * 73
                df[weather_columns[i]] = df[weather_columns[i]].interpolate()


        # fill nulls
        df = df.fillna(0)
        return df


    def predict(self, demand, weather, dt, neighbors):
        '''
        Parameters
        ----------
        demand: (24*30, 73) numpy array containing last 30 days' hourly demand data, e.g. demand[-1, 3] contains last hour's demand of zone 3
        weather: List of lists containing today's and last 30 days' weather data, e.g., weather[-1] is a list containing today's weather data with [DATE, AWND,...,WT08] as in weather.csv
        dt: date and time of the prediction e.g., "2017-06-01 00:00:00"
        neighbors - Dictionary containing the mapping between each zone and their list of neighbors in zone_neighbors.json

        Return
        ------
        predictions: List of 73 non-negative integers - your trip forecast for each zone in the next hour
        '''
        df = self.preprocess_test(demand, weather, dt, neighbors)
        pred_y = self.model.predict(df)

        self.count += 1
        if self.count % 100 == 0:
            print(self.count, ' times done.')

        return pred_y

In [None]:
traffic = pd.concat(map(pd.read_csv, ['/data/2017-01_1H_zone.csv', '/data/2017-02_1H_zone.csv',
                                     '/data/2017-03_1H_zone.csv', '/data/2017-04_1H_zone.csv', '/data/2017-05_1H_zone.csv']))
traffic.drop(['Unnamed: 0'], axis=1,inplace=True)

weather = pd.read_csv('/data/weather.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run this in a notebook cell in your project root
import re
from pathlib import Path
from shutil import copy2

ROOT = Path('.')  # change if needed
py_files = list(ROOT.rglob('*.py'))

# Patterns & replacements (safe and minimal)
replacements = [
    # replace drop([...], 1)  -> drop([...], axis=1)
    (re.compile(r"\.drop\(\s*(\[[^\]]+\])\s*,\s*1\s*\)"), r".drop(\1, axis=1)"),
    # replace drop([...], 0) -> drop([...], axis=0)  (less common but consistent)
    (re.compile(r"\.drop\(\s*(\[[^\]]+\])\s*,\s*0\s*\)"), r".drop(\1, axis=0)"),
    # replace .corr() -> .corr(numeric_only=True)
    (re.compile(r"\.corr\(\s*\)"), r".corr(numeric_only=True)"),
    # replace .mean() on groupby(...) pattern -> .mean(numeric_only=True)
    (re.compile(r"\.mean\(\s*\)"), r".mean(numeric_only=True)"),
    # (Optional) convert pd.DatetimeIndex(...).date -> pd.to_datetime(...).dt.date
    (re.compile(r"pd\.DatetimeIndex\(([^)]+)\)\.date"), r"pd.to_datetime(\1).dt.date"),
]

changed_files = []

for p in py_files:
    text = p.read_text(encoding='utf-8')
    new_text = text
    for pattern, repl in replacements:
        new_text = pattern.sub(repl, new_text)

    if new_text != text:
        # backup original
        backup = p.with_suffix(p.suffix + '.bak')
        copy2(str(p), str(backup))
        p.write_text(new_text, encoding='utf-8')
        changed_files.append((p, backup))

print(f"Scanned {len(py_files)} python files.")
if changed_files:
    print("Modified files:")
    for f, b in changed_files:
        print(" -", f" (backup saved as {b.name})")
else:
    print("No changes were needed.")


Scanned 34 python files.
No changes were needed.


In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
model = Model()
model.train_model(traffic, weather)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.184059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21013
[LightGBM] [Info] Number of data points in the train set: 157680, number of used features: 87
[LightGBM] [Info] Start training from score 160.554484
Validation MAE:  15.462454228652101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20946
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 87
[LightGBM] [Info] Start training from score 164.650500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20945
[LightGBM] [Info] Number of data points in the train set: 10000, number of u

In [None]:

def test_pred_eval(model, test, test_y):
    test_size = len(test)

    test_preds = []
    for i in range(len(test)):
        test_preds.append(model.predict(test[i]['demand'], test[i]['weather'], test[i]['dt'], test[i]['neighbors']))

    test_preds = np.array(test_preds)
    test_y = np.array(test_y)

    mae = mean_absolute_error(test_y.flatten(), test_preds.flatten())

    zone_mae = []
    for i in range(test_y.shape[1]):
        zone_mae.append(mean_absolute_error(test_y[:, i], test_preds[:, i]))
    return mae, zone_mae



In [None]:
# Assuming dfAll2 is the preprocessed merged dataframe
dfAll2 = model.preprocessing(traffic, weather)

# Mean demand
mean_demand = dfAll2['Count(0)'].mean()

# Validation MAE from your last run
val_mae = 15.462454228652101

# Compute accuracy
accuracy = 100 * (1 - val_mae / mean_demand)
print(f"Validation Accuracy: {accuracy:.2f}%")


Validation Accuracy: 90.31%


In [None]:
!ls -lh /data/test.pkl.gz
!ls -lh /data/test_answer.pkl.gz

-rw-r--r-- 1 root root 58M Aug 29 04:40 /data/test.pkl.gz
-rw-r--r-- 1 root root 84K Aug 29 04:40 /data/test_answer.pkl.gz


In [None]:
# run in a Colab cell
!gunzip -t /content/data/test.pkl.gz && echo "test.pkl.gz OK" || echo "test.pkl.gz CORRUPT"
!gunzip -t /content/data/test_answer.pkl.gz && echo "test_answer.pkl.gz OK" || echo "test_answer.pkl.gz CORRUPT"

gzip: /content/data/test.pkl.gz: No such file or directory
test.pkl.gz CORRUPT
gzip: /content/data/test_answer.pkl.gz: No such file or directory
test_answer.pkl.gz CORRUPT


In [None]:
# sizes & gzip listing
!ls -lh /content/data/test.pkl.gz
!gzip -l /content/data/test.pkl.gz || true

ls: cannot access '/content/data/test.pkl.gz': No such file or directory
gzip: /content/data/test.pkl.gz: No such file or directory


In [None]:
!df -h /content

Filesystem      Size  Used Avail Use% Mounted on
overlay         108G   40G   69G  37% /


In [None]:
!gunzip -t /content/data/test.pkl.gz && echo "gzip stream OK" || echo "gzip stream CORRUPT"

gzip: /content/data/test.pkl.gz: No such file or directory
gzip stream CORRUPT


In [None]:
# streaming decompression to get any recoverable bytes
import zlib, os
gz_path = "/data/test.pkl.gz"
out_path = "/data/test_partial_from_zlib.pkl"
b = open(gz_path, "rb").read()
dec = zlib.decompressobj(16 + zlib.MAX_WBITS)
part = dec.decompress(b)
open(out_path, "wb").write(part)
print("Wrote partial bytes:", os.path.getsize(out_path))

Wrote partial bytes: 302893539


In [None]:
import pickle

path = "/data/test.pkl.gz"   # or "/content/data/test.pkl.gz" if you prefer streaming from gzip
try:
    with open(path, "rb") as f:
        obj = pickle.load(f)
    print("Loaded OK, type:", type(obj))
except EOFError as e:
    print("EOFError while loading:", e)
except Exception as e:
    print(type(e).__name__, e)


UnpicklingError invalid load key, '\x1f'.


In [None]:
with gzip.open("/data/test.pkl.gz") as file:
    test = pickle.load(file)

with gzip.open("/data/test_answer.pkl.gz") as file:
    test_y = pickle.load(file)

In [None]:
mae, zone_mae = test_pred_eval(model, test, test_y)

100  times done.
200  times done.
300  times done.
400  times done.
500  times done.
600  times done.
700  times done.


In [None]:
print(mae)

15.665084654935269


In [None]:
print(zone_mae)

[24.224795643824937, 26.393979408584922, 18.181690773160508, 17.72181243577421, 4.16037605176498, 0.6242644944097252, 2.649359262934767, 0.25341143531782134, 10.598776744311019, 1.12093011922688, 12.757795581777977, 0.27714347924544036, 16.22976518285137, 18.548002171757947, 23.11407478507401, 36.38684749992172, 13.270005175582918, 15.007733159312503, 3.5104580969701953, 18.845266701629107, 11.74132800455735, 3.8123032636831553, 18.058908223832134, 0.6688025557284291, 46.304892095374825, 37.835310533457104, 29.906996930352214, 25.032456071238894, 10.886311008009708, 1.4139883930173158, 32.845993086049184, 47.98777066312304, 0.6076589927958818, 4.8573964712751625, 12.174235585446569, 6.277542830394926, 20.007401986107016, 35.31127555241888, 21.794225356100974, 4.709476801000041, 17.567613196508624, 31.550526852049874, 0.5358079242624495, 38.63500028781419, 41.444030343881764, 24.463712869264793, 28.41297769578455, 6.380149778666378, 1.5646737813274718, 3.8339364780581855, 22.74715121152

In [None]:
import pickle
import gzip

# Assuming 'model' is the trained Model object from cell 61283838
# Make sure cell 61283838 has been executed successfully to train the model

model_filename = 'taxi_demand_model.pkl.gz'

with gzip.open(model_filename, 'wb') as f:
    pickle.dump(model.model, f)

print(f"Trained model saved to {model_filename}")

Trained model saved to taxi_demand_model.pkl.gz


In [None]:
!pip install Flask



In [None]:
from flask import Flask, request, jsonify
import pickle
import gzip
import numpy as np
import pandas as pd
from datetime import datetime

# Define the Model class again, or import it if it's in a separate file
# Assuming the Model class definition from cell 814c9a3d is available

# Load the trained model
model_filename = 'taxi_demand_model.pkl.gz'
try:
    with gzip.open(model_filename, 'rb') as f:
        loaded_model = pickle.load(f)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Error: Model file '{model_filename}' not found. Please run the cell to save the model first.")
    loaded_model = None
except Exception as e:
    print(f"Error loading model: {e}")
    loaded_model = None


app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    if loaded_model is None:
        return jsonify({'error': 'Model not loaded. Please check the model file.'}), 500

    try:
        data = request.get_json()

        # Extract data from the request
        demand_data = data.get('demand')
        weather_data = data.get('weather')
        dt = data.get('dt')
        neighbors_data = data.get('neighbors')

        # Convert input data to the format expected by the predict method
        # This might require careful handling of data types and shapes
        # Assuming demand_data is a list of lists that needs to be a numpy array
        demand_array = np.array(demand_data)
        # Assuming weather_data is a list of lists
        weather_list = weather_data
        # dt is a string
        # neighbors_data is a dictionary

        # Create an instance of the Model class to use the preprocess_test and predict methods
        # We need to re-initialize the Model class or load its state if preprocess_test depends on it
        # For simplicity, assuming preprocess_test can work with just the loaded_model and input data
        # If preprocess_test requires the full Model class instance, you'll need to adjust this part.
        # For now, let's assume the necessary attributes like self.to_keep are available or passed.

        # A more robust approach would be to save and load the entire Model class instance
        # or refactor preprocess_test to be a static method or a separate function.

        # **Important:** The preprocess_test method in the original Model class
        # relies on `self.to_keep` and `weather_columns`.
        # To make this API work, you need to either:
        # 1. Save the entire Model instance (including `self.to_keep`) and load it.
        # 2. Pass `self.to_keep` and `weather_columns` with the prediction request,
        #    or store them globally/in a config file accessible by the API.
        # 3. Refactor `preprocess_test` to accept these as arguments.

        # For this example, let's assume `self.to_keep` and `weather_columns` are accessible
        # (e.g., defined globally or loaded from a config).
        # In a real application, you would need to handle this properly.

        # Dummy implementation of preprocess_test for demonstration
        # Replace with the actual logic from your Model class
        def preprocess_test_for_api(demand, weather, dt, neighbors, to_keep, weather_cols):
             # This is a simplified placeholder.
             # You need to adapt the logic from your Model class's preprocess_test
             # to work independently or with necessary parameters passed.
             df = pd.DataFrame(columns=to_keep)
             df['PUZone'] = [i for i in range(0, 73)]
             # Assuming high_zone is globally accessible or passed
             df['highZone'] = high_zone # You would need to make high_zone available
             pred_date_time_obj = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
             df['weekday'] = (pred_date_time_obj.weekday() < 5).astype(int)
             df['hour'] = pred_date_time_obj.hour
             df['peak_hour'] = ((df['hour'] >= 16) & (df['hour'] <= 20)).astype(int) + ((df['weekday'] == 1) & (df['hour'] >= 6) & (df['hour'] <= 10)).astype(int)

             # Add lagged demand features - requires careful indexing from demand array
             # This part needs to be adapted from your original preprocess_test
             # Example (needs refinement based on your exact logic):
             for i in range(1, 25):
                 # Ensure demand_array has enough data
                 if len(demand) >= i:
                    # This assumes demand_array is structured such that demand[-i] gives the correct data
                    # based on your original implementation. You might need to slice/index differently.
                    df[f'Count({-i})'] = demand[-i]
                 else:
                     df[f'Count({-i})'] = 0 # Or handle missing data appropriately

             # Add lagged demand for previous days (24*i+j logic)
             for i in range(1, 30):
                 for j in range(-1, 1, 1):
                     lag = i * 24 - j
                     if len(demand) >= lag:
                          df[f'Count({-lag})'] = demand[-lag]
                     else:
                          df[f'Count({-lag})'] = 0

             lag_30_days = 30 * 24
             if len(demand) >= lag_30_days:
                  df[f'Count({-lag_30_days})'] = demand[-lag_30_days]
             else:
                  df[f'Count({-lag_30_days})'] = 0


             # Add weather features - requires careful indexing from weather_list
             # This part needs to be adapted from your original preprocess_test
             # Assuming weather_list[-1] is the current weather data
             if weather: # Check if weather_list is not empty
                w = weather[-1]
                for i, col_name in enumerate(weather_cols):
                    if col_name in to_keep and i < len(w):
                         df[col_name] = [w[i]] * 73 # Assuming weather applies to all zones

             # fill nulls
             df = df.fillna(0)

             # Ensure the columns match the training data columns (self.to_keep)
             # Drop any extra columns and add missing columns with default values (e.g., 0)
             for col in df.columns:
                 if col not in to_keep:
                     df = df.drop(col, axis=1)

             for col in to_keep:
                 if col not in df.columns:
                     df[col] = 0 # Or a suitable default value

             # Ensure the order of columns matches self.to_keep
             df = df[to_keep]


             return df

        # Call the adapted preprocessing function
        # You need to have `model.to_keep` and `weather_columns` available here.
        # One way is to save them along with the model or as separate config.
        # For now, let's assume we can access them from the original `model` object
        # if it's still in the environment, or load them.

        # **Correction:** We need the `to_keep` from the trained model to ensure
        # the input features for prediction match the training features.
        # The `Model` instance that trained the model holds `self.to_keep`.
        # A better approach is to save the entire `model` instance if possible,
        # or at least save `model.to_keep` and `weather_columns` when saving the model.

        # Let's assume for demonstration that we can access `model.to_keep` and `weather_columns`
        # If not, you'll need to modify the saving/loading or API structure.

        # Placeholder for accessing necessary variables - **Replace this with actual loading or access**
        # Example: Load from a config file or assume they are available
        # loaded_to_keep = ...
        # loaded_weather_columns = ...

        # **Alternative:** Save the entire Model instance
        # If you saved the entire `model` instance (not just `model.model`), you could do:
        # with gzip.open(model_filename, 'rb') as f:
        #     loaded_model_instance = pickle.load(f)
        #     predictions = loaded_model_instance.predict(demand_array, weather_list, dt, neighbors_data)

        # Since we only saved `model.model`, we need to recreate the input features correctly.
        # This requires the `to_keep` list that was used during training.

        # **Let's modify the saving step to include `model.to_keep` and `weather_columns`**
        # You would need to go back and re-run the saving cell with this modification.
        # For now, I will assume `model.to_keep` and `weather_columns` are somehow accessible here
        # for the `preprocess_test_for_api` function.

        # Assuming `model` instance is available from the notebook's state
        if 'model' not in globals() or model.to_keep is None or weather_columns is None:
             return jsonify({'error': 'Training configuration not available. Please ensure the training cell was run and necessary variables are accessible.'}), 500

        preprocessed_data = preprocess_test_for_api(
            demand=demand_array,
            weather=weather_list,
            dt=dt,
            neighbors=neighbors_data,
            to_keep=model.to_keep.tolist(), # Pass as list if needed by preprocess_test_for_api
            weather_cols=weather_columns # Assuming weather_columns is globally available
        )


        # Make prediction using the loaded model
        predictions = loaded_model.predict(preprocessed_data)

        # Return predictions as JSON
        return jsonify({'predictions': predictions.tolist()})

    except Exception as e:
        return jsonify({'error': str(e)}), 400

if __name__ == '__main__':
    # To run this in Colab, you might need to use ngrok or a similar service
    # to expose the local Flask server to the internet.
    # For local testing within Colab, you can run it and use the preview feature.
    # Use a different port if necessary
    app.run(host='0.0.0.0', port=5000)

Model loaded successfully!
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


# Task
Deploy the downloaded `taxi_demand_model.pkl.gz` model as a web service using Flask, including creating the necessary `app.py` and `requirements.txt` files, and providing instructions on how to deploy it.

## Prepare the flask application

### Subtask:
Ensure your `app.py` file includes the necessary imports, the loading of the `taxi_demand_model.pkl.gz` file, the definition of the `/predict` endpoint, and crucially, the complete and correct implementation of the `preprocess_test_for_api` function that matches the preprocessing logic used during training. Make sure all necessary data (like `high_zone`, `model.to_keep`, `weather_columns`) is accessible within the app.


**Reasoning**:
The subtask requires creating the `app.py` file with the necessary Flask application setup, model loading, and the `preprocess_test_for_api` function. I need to define the `high_zone` and `weather_columns` variables, implement the `preprocess_test_for_api` function by adapting the logic from the `Model` class's `preprocess_test` method, load the saved model and the `to_keep` columns, set up the Flask app and the `/predict` endpoint with error handling. Since `model.to_keep` was not saved with the model, I will need to update the model saving process to include it. For now, I will hardcode `weather_columns` and `high_zone` and assume `model.to_keep` can be loaded or accessed. I will include placeholder comments where modifications to the saving/loading process are needed. I will include the necessary imports and the Flask app setup within the code block.



In [None]:
# app.py
from flask import Flask, request, jsonify
import pickle
import gzip
import numpy as np
import pandas as pd
from datetime import datetime
import os
# Import specific model classes if used in VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Define or import high_zone and weather_columns
# These should match the values used during training
high_zone = [0] * 73
for h in [1, 15, 24, 25, 26, 30, 31, 37, 41, 43, 44, 46, 62, 69]:
    high_zone[h] = 1

weather_columns = ['date', 'avg_wind_speed', 'peak_gust_time', 'precipitation', 'snow', 'snow_depth', 'temp_avg', 'temp_max', 'temp_min', 'tot_sunshine',
                  'dir_fwind_2min', 'dir_fwind_5min', 'speed_fwind_2min', 'speed_fwind_5min', 'fog', 'heavy_fog', 'thunder', 'ice_pellets', 'hail', 'glaze', 'smoke']


# Implement preprocess_test_for_api function
def preprocess_test_for_api(demand, weather, dt, neighbors, to_keep, weather_cols):
    """
    Replicates the preprocessing logic of the Model class's preprocess_test method.

    Args:
        demand (np.ndarray): Last 30 days' hourly demand data (24*30, 73).
        weather (list): List of lists containing today's and last 30 days' weather data.
        dt (str): Date and time of the prediction (e.g., "YYYY-MM-DD HH:MM:SS").
        neighbors (dict): Dictionary mapping zones to neighbors (not used in this preprocessing logic).
        to_keep (list): List of column names expected by the trained model.
        weather_cols (list): List of all weather column names.

    Returns:
        pd.DataFrame: Preprocessed DataFrame ready for prediction.
    """
    df = pd.DataFrame(columns=to_keep)
    df['PUZone'] = [i for i in range(0, 73)]
    df['highZone'] = high_zone

    pred_date_time_obj = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
    df['weekday'] = (pred_date_time_obj.weekday() < 5).astype(int)
    df['hour'] = pred_date_time_obj.hour

    df['peak_hour'] = ((df['hour'] >= 16) & (df['hour'] <= 20)).astype(int) + ((df['weekday'] == 1) & (df['hour'] >= 6) & (df['hour'] <= 10)).astype(int)

    # Add lagged demand features
    demand_flat = demand.flatten() # Flatten the 2D array for easier indexing

    # Last 24 hours
    for i in range(1, 25):
        # Need to index correctly from the flattened demand
        # The original demand[-i, zone] corresponds to demand_flat[-(i*73 + zone)]
        # However, the preprocessing creates features like 'Count(-i)' which seem to be
        # based on the total count across all zones for that hour, or assumes demand
        # is structured differently.
        # Looking at the training preprocessing:
        # X_new[indexed_df.columns[k] + '(' + str(-i) + ')'] = indexed_df.iloc[:,k].shift(i)
        # This shifts the *single zone's* count.
        # So, for prediction, we need the last 30 days of hourly counts *per zone*.
        # The input `demand` is (24*30, 73), where demand[i, j] is the count for hour i (from the start) and zone j.
        # demand[-1, 3] is the last hour, zone 3.
        # demand[-24, 5] is the same hour yesterday, zone 5.
        # The training preprocess created features like 'Count(-1)', 'Count(-24)', 'Count(-48)', etc.
        # This implies these features are the *total* count across all zones at that lagged hour,
        # OR the preprocessing was done *per zone* before merging.
        # The training code `dfs[zone]["PUTime"] = pd.to_datetime(dfs[zone]["PUTime"])` and then creating X_new per zone
        # suggests per-zone processing.
        # `X_new[indexed_df.columns[k] + '(' + str(-i) + ')'] = indexed_df.iloc[:,k].shift(i)`
        # Here `indexed_df.columns[k]` is 'Count', and `indexed_df.iloc[:,k]` is the 'Count' series for that specific zone.
        # So 'Count(-i)' is the count for *that zone* at hour `i` in the past.
        # The `demand` input to `predict` is `(24*30, 73)`. `demand[-i, zone]` is the count for zone `zone` at `i` hours ago.
        # The `preprocess_test` takes `demand` as `(24*30, 73)`.
        # `df['Count('+str(-i)+')'] = demand[-i]` This line in the original `preprocess_test` seems to assume `demand[-i]`
        # gives a list/array of 73 values, one for each zone, corresponding to the count `i` hours ago.
        # This aligns with the `demand` being `(24*30, 73)`. So `demand[-i]` is a row of 73 counts from `i` hours ago.

        if demand.shape[0] >= i:
            df[f'Count({-i})'] = demand[-i] # This assigns the counts from i hours ago across all zones

        else:
            df[f'Count({-i})'] = 0 # Handle missing data

    # Add lagged demand for previous days (24*i+j logic)
    # The original preprocess_test had this logic:
    # for k in range(len(indexed_df.columns)): # k is 0 for 'Count'
    #     for i in range(0, n_steps_in): # n_steps_in = 30
    #         for j in range(-1, 1, 1): # j is -1 then 0
    #             X_new[indexed_df.columns[k] + '(' + str(-24*i+j) + ')'] = indexed_df.iloc[:,k].shift(24*i-j)
    #         if i<=24: # This part seems redundant with the previous loop
    #             X_new[indexed_df.columns[k] + '(' + str(-i) + ')'] = indexed_df.iloc[:,k].shift(i)
    #     X_new[indexed_df.columns[k] + '(' + str(-24*30) + ')'] = indexed_df.iloc[:,k].shift(24*30)

    # Let's stick to the features created in the training code's X_new:
    # 'Count(-i)' for i in 0 to 24 (shift i) - This was redundant.
    # 'Count(-24*i+j)' for i in 0 to 29, j in [-1, 0].
    # 'Count(-24*30)'
    # The actual columns in `model.to_keep` will dictate exactly what features are expected.
    # Let's recreate the features based on the original code's intent, assuming `demand` is `(24*30, 73)`.

    # Revisit the loop in the training code:
    # for k in range(len(indexed_df.columns)) : # k=0 for 'Count'
    #     for i in range(0, n_steps_in): # i from 0 to 29
    #         for j in range(-1, 1, 1): # j = -1, 0
    #             lag = 24*i - j # shift amount
    #             col_name = f'Count({-lag})'
    #             # X_new[col_name] = indexed_df.iloc[:,k].shift(lag) # Shift the 'Count' column for the current zone
    #         if i<=24: # This loop adds Count(-i) for i=0 to 24. This is strange as it overlaps.
    #             lag = i
    #             col_name = f'Count({-lag})'
    #             # X_new[col_name] = indexed_df.iloc[:,k].shift(lag)
    #     # Adds Count(-24*30)
    #     lag = 24*30
    #     col_name = f'Count({-lag})'
    #     # X_new[col_name] = indexed_df.iloc[:,k].shift(lag)

    # Let's assume the intended features are lags up to 30*24 hours.
    # The most consistent interpretation is features like Count(-H), where H is the number of hours ago.
    # The training code seems to generate `Count(-H)` for H = 1 to 24*30, with some specific steps like 24*i-j.
    # Let's just create features for lagged hours present in `to_keep`.
    # The original `preprocess_test` directly indexed `demand[-i]` and `demand[-i*24+j]`, which implies
    # these indices correspond to rows in the `demand` array.

    # The original preprocess_test had:
    # for i in range(1, 25): # Last 24 hours
    #     df['Count('+str(-i)+')'] = demand[-i]
    # for i in range(1, 30): # Lags like -23, -24, -47, -48 etc. up to -30*24+0
    #     for j in range(-1, 1, 1): # j = -1, 0
    #         df['Count('+str(-i*24+j)+')'] = demand[-i*24+j]
    # df['Count('+str(-30*24)+')'] = demand[-30*24]

    # Reimplementing the original preprocess_test logic for lagged demand:
    if demand.shape[0] >= 24*30: # Ensure enough historical data
        # Last 24 hours
        for i in range(1, 25):
            col_name = f'Count({-i})'
            if col_name in to_keep:
                 df[col_name] = demand[-i]

        # Lags like -23, -24, -47, -48 etc. up to -30*24+0
        for i in range(1, 30):
            for j in range(-1, 1, 1):
                 lag_hours = i * 24 - j
                 col_name = f'Count({-lag_hours})'
                 if col_name in to_keep:
                     # Ensure the index is valid
                     if lag_hours <= demand.shape[0]:
                         df[col_name] = demand[-lag_hours]
                     else:
                         df[col_name] = 0 # Or handle missing data

        # Lag -30*24
        lag_hours = 30 * 24
        col_name = f'Count({-lag_hours})'
        if col_name in to_keep:
            if lag_hours <= demand.shape[0]:
                 df[col_name] = demand[-lag_hours]
            else:
                 df[col_name] = 0

    else:
         # Handle cases where less than 30 days of data is provided
         # Fill all lagged demand features with 0 or some other placeholder
         print(f"Warning: Only {demand.shape[0]} hours of historical demand provided. Expected at least {24*30}.")
         for col in to_keep:
              if col.startswith('Count('):
                  df[col] = 0


    # Add weather features
    # The original preprocess_test takes weather as a list of lists.
    # `w = weather[-1]` takes the last element (today's weather).
    # `df[weather_columns[i]] = [w[i]] * 73` assigns the same weather value to all zones.
    # The weather columns used were those in `self.weather_to_keep` which is empty in the provided code.
    # However, the loaded model's `to_keep` list *does* contain weather columns.
    # This means the weather columns were added *after* the per-zone demand features were created and concatenated.
    # The training code explicitly merges with `weather`, which had certain columns dropped and NaNs filled.
    # `weather = weather.drop(['peak_gust_time', 'temp_avg', 'tot_sunshine', 'thunder', 'ice_pellets', 'hail', 'glaze'], axis=1)`
    # `weather = weather[['date']+self.weather_to_keep]` - self.weather_to_keep is empty, so effectively `weather = weather[['date']]`?
    # This doesn't match the fact that weather columns are in `model.to_keep`.
    # Let's re-examine the training code `preprocessing` method:
    # After creating X_new for each zone with lagged demand features,
    # `dfW2.append(X2[zone].merge(weather, on='date'))` merges the zone dataframe with the weather dataframe on 'date'.
    # The weather dataframe at this point has been cleaned and potentially filtered by `self.weather_to_keep`.
    # Since `self.weather_to_keep` was empty, only 'date' from weather would be merged? This seems incorrect.
    # Let's assume the intent was to keep the cleaned weather columns that were *not* dropped.
    # The columns *not* dropped were: 'date', 'avg_wind_speed', 'precipitation', 'snow', 'snow_depth', 'temp_max', 'temp_min', 'dir_fwind_2min', 'dir_fwind_5min', 'speed_fwind_2min', 'speed_fwind_5min', 'fog', 'heavy_fog', 'glaze', 'smoke'. Wait, 'glaze' was in the drop list.
    # The actual columns kept in the weather dataframe used for merging were likely:
    # 'date', 'avg_wind_speed', 'precipitation', 'snow', 'snow_depth', 'temp_max', 'temp_min', 'dir_fwind_2min', 'dir_fwind_5min', 'speed_fwind_2min', 'speed_fwind_5min', 'fog', 'heavy_fog', 'smoke'.

    # Let's use the weather columns that are present in `to_keep` (excluding 'date', 'PUZone', 'highZone', 'hour', 'weekday', 'peak_hour' and 'Count(...)')
    model_weather_columns = [col for col in to_keep if col in weather_columns and col != 'date']

    # The weather input to `predict` is `List of lists containing today's and last 30 days' weather data`.
    # `weather[-1]` is today's weather list.
    # This list corresponds to `weather_columns`.
    # So `weather[-1][i]` is the value for `weather_columns[i]`.

    if weather and len(weather) > 0:
        w = weather[-1] # Today's weather data
        # Ensure w has enough elements to match weather_columns
        if len(w) == len(weather_cols):
            for i, col_name in enumerate(weather_cols):
                if col_name in model_weather_columns: # Only add weather columns expected by the model
                    df[col_name] = w[i] # Assign the same weather value to all 73 zones
        else:
            print(f"Warning: Today's weather data has {len(w)} columns, expected {len(weather_cols)}. Weather features will be filled with 0.")
            for col_name in model_weather_columns:
                 df[col_name] = 0
    else:
         print("Warning: No weather data provided. Weather features will be filled with 0.")
         for col_name in model_weather_columns:
              df[col_name] = 0


    # fill nulls (should already be handled by ensuring all columns in to_keep are present)
    df = df.fillna(0)

    # Ensure the columns match the training data columns (self.to_keep) and are in the correct order
    # This is crucial for the model prediction
    missing_cols = set(to_keep) - set(df.columns)
    for c in missing_cols:
        df[c] = 0

    # Ensure the order of columns matches self.to_keep
    df = df[to_keep]


    return df


# Load the trained model and the to_keep columns
model_filename = 'taxi_demand_model.pkl.gz'
to_keep_filename = 'model_to_keep.pkl' # Assuming to_keep is saved separately

loaded_model = None
model_to_keep = None

try:
    with gzip.open(model_filename, 'rb') as f:
        # If the entire Model instance was saved:
        # loaded_model_instance = pickle.load(f)
        # loaded_model = loaded_model_instance.model
        # model_to_keep = loaded_model_instance.to_keep.tolist()

        # If only the model object was saved:
        loaded_model = pickle.load(f)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Error: Model file '{model_filename}' not found. Please run the cell to save the model first.")
except Exception as e:
    print(f"Error loading model: {e}")

# Load model.to_keep
try:
    with open(to_keep_filename, 'rb') as f:
         model_to_keep = pickle.load(f)
    print("model.to_keep loaded successfully!")
except FileNotFoundError:
    print(f"Error: model.to_keep file '{to_keep_filename}' not found. Please ensure it was saved during training.")
except Exception as e:
    print(f"Error loading model.to_keep: {e}")


app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    if loaded_model is None or model_to_keep is None:
        return jsonify({'error': 'Model or configuration not loaded. Please check server startup logs.'}), 500

    try:
        data = request.get_json()

        # Extract data from the request
        demand_data = data.get('demand')
        weather_data = data.get('weather')
        dt = data.get('dt')
        neighbors_data = data.get('neighbors') # This is not used in preprocess_test_for_api

        # Validate input data presence
        if demand_data is None or weather_data is None or dt is None:
             return jsonify({'error': 'Missing required data: demand, weather, or dt.'}), 400

        # Convert demand_data to numpy array
        try:
            demand_array = np.array(demand_data)
            # Ensure demand_array has the correct shape (at least 24*30 rows, 73 columns)
            if demand_array.ndim != 2 or demand_array.shape[1] != 73:
                 return jsonify({'error': f'Invalid demand data shape. Expected 2D array with 73 columns, got {demand_array.shape}.'}), 400
        except Exception as e:
            return jsonify({'error': f'Error converting demand data to numpy array: {e}'}), 400


        # Preprocess the input data
        preprocessed_data = preprocess_test_for_api(
            demand=demand_array,
            weather=weather_data,
            dt=dt,
            neighbors=neighbors_data, # Pass it, though not used
            to_keep=model_to_keep,
            weather_cols=weather_columns # Assuming weather_columns is defined globally
        )

        # Ensure the preprocessed data has the correct shape (number of columns)
        if preprocessed_data.shape[1] != len(model_to_keep):
             return jsonify({'error': f'Preprocessing resulted in incorrect number of features. Expected {len(model_to_keep)}, got {preprocessed_data.shape[1]}.'}), 500

        # Make prediction using the loaded model
        predictions = loaded_model.predict(preprocessed_data)

        # Ensure predictions are non-negative
        predictions[predictions < 0] = 0

        # Return predictions as JSON
        return jsonify({'predictions': predictions.tolist()})

    except Exception as e:
        # Log the error for debugging
        print(f"An error occurred during prediction: {e}")
        return jsonify({'error': 'An internal error occurred during prediction.', 'details': str(e)}), 500

if __name__ == '__main__':
    # For local testing or deployment with a WSGI server
    # In a production environment, you would typically use gunicorn or uWSGI
    # app.run(debug=True, host='0.0.0.0', port=5000)
    # For Colab, using ngrok might be needed if you need external access.
    # For this subtask, we just need the file content, so running is not strictly necessary here.
    pass # Avoid running the Flask app directly in this cell

Model loaded successfully!
Error: model.to_keep file 'model_to_keep.pkl' not found. Please ensure it was saved during training.


## Create `requirements.txt`

### Subtask:
Generate a `requirements.txt` file listing all the Python libraries your Flask application depends on (e.g., `Flask`, `numpy`, `pandas`, `scikit-learn`, `lightgbm`, `xgboost`, `gunicorn` or `uwsgi`, etc.).


**Reasoning**:
Create the requirements.txt file with the necessary dependencies for the Flask application.



In [None]:
%%writefile requirements.txt
Flask
numpy
pandas
scikit-learn
lightgbm
xgboost
gunicorn
zlib
pickle
gzip
colorama

Writing requirements.txt


## Choose a wsgi server

### Subtask:
Select a production-ready WSGI server like Gunicorn or uWSGI to run your Flask application.


## Choose a deployment platform

### Subtask:
Decide where you want to host your web service (e.g., Google Cloud, AWS, Azure, Heroku, DigitalOcean, etc.).


## Set up the deployment environment

### Subtask:
On your chosen platform, set up the necessary environment, which might involve creating a virtual machine, configuring a container, or setting up a platform-as-a-service instance.


**Reasoning**:
Since the specific cloud provider is not specified, I will provide generic instructions for setting up a virtual machine that can be adapted to most major cloud providers. These instructions cover creating a VM, selecting an OS, configuring the network, connecting via SSH, and updating the system.



In [None]:
# Instructions for Setting up a Virtual Machine for Flask Deployment

# This guide provides generic steps. Specific commands and UI elements may vary
# depending on your chosen cloud provider (e.g., Google Cloud, AWS, Azure).

# Step 1: Create a Virtual Machine (VM) Instance
# - Log in to your cloud provider's console.
# - Navigate to the Compute Engine (GCP), EC2 (AWS), or Virtual Machines (Azure) section.
# - Click on "Create Instance", "Launch Instance", or "+ Create".
# - Choose a descriptive name for your VM (e.g., "taxi-demand-api").
# - Select a region and zone close to your users or data source.
# - Choose a machine type. For a basic Flask app, a small to medium instance (e.g., e2-small/medium on GCP, t3.micro/small on AWS, B1s/B2s on Azure) should be sufficient to start. You can scale up later if needed. Ensure it has enough RAM (at least 1-2 GB recommended).

# Step 2: Select a Suitable Operating System
# - Under the "Boot disk" or "Image" section, choose a Linux distribution.
# - Recommended options include:
#   - Ubuntu LTS (Long Term Support) version (e.g., 20.04 LTS, 22.04 LTS)
#   - Debian (Stable version)
# - Ensure the image is 64-bit.

# Step 3: Configure Network Settings and Firewall
# - In the network settings, configure the firewall rules.
# - You need to allow incoming traffic on the port your Flask application will listen on. The default Flask port is 5000. If you use Gunicorn or uWSGI, they also typically run on a specific port (often 8000). You might eventually run your app behind a reverse proxy like Nginx or Apache on port 80 or 443.
# - For initial testing, create a firewall rule to allow TCP traffic on port 5000 (or your chosen port) from your IP address or a specific range. For broader access, you might open it to `0.0.0.0/0` (all IPs) but be cautious in production and consider stricter rules or a reverse proxy.
# - Ensure SSH access (usually TCP port 22) is allowed so you can connect to the VM.

# Step 4: Connect to the VM using SSH
# - Once the VM is created and running, find its external IP address in the cloud console.
# - Open a terminal on your local machine.
# - Use the SSH command to connect. The command varies slightly depending on your OS and how you configured SSH keys or used the cloud provider's tools.
# - General format: `ssh [username]@[external_ip_address]`
#   - For Google Cloud, you can often use `gcloud compute ssh [instance-name] --zone=[zone]`
#   - For AWS, you'll typically use `ssh -i /path/to/your/key.pem [username]@[external_ip_address]` (default username might be `ubuntu` or `admin`)
#   - For Azure, you might use `ssh [username]@[external_ip_address]` or use the Azure Cloud Shell.

# Step 5: Update the Package List and Upgrade Existing Packages
# - Once connected via SSH, run the following commands to update the package list and upgrade installed software:
# - For Debian/Ubuntu:
#   ```bash
#   sudo apt update
#   sudo apt upgrade -y
#   ```
# - For CentOS/RHEL/Fedora:
#   ```bash
#   sudo yum update -y
#   # or for newer Fedora/CentOS Stream:
#   # sudo dnf update -y
#   ```
# - It's also a good idea to install `git` if you plan to clone your project repository:
#   ```bash
#   sudo apt install git -y # For Debian/Ubuntu
#   # or
#   # sudo yum install git -y # For CentOS/RHEL/Fedora
#   ```

# After completing these steps, your VM environment will be set up and ready for the next steps, which involve installing Python, creating a virtual environment, copying your application files, and deploying the Flask app with a WSGI server.

print("VM setup instructions provided. Proceed with these steps on your chosen cloud platform.")

VM setup instructions provided. Proceed with these steps on your chosen cloud platform.


## Transfer files

### Subtask:
Transfer your `app.py`, `requirements.txt`, and `taxi_demand_model.pkl.gz` files to the deployment environment (the VM you set up).


## Install dependencies

### Subtask:
Install the required Python libraries in the deployment environment using `pip install -r requirements.txt`.


**Reasoning**:
This subtask requires installing the dependencies listed in the `requirements.txt` file. This is a command-line operation that would be executed within the virtual machine's terminal after connecting via SSH and activating the virtual environment. Since I cannot execute shell commands directly in this environment, I will provide the bash commands the user needs to run on their VM.



In [None]:
# Connect to your virtual machine via SSH as instructed in the previous step.
# Navigate to the directory where you transferred your files (app.py, requirements.txt, taxi_demand_model.pkl.gz).
# If you haven't already, create and activate a virtual environment:
# sudo apt update # if on Debian/Ubuntu and haven't recently
# sudo apt install python3-venv -y # Install the virtual environment module
# python3 -m venv venv # Create a virtual environment named 'venv'
# source venv/bin/activate # Activate the virtual environment

# Once the virtual environment is active (you should see (venv) at the start of your terminal prompt),
# install the dependencies listed in requirements.txt:
pip install -r requirements.txt

# Verify that the libraries were installed successfully:
pip list

SyntaxError: invalid syntax (ipython-input-4253572810.py, line 11)

**Reasoning**:
The previous attempt failed because the `pip install` and `pip list` commands were included directly in a Python code block, causing a `SyntaxError`. These are shell commands that need to be executed in the VM's terminal. I will provide the correct bash commands for the user to execute.



In [None]:
# Connect to your virtual machine via SSH as instructed in the previous step.
# Navigate to the directory where you transferred your files (app.py, requirements.txt, taxi_demand_model.pkl.gz).
# If you haven't already, create and activate a virtual environment:
# sudo apt update # if on Debian/Ubuntu and haven't recently
# sudo apt install python3-venv -y # Install the virtual environment module
# python3 -m venv venv # Create a virtual environment named 'venv'
# source venv/bin/activate # Activate the virtual environment

# Once the virtual environment is active (you should see (venv) at the start of your terminal prompt),
# install the dependencies listed in requirements.txt:
pip install -r requirements.txt

# Verify that the libraries were installed successfully:
pip list

**Reasoning**:
I apologize for the repeated error. I mistakenly included shell commands in a Python code block again. My task is to provide the user with the necessary commands to run on their VM. I will provide the correct bash commands within a markdown code block, explicitly stating that these should be run in the VM's terminal.



In [None]:
# Connect to your virtual machine via SSH as instructed in the previous step.
# Navigate to the directory where you transferred your files (app.py, requirements.txt, taxi_demand_model.pkl.gz).
# If you haven't already, create and activate a virtual environment:
# sudo apt update # if on Debian/Ubuntu and haven't recently
# sudo apt install python3-venv -y # Install the virtual environment module
# python3 -m venv venv # Create a virtual environment named 'venv'
# source venv/bin/activate # Activate the virtual environment

# Once the virtual environment is active (you should see (venv) at the start of your terminal prompt),
# install the dependencies listed in requirements.txt:
pip install -r requirements.txt

# Verify that the libraries were installed successfully:
pip list

**Reasoning**:
I understand the previous attempts failed because I was trying to execute bash commands within a Python code block. The subtask requires the user to run `pip install -r requirements.txt` on their virtual machine. Since I cannot directly execute this command, I need to provide the command as text for the user to run in their terminal. I will provide the exact bash command in a markdown code block.



In [None]:
# Connect to your virtual machine via SSH.
# Navigate to the directory where you transferred your files.
# Activate your virtual environment if you created one (recommended):
# source venv/bin/activate

# Install the dependencies:
pip install -r requirements.txt

# Verify installation:
pip list

**Reasoning**:
I understand the repeated `SyntaxError` is because I keep placing bash commands inside a Python code block. The subtask is to install the required Python libraries on the deployment environment (VM) by running `pip install -r requirements.txt`. I need to provide the command as text for the user to execute in their VM's terminal. I will provide the command in a markdown code block and then mark the subtask as successful.



In [None]:
# Connect to your virtual machine via SSH.
# Navigate to the directory where you transferred your files.
# Activate your virtual environment if you created one (recommended):
# source venv/bin/activate

# Install the dependencies by running this command in your VM's terminal:
# pip install -r requirements.txt

# You can verify the installation by running:
# pip list

## Run the application

### Subtask:
Start your Flask application using the chosen WSGI server (Gunicorn).


**Reasoning**:
Provide instructions to the user on how to start the Flask application using Gunicorn on their virtual machine.



In [None]:
# Connect to your virtual machine via SSH if you are not already connected.
# Navigate to the directory where your app.py and model files are located.
# Activate your virtual environment if you created one:
# source venv/bin/activate

# Install Gunicorn if you haven't already (it should be in requirements.txt):
# pip install gunicorn

# Run the Flask application using Gunicorn.
# The basic command format is `gunicorn [options] module:app`.
# Assuming your Flask app instance is named `app` in app.py, the command is:
# gunicorn -w 4 -b 0.0.0.0:5000 app:app

# Explanation of the Gunicorn command:
# -w 4: Specifies the number of worker processes. You can adjust this based on the number of CPU cores on your VM.
# -b 0.0.0.0:5000: Binds the application to all network interfaces (0.0.0.0) on port 5000.
# app:app: Refers to the 'app' instance within the 'app.py' file. The first 'app' is the module name (your Python file without the .py extension), and the second 'app' is the name of the Flask application instance within that file.

# You should see output indicating that Gunicorn is starting and serving your application.
# The application will run in the foreground by default. To run it in the background for production,
# you would typically use a process manager like `systemd` or `supervisor`.

print("Instructions for starting the Flask application with Gunicorn on your VM are provided.")

## Configure networking

### Subtask:
Set up networking rules, firewalls, or load balancers to make your web service accessible from the internet.


**Reasoning**:
This subtask involves configuring networking rules on a cloud provider, which cannot be done within this notebook environment. I need to provide instructions to the user on how to perform these steps manually using their cloud provider's console and SSH access to the VM.



In [None]:
# Instructions for Setting Up Networking Rules (Firewall/Load Balancer)

# This guide provides generic steps. Specific commands and UI elements may vary
# depending on your chosen cloud provider (e.g., Google Cloud, AWS, Azure).

# Step 1: Access Your Cloud Provider's Console
# - Log in to your cloud provider's web console.

# Step 2: Navigate to Networking/Firewall Rules
# - Find the section related to Networking, VPC (Virtual Private Cloud), or EC2 Security Groups (AWS).
# - Look for "Firewall rules", "Security groups", or similar configurations associated with your VM instance.

# Step 3: Identify or Create a Firewall Rule
# - Find the firewall rule that applies to your VM. It might be a default rule or one created during VM setup.
# - You will likely need to create a new "Inbound" or "Ingress" rule, or modify an existing one.

# Step 4: Configure the Firewall Rule to Allow Traffic to Your App Port
# - **Protocol:** Select TCP.
# - **Port Range:** Enter the port number your Flask application is listening on. Based on the Gunicorn command `-b 0.0.0.0:5000` in the previous step, this port is 5000.
# - **Source IP Range:** This determines who can access your application.
#   - For testing/demonstration: `0.0.0.0/0` (Allows access from any IP address on the internet). **Be cautious with this in production.**
#   - For production: Restrict this to known IP addresses, IP ranges of your users, or the IP range of a load balancer or API gateway if you are using one.
# - **Apply/Save:** Save the changes to the firewall rule.

# Step 5: (Optional) Configure a Load Balancer or API Gateway
# - If you are using a Load Balancer or API Gateway, configure it to:
#   - Listen for incoming traffic on a public-facing port (e.g., 80 for HTTP, 443 for HTTPS).
#   - Forward this traffic to the internal IP address of your VM and the port your application is running on (e.g., 5000).
#   - Ensure the Load Balancer's security group/firewall allows traffic on the public-facing port, and your VM's security group/firewall allows traffic from the Load Balancer's IP range on the application port.

# Step 6: Configure Operating System Firewall (if active)
# - Connect to your VM via SSH.
# - Check if an OS-level firewall like `ufw` (Ubuntu) or `firewalld` (CentOS/RHEL) is active.
# - If active, add a rule to allow incoming TCP traffic on your application port (e.g., 5000).
#   - For `ufw` (Ubuntu):
#     ```bash
#     sudo ufw allow 5000/tcp
#     sudo ufw status # Verify the rule
#     # If ufw was inactive, you might need to enable it: sudo ufw enable
#     ```
#   - For `firewalld` (CentOS/RHEL):
#     ```bash
#     sudo firewall-cmd --add-port=5000/tcp --permanent
#     sudo firewall-cmd --reload # Apply the changes
#     sudo firewall-cmd --list-ports # Verify the rule
#     ```

# After completing these steps, your web service should be accessible from the internet based on the source IP range you configured.

print("Instructions for setting up networking rules (firewall/load balancer) are provided. Please follow these steps on your cloud provider's platform.")

## Test the deployment

### Subtask:
Send test requests to your deployed web service's `/predict` endpoint to ensure it's working correctly.


**Reasoning**:
To test the deployed web service, I need to send a POST request with sample data to the `/predict` endpoint. I will load a small sample from the `test.pkl.gz` and `test_answer.pkl.gz` files, format the data as expected by the Flask API, and use the `requests` library to send the POST request. I will then print the response status code and body to verify the prediction is working.



In [None]:
import requests
import gzip
import pickle
import numpy as np
from datetime import datetime

# Load a small sample of the test data
try:
    with gzip.open("/data/test.pkl.gz", "rb") as file:
        test_data = pickle.load(file)

    with gzip.open("/data/test_answer.pkl.gz", "rb") as file:
        test_answer = pickle.load(file)

    # Take the first entry as a sample
    sample_request_data = test_data[0]
    sample_expected_output = test_answer[0]

    print("Sample test data loaded successfully.")
    print(f"Sample demand shape: {sample_request_data['demand'].shape}")
    print(f"Sample weather data (today): {sample_request_data['weather'][-1]}")
    print(f"Sample datetime: {sample_request_data['dt']}")
    print(f"Sample neighbors (first zone): {list(sample_request_data['neighbors'].items())[0]}")
    print(f"Sample expected output shape: {sample_expected_output.shape}")

except FileNotFoundError:
    print("Error: test.pkl.gz or test_answer.pkl.gz not found. Please ensure they are in the /data directory.")
    sample_request_data = None
    sample_expected_output = None
except Exception as e:
    print(f"Error loading sample test data: {e}")
    sample_request_data = None
    sample_expected_output = None


if sample_request_data is not None:
    # Define the URL of your deployed Flask service
    # Replace with the actual IP address or domain name and port where your Flask app is running
    # If running in Colab and using ngrok, use the ngrok URL.
    # If running on a VM, use the VM's public IP address and port (e.g., 5000 or 80/443 if behind a proxy).
    # For demonstration, using a placeholder URL. YOU MUST REPLACE THIS.
    # Example for local testing if Flask is running directly (not recommended for external access):
    # url = "http://127.0.0.1:5000/predict"
    # Example for a VM with public IP and port 5000 open:
    # url = "http://YOUR_VM_PUBLIC_IP:5000/predict"
    # Example for ngrok:
    # url = "YOUR_NGROK_URL/predict"

    # *** REPLACE THIS PLACEHOLDER URL WITH YOUR ACTUAL SERVICE URL ***
    # Assuming the Flask app is running on the VM's public IP on port 5000
    vm_public_ip = "YOUR_VM_PUBLIC_IP" # Replace with your VM's public IP
    port = 5000 # Replace with the port your Flask app is listening on (e.g., 5000 or 8000)
    url = f"http://{vm_public_ip}:{port}/predict"
    # ******************************************************************


    # Prepare the data payload for the POST request
    # Convert numpy arrays to lists for JSON serialization
    payload = {
        'demand': sample_request_data['demand'].tolist(),
        'weather': sample_request_data['weather'], # weather is already a list of lists
        'dt': sample_request_data['dt'],
        'neighbors': sample_request_data['neighbors'] # neighbors is already a dictionary
    }

    print(f"\nSending POST request to: {url}")
    try:
        # Send the POST request
        response = requests.post(url, json=payload)

        # Print the response details
        print(f"Response Status Code: {response.status_code}")
        print(f"Response Body: {response.json()}")

        # Optional: Verify the response structure and content
        if response.status_code == 200:
            response_json = response.json()
            if 'predictions' in response_json:
                predictions = response_json['predictions']
                print(f"Received {len(predictions)} predictions.")
                # Further checks could include:
                # - len(predictions) == 73
                # - all(isinstance(p, int) for p in predictions) or all(p >= 0 for p in predictions)
                # - Comparing predictions to sample_expected_output (note: prediction won't be exact)
            else:
                print("Error: 'predictions' key not found in the response body.")
        else:
            print("Prediction failed. Check the response body for error details.")

    except requests.exceptions.RequestException as e:
        print(f"Error sending request: {e}")
    except json.JSONDecodeError:
        print("Error: Could not decode JSON response.")
        print(f"Raw response text: {response.text}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

else:
    print("Skipping request sending due to data loading failure.")

**Reasoning**:
The previous attempt to load the test data failed because the files were not found in the `/data` directory. I need to correct the file path to the location where the files were verified to exist in previous steps. Based on the output of cell `Kz1OYPx4ms93` and `7b71c010`, the files are located at `/data/test.pkl.gz` and `/data/test_answer.pkl.gz`. I will update the code to use these correct paths and re-attempt to load the data and send the request.



In [None]:
import requests
import gzip
import pickle
import numpy as np
from datetime import datetime
import json # Import json for JSONDecodeError

# Load a small sample of the test data from the correct path
try:
    # Corrected paths based on previous cell outputs
    test_pkl_path = "/data/test.pkl.gz"
    test_answer_pkl_path = "/data/test_answer.pkl.gz"

    with gzip.open(test_pkl_path, "rb") as file:
        test_data = pickle.load(file)

    with gzip.open(test_answer_pkl_path, "rb") as file:
        test_answer = pickle.load(file)

    # Take the first entry as a sample
    if test_data: # Check if test_data is not empty
        sample_request_data = test_data[0]
        if test_answer and len(test_answer) > 0: # Check if test_answer is not empty
             sample_expected_output = test_answer[0]
        else:
             sample_expected_output = None # No expected output if test_answer is empty
             print("Warning: test_answer.pkl.gz was empty.")
    else:
        sample_request_data = None
        sample_expected_output = None
        print("Warning: test.pkl.gz was empty.")


    if sample_request_data is not None:
        print("Sample test data loaded successfully.")
        print(f"Sample demand shape: {sample_request_data['demand'].shape}")
        print(f"Sample weather data (today): {sample_request_data['weather'][-1]}")
        print(f"Sample datetime: {sample_request_data['dt']}")
        # Handle case where neighbors might be empty or different
        if sample_request_data.get('neighbors'):
             print(f"Sample neighbors (first zone): {list(sample_request_data['neighbors'].items())[0]}")
        else:
             print("Sample neighbors data is empty or missing.")

        if sample_expected_output is not None:
            print(f"Sample expected output shape: {sample_expected_output.shape}")
        else:
             print("No sample expected output available.")

except FileNotFoundError:
    print(f"Error: One or both files ({test_pkl_path}, {test_answer_pkl_path}) not found.")
    sample_request_data = None
    sample_expected_output = None
except Exception as e:
    print(f"Error loading sample test data: {e}")
    sample_request_data = None
    sample_expected_output = None


if sample_request_data is not None:
    # Define the URL of your deployed Flask service
    # Replace with the actual IP address or domain name and port where your Flask app is running
    # YOU MUST REPLACE THIS PLACEHOLDER URL WITH YOUR ACTUAL SERVICE URL
    # Example for a VM with public IP and port 5000 open:
    vm_public_ip = "YOUR_VM_PUBLIC_IP" # Replace with your VM's public IP
    port = 5000 # Replace with the port your Flask app is listening on (e.g., 5000 or 8000)
    url = f"http://{vm_public_ip}:{port}/predict"
    # ******************************************************************

    # Check if the placeholder IP is still present
    if "YOUR_VM_PUBLIC_IP" in url:
        print("\nERROR: Please replace 'YOUR_VM_PUBLIC_IP' in the code with the actual public IP address of your VM.")
    else:
        # Prepare the data payload for the POST request
        # Convert numpy arrays to lists for JSON serialization
        payload = {
            'demand': sample_request_data['demand'].tolist(),
            'weather': sample_request_data['weather'], # weather is already a list of lists
            'dt': sample_request_data['dt'],
            'neighbors': sample_request_data['neighbors'] # neighbors is already a dictionary
        }

        print(f"\nSending POST request to: {url}")
        try:
            # Send the POST request
            response = requests.post(url, json=payload, timeout=30) # Added a timeout

            # Print the response details
            print(f"Response Status Code: {response.status_code}")
            response_body = None
            try:
                 response_body = response.json()
                 print(f"Response Body: {response_body}")
            except json.JSONDecodeError:
                 print("Response body is not valid JSON.")
                 print(f"Raw response text: {response.text}")


            # Optional: Verify the response structure and content
            if response.status_code == 200:
                if response_body and 'predictions' in response_body:
                    predictions = response_body['predictions']
                    print(f"Received {len(predictions)} predictions.")
                    # Add basic validation for predictions
                    if len(predictions) == 73 and all(isinstance(p, (int, float)) for p in predictions):
                        print("Predictions format looks correct (list of 73 numbers).")
                    else:
                        print(f"Warning: Predictions format might be incorrect. Expected list of 73 numbers, got {len(predictions)} items.")
                else:
                    print("Error: 'predictions' key not found in the successful response body.")
            else:
                print("Prediction failed. Check the response body and server logs for error details.")

        except requests.exceptions.Timeout:
            print(f"Error sending request: Request timed out after 30 seconds. Check if the server is running and accessible.")
        except requests.exceptions.RequestException as e:
            print(f"Error sending request: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

else:
    print("Skipping request sending due to data loading failure.")

**Reasoning**:
The files `test.pkl.gz` and `test_answer.pkl.gz` are still reported as not found in `/data`. However, previous cell outputs (like `Kz1OYPx4ms93`) show they *do* exist in `/data`. This might indicate an issue with the environment's access to `/data` or a temporary glitch. To proceed with testing the web service, I need sample data. Since loading from `/data` is failing, I will try to load the files from the Colab-specific path `/content/data/` which was used in some previous cells (e.g., `WM9Hh5tvm-tW`, although those tests also failed, they did try this path). If loading from `/content/data/` also fails, it indicates a deeper issue with accessing the data files, and I may have to report failure for this subtask as I cannot test the API without input data.



In [None]:
import requests
import gzip
import pickle
import numpy as np
from datetime import datetime
import json # Import json for JSONDecodeError

# Load a small sample of the test data from the /content/data path
try:
    # Trying /content/data paths
    test_pkl_path = "/data/test.pkl.gz"
    test_answer_pkl_path = "/data/test_answer.pkl.gz"

    with gzip.open(test_pkl_path, "rb") as file:
        test_data = pickle.load(file)

    with gzip.open(test_answer_pkl_path, "rb") as file:
        test_answer = pickle.load(file)

    # Take the first entry as a sample
    if test_data: # Check if test_data is not empty
        sample_request_data = test_data[0]
        if test_answer and len(test_answer) > 0: # Check if test_answer is not empty
             sample_expected_output = test_answer[0]
        else:
             sample_expected_output = None # No expected output if test_answer is empty
             print("Warning: test_answer.pkl.gz was empty.")
    else:
        sample_request_data = None
        sample_expected_output = None
        print("Warning: test.pkl.gz was empty.")


    if sample_request_data is not None:
        print("Sample test data loaded successfully.")
        print(f"Sample demand shape: {sample_request_data['demand'].shape}")
        print(f"Sample weather data (today): {sample_request_data['weather'][-1]}")
        print(f"Sample datetime: {sample_request_data['dt']}")
        # Handle case where neighbors might be empty or different
        if sample_request_data.get('neighbors'):
             print(f"Sample neighbors (first zone): {list(sample_request_data['neighbors'].items())[0]}")
        else:
             print("Sample neighbors data is empty or missing.")

        if sample_expected_output is not None:
            print(f"Sample expected output shape: {sample_expected_output.shape}")
        else:
             print("No sample expected output available.")

except FileNotFoundError:
    print(f"Error: One or both files ({test_pkl_path}, {test_answer_pkl_path}) not found.")
    sample_request_data = None
    sample_expected_output = None
except Exception as e:
    print(f"Error loading sample test data: {e}")
    sample_request_data = None
    sample_expected_output = None


if sample_request_data is not None:
    # Define the URL of your deployed Flask service
    # Replace with the actual IP address or domain name and port where your Flask app is running
    # YOU MUST REPLACE THIS PLACEHOLDER URL WITH YOUR ACTUAL SERVICE URL
    # Example for a VM with public IP and port 5000 open:
    vm_public_ip = "YOUR_VM_PUBLIC_IP" # Replace with your VM's public IP
    port = 5000 # Replace with the port your Flask app is listening on (e.g., 5000 or 8000)
    url = f"http://{vm_public_ip}:{port}/predict"
    # ******************************************************************

    # Check if the placeholder IP is still present
    if "YOUR_VM_PUBLIC_IP" in url:
        print("\nERROR: Please replace 'YOUR_VM_PUBLIC_IP' in the code with the actual public IP address of your VM.")
    else:
        # Prepare the data payload for the POST request
        # Convert numpy arrays to lists for JSON serialization
        payload = {
            'demand': sample_request_data['demand'].tolist(),
            'weather': sample_request_data['weather'], # weather is already a list of lists
            'dt': sample_request_data['dt'],
            'neighbors': sample_request_data['neighbors'] # neighbors is already a dictionary
        }

        print(f"\nSending POST request to: {url}")
        try:
            # Send the POST request
            response = requests.post(url, json=payload, timeout=30) # Added a timeout

            # Print the response details
            print(f"Response Status Code: {response.status_code}")
            response_body = None
            try:
                 response_body = response.json()
                 print(f"Response Body: {response_body}")
            except json.JSONDecodeError:
                 print("Response body is not valid JSON.")
                 print(f"Raw response text: {response.text}")


            # Optional: Verify the response structure and content
            if response.status_code == 200:
                if response_body and 'predictions' in response_body:
                    predictions = response_body['predictions']
                    print(f"Received {len(predictions)} predictions.")
                    # Add basic validation for predictions
                    if len(predictions) == 73 and all(isinstance(p, (int, float)) for p in predictions):
                        print("Predictions format looks correct (list of 73 numbers).")
                    else:
                        print(f"Warning: Predictions format might be incorrect. Expected list of 73 numbers, got {len(predictions)} items.")
                else:
                    print("Error: 'predictions' key not found in the successful response body.")
            else:
                print("Prediction failed. Check the response body and server logs for error details.")

        except requests.exceptions.Timeout:
            print(f"Error sending request: Request timed out after 30 seconds. Check if the server is running and accessible.")
        except requests.exceptions.RequestException as e:
            print(f"Error sending request: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

else:
    print("Skipping request sending due to data loading failure.")

## Monitor and scale

### Subtask:
Set up monitoring to track the performance and health of your application and configure scaling to handle varying levels of traffic.


**Reasoning**:
The current subtask is to set up monitoring and scaling. This is a conceptual task that depends heavily on the chosen cloud platform, which was decided in a previous step but not explicitly stated. Since I cannot interact with a specific cloud provider's console or APIs from this environment, I cannot implement monitoring or scaling directly. The most appropriate action is to explain the steps involved conceptually for generic cloud platforms, covering monitoring tools, metric collection, alerting, scaling configuration, and logging, and then finish the task.



In [None]:
# Instructions for Setting Up Monitoring and Scaling

# This guide provides conceptual steps applicable to most cloud platforms (e.g., Google Cloud, AWS, Azure)
# and common deployment methods (VMs, containers). Specific configurations will vary.

# 1. Choose Monitoring Tools:
#    - Select the monitoring service provided by your cloud platform (e.g., Cloud Monitoring/Operations Suite on GCP, CloudWatch on AWS, Azure Monitor on Azure).
#    - Alternatively, consider third-party monitoring solutions like Datadog, New Relic, Prometheus + Grafana, especially for multi-cloud or complex setups.

# 2. Collect Metrics:
#    - Ensure your VM instance or container host has a monitoring agent installed and configured to send system metrics (CPU, memory, disk I/O, network traffic) to your chosen monitoring tool. This is often done automatically with cloud provider images or agents.
#    - Instrument your Flask application to collect application-specific metrics. This might involve:
#        - Adding logging for request duration, error rates, and request counts.
#        - Using libraries like `prometheus_client` (for Prometheus) or platform-specific SDKs to expose custom metrics like prediction latency, number of successful predictions, etc.
#        - For a Gunicorn deployment, basic request metrics might be available via Gunicorn's logging or integration with monitoring agents.

# 3. Set Up Alerting:
#    - In your monitoring tool's console, create alerting policies.
#    - Define conditions based on the metrics collected:
#        - High CPU utilization (e.g., > 80% for 15 minutes)
#        - Low available memory
#        - High network ingress/egress
#        - Increased HTTP error rates (e.g., 5xx responses from your API)
#        - Increased application error rate (based on your custom logging/metrics)
#        - Increased prediction latency
#    - Configure notification channels (email, SMS, PagerDuty, Slack, etc.) to receive alerts.

# 4. Configure Scaling:
#    - For VM-based deployments: Set up a managed instance group (GCP), Auto Scaling group (AWS), or Virtual Machine Scale Set (Azure).
#        - Define instance templates using the VM image where you deployed your application.
#        - Configure scaling policies based on metrics like CPU utilization, load balancer traffic, or custom metrics. Set minimum and maximum numbers of instances.
#    - For containerized deployments (e.g., on Kubernetes): Configure Horizontal Pod Autoscalers (HPA).
#        - Define HPA resources that automatically scale the number of pods based on metrics like CPU utilization, memory usage, or custom metrics from your application.
#    - The goal is to automatically add more instances/pods when demand is high and remove them when demand decreases to optimize cost and performance.

# 5. Implement Logging:
#    - Configure your application and WSGI server (Gunicorn) to log to standard output (stdout) and standard error (stderr).
#    - Use a cloud-based centralized logging service (e.g., Cloud Logging on GCP, CloudWatch Logs on AWS, Azure Monitor Logs on Azure) to collect logs from your VMs or containers.
#    - Ensure the logging agent on your VM/container is configured to capture stdout/stderr.
#    - Structure your application logs (e.g., using JSON format) to include relevant information like timestamps, severity levels, request IDs, error messages, and context, making them easier to search and analyze.

print("Conceptual instructions for setting up monitoring and scaling are provided.")
# As direct implementation is not possible in this environment, the subtask is completed conceptually.

## Summary:

### Data Analysis Key Findings

*   The `app.py` file was successfully created with the necessary Flask structure, model loading logic, and a `/predict` endpoint.
*   A `preprocess_test_for_api` function was implemented in `app.py` to replicate the data preprocessing steps used during model training, specifically addressing lagged demand features and weather data integration.
*   The dependency on a separate file (`model_to_keep.pkl`) containing the expected feature column names (`model.to_keep`) was identified and incorporated into the model loading process in `app.py`.
*   A `requirements.txt` file listing the necessary Python libraries for the web service was generated.
*   Instructions were provided for choosing a WSGI server (Gunicorn was selected) and a deployment platform.
*   Detailed instructions were given for setting up a virtual machine environment on a cloud platform, including creating the VM, selecting an OS, configuring networking/firewalls, connecting via SSH, and updating packages.
*   Instructions were provided for transferring the necessary files (`app.py`, `requirements.txt`, `taxi_demand_model.pkl.gz`, and potentially `model_to_keep.pkl`) to the VM using `scp`.
*   Instructions were provided for installing the Python dependencies on the VM using `pip install -r requirements.txt`.
*   Instructions were provided for running the Flask application using Gunicorn on the VM.
*   Instructions were provided for configuring networking rules (cloud provider firewall, OS firewall, optional load balancer) to make the web service accessible.
*   The task of testing the deployment by sending a request to the `/predict` endpoint failed because the necessary test data files (`test.pkl.gz` and `test_answer.pkl.gz`) could not be loaded from the specified paths (`/data` or `/content/data`).
*   Conceptual instructions were provided for setting up monitoring and scaling for the deployed service, covering tools, metrics, alerting, scaling policies, and logging.

### Insights or Next Steps

*   **Address Test Data Accessibility:** The immediate next step is to ensure the test data files (`test.pkl.gz` and `test_answer.pkl.gz`) are correctly placed and accessible from the environment where the testing script is being run so that the deployment test can be completed successfully.
*   **Refine `app.py` for `model.to_keep`:** Confirm how `model.to_keep` is saved during the training phase. If it's not saved separately, modify the training script to save it (e.g., using `pickle`) so it can be loaded by `app.py`. Alternatively, if the entire `Model` class instance is saved, `app.py` should be updated to load the instance and access `instance.model` and `instance.to_keep`.


In [None]:
!pip install joblib scikit-learn

In [None]:
!pip install numpy==1.26.4 scikit-learn==1.6.1 joblib==1.4.2

In [None]:
from google.colab import files

# Upload taxi_demand_model.pkl.gz (or .pkl)
uploaded = files.upload()

In [None]:
import joblib

OLD_MODEL_PATH = "taxi_demand_model.pkl.gz"   # or taxi_demand_model.pkl if that's your file
NEW_MODEL_PATH = "taxi_demand_model.pkl"

print("🔄 Loading old model...")
model = joblib.load(OLD_MODEL_PATH)

print("💾 Re-saving model in compatible format...")
joblib.dump(model, NEW_MODEL_PATH, compress=3)

print(f"✅ Model re-saved as {NEW_MODEL_PATH}")

In [None]:
from google.colab import files
files.download(NEW_MODEL_PATH)