## Predict New York Taxi Trip Amount


### Read data from open dataset

In [4]:
from azureml.opendatasets import NycTlcGreen
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Build a 12-month sample of 2015 green-taxi trips: for each month, load the
# data for that month's date window and keep 2,000 randomly sampled rows.
start = datetime.strptime("1/1/2015","%m/%d/%Y")
end = datetime.strptime("1/31/2015","%m/%d/%Y")

# Collect the monthly samples in a list and concatenate once at the end.
# DataFrame.append was deprecated and later removed from pandas, and growing a
# frame inside a loop re-copies every previously accumulated row each time.
monthly_samples = []
for sample_month in range(12):
    # Offsets are always applied to the January window, so each iteration
    # shifts both bounds by the same whole number of months.
    temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
        .to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000))

# The original row index of every sampled trip is preserved (no ignore_index).
green_taxi_df = pd.concat(monthly_samples)

[Info] read from /tmp/tmpl2wjqn9k/https/azureopendatastorage.azurefd.net/nyctlc/green/puYear=2015/puMonth=1/part-00175-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2745-1.c000.snappy.parquet
[Info] read from /tmp/tmp7jm3m5k8/https/azureopendatastorage.azurefd.net/nyctlc/green/puYear=2015/puMonth=2/part-00007-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2577-1.c000.snappy.parquet
[Info] read from /tmp/tmpwsodnt2i/https/azureopendatastorage.azurefd.net/nyctlc/green/puYear=2015/puMonth=3/part-00133-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2703-1.c000.snappy.parquet
[Info] read from /tmp/tmpcavk883v/https/azureopendatastorage.azurefd.net/nyctlc/green/puYear=2015/puMonth=4/part-00073-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2643-1.c000.snappy.parquet
[Info] read from /tmp/tmp9erns6p4/https/azureopendatastorage.azurefd.net/nyctlc/green/puYear=2015/puMonth=5/part-00177-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd30

In [5]:
# Preview the first 10 rows of the sampled data.
green_taxi_df.head(10)

vendorID  lpepPickupDatetime   ...    puYear  puMonth
644579          2 2015-01-04 05:33:54   ...      2015        1
1198436         2 2015-01-25 18:21:25   ...      2015        1
142680          2 2015-01-13 15:45:37   ...      2015        1
891114          2 2015-01-05 22:47:11   ...      2015        1
1086485         2 2015-01-03 10:33:20   ...      2015        1
1340084         1 2015-01-20 20:28:55   ...      2015        1
863879          1 2015-01-05 08:29:31   ...      2015        1
844153          2 2015-01-04 22:39:36   ...      2015        1
1425893         2 2015-01-13 07:16:38   ...      2015        1
1346315         1 2015-01-20 23:06:03   ...      2015        1

[10 rows x 25 columns]

### Data exploration and feature engineering


In [7]:
def build_time_features(vector):
    """Derive calendar features from a pickup timestamp.

    Parameters
    ----------
    vector : pandas.Series or sequence
        Row whose first element is the pickup datetime. The element must
        expose ``month``, ``day``, ``hour`` and a ``weekday()`` method.

    Returns
    -------
    pandas.Series
        Four values in this order: month number, day of month, day of week
        (as returned by ``weekday()``) and hour of day.
    """
    # Access the first element by position explicitly. Integer keys on a
    # label-indexed Series (e.g. a row produced by DataFrame.apply(axis=1))
    # rely on a deprecated positional fallback; plain sequences have no .iloc,
    # so they keep using ordinary indexing.
    pickup_datetime = vector.iloc[0] if hasattr(vector, "iloc") else vector[0]
    return pd.Series((
        pickup_datetime.month,
        pickup_datetime.day,
        pickup_datetime.weekday(),
        pickup_datetime.hour,
    ))

# Apply build_time_features row by row to the pickup timestamp column and
# store the four resulting values as new columns on green_taxi_df.
time_feature_names = ["month_num", "day_of_month","day_of_week", "day_of_hour"]
pickup_times = green_taxi_df[["lpepPickupDatetime"]]
green_taxi_df[time_feature_names] = pickup_times.apply(build_time_features, axis=1)

# Preview the first 10 rows to check the newly added time-feature columns.
green_taxi_df.head(10)

vendorID  lpepPickupDatetime     ...      day_of_week  day_of_hour
644579          2 2015-01-04 05:33:54     ...                6            5
1198436         2 2015-01-25 18:21:25     ...                6           18
142680          2 2015-01-13 15:45:37     ...                1           15
891114          2 2015-01-05 22:47:11     ...                0           22
1086485         2 2015-01-03 10:33:20     ...                5           10
1340084         1 2015-01-20 20:28:55     ...                1           20
863879          1 2015-01-05 08:29:31     ...                0            8
844153          2 2015-01-04 22:39:36     ...                6           22
1425893         2 2015-01-13 07:16:38     ...                1            7
1346315         1 2015-01-20 23:06:03     ...                1           23

[10 rows x 29 columns]

In [8]:
# Columns dropped before modelling. The pickup timestamp has already been
# expanded into the month/day/weekday/hour feature columns.
columns_to_remove = [
    "lpepPickupDatetime",
    "lpepDropoffDatetime",
    "puLocationId",
    "doLocationId",
    "extra",
    "mtaTax",
    "improvementSurcharge",
    "tollsAmount",
    "ehailFee",
    "tripType",
    "rateCodeID",
    "storeAndFwdFlag",
    "paymentType",
    "fareAmount",
    "tipAmount",
    "puYear",
    "puMonth",
]
# Remove the columns from green_taxi_df itself (the frame is modified in place).
green_taxi_df.drop(columns=columns_to_remove, inplace=True)

green_taxi_df.head(5)

vendorID  passengerCount     ...       day_of_week  day_of_hour
644579          2               1     ...                 6            5
1198436         2               1     ...                 6           18
142680          2               1     ...                 1           15
891114          2               1     ...                 0           22
1086485         2               1     ...                 5           10

[5 rows x 12 columns]

In [9]:
# Summary statistics of the remaining columns, inspected before filtering.
green_taxi_df.describe()

vendorID  passengerCount      ...        day_of_week   day_of_hour
count  24000.000000    24000.000000      ...       24000.000000  24000.000000
mean       1.785917        1.381208      ...           3.254667     13.602458
std        0.410193        1.059608      ...           1.957861      6.816819
min        1.000000        0.000000      ...           0.000000      0.000000
25%        2.000000        1.000000      ...           2.000000      9.000000
50%        2.000000        1.000000      ...           3.000000     15.000000
75%        2.000000        1.000000      ...           5.000000     19.000000
max        2.000000        9.000000      ...           6.000000     23.000000

[8 rows x 12 columns]

In [10]:
# Row filters applied one after another: a latitude/longitude bounding box for
# the pickup point, a trip-distance range, and strictly positive passenger
# count and total amount.
trip_filters = [
    "pickupLatitude>=40.53 and pickupLatitude<=40.88",
    "pickupLongitude>=-74.09 and pickupLongitude<=-73.72",
    "tripDistance>=0.25 and tripDistance<31",
    "passengerCount>0 and totalAmount>0",
]
final_df = green_taxi_df
for condition in trip_filters:
    final_df = final_df.query(condition)

# The coordinates were only needed for the bounding-box filter above; they are
# removed so they are not used as training features.
columns_to_remove_for_training = ["pickupLongitude", "pickupLatitude", "dropoffLongitude", "dropoffLatitude"]
final_df = final_df.drop(columns=columns_to_remove_for_training)

644579    -73.891006
1198436   -73.953629
142680    -73.946533
891114    -73.910591
1086485   -73.949898
1340084   -73.945251
863879    -73.889587
844153    -73.954506
1425893   -73.958076
1346315   -73.863243
304048    -73.976158
1345746   -73.980453
669120    -73.961884
1486660   -73.938370
1085787   -73.844276
367270    -73.977875
1018372   -73.956589
353942    -73.966751
1213590   -73.990524
1256727   -73.956200
1221411   -73.939064
510193    -73.980919
1219948   -73.974396
1112821   -73.992294
1400587   -73.983559
737116    -73.964600
237952    -73.963425
1344762   -73.944901
665210    -73.915962
1217369   -73.948532
             ...    
1052970   -73.807533
896787    -73.940079
1460802   -73.951485
1142015   -73.992432
1339920   -73.943207
563241    -73.953926
1601722   -73.947716
844857    -73.910439
1267713   -73.974426
393674    -73.937485
1128870   -73.920326
592633    -73.992271
633992    -73.996613
1373820   -73.927765
1193259   -73.956650
1545013   -73.957222
333945    -73

In [11]:
# Summary statistics of the filtered data set.
final_df.describe()

vendorID  passengerCount      ...        day_of_week   day_of_hour
count  23268.000000    23268.000000      ...       23268.000000  23268.000000
mean       1.787519        1.382070      ...           3.258939     13.631339
std        0.409072        1.059769      ...           1.956602      6.820333
min        1.000000        1.000000      ...           0.000000      0.000000
25%        2.000000        1.000000      ...           2.000000      9.000000
50%        2.000000        1.000000      ...           3.000000     15.000000
75%        2.000000        1.000000      ...           5.000000     19.000000
max        2.000000        6.000000      ...           6.000000     23.000000

[8 rows x 8 columns]

In [12]:
# Render the filtered frame. NOTE(review): this passes the whole frame to
# display(); consider final_df.head() if the output becomes too large.
display(final_df)

## Set up AutoML and run the experiment


In [13]:
import azureml.core
import logging
from azureml.core.workspace import Workspace
from azureml.core import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
import os

# Workspace coordinates are taken from environment variables; the placeholder
# text is used for any variable that is not set.
subscription_id = os.environ.get("SUBSCRIPTION_ID", "<subscription id>")
resource_group = os.environ.get("RESOURCE_GROUP", "<resource group>")
workspace_name = os.environ.get("WORKSPACE_NAME", "<AML workspace name>")
workspace_region = os.environ.get("WORKSPACE_REGION", "<Region>")

# Connect to the workspace and write its configuration out via write_config().
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
ws.write_config()

In [14]:
from sklearn.model_selection import train_test_split

# Separate the label (totalAmount) from the features without mutating
# final_df. Using final_df.pop() here removed the column in place, so running
# this cell a second time failed with a KeyError.
y_df = final_df["totalAmount"]
x_df = final_df.drop(columns=["totalAmount"])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)

In [15]:
import logging

# Keyword arguments later unpacked into AutoMLConfig (**automl_settings).
automl_settings = dict(
    iteration_timeout_minutes=2,
    iterations=5,
    primary_metric='spearman_correlation',
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=2,
)

In [16]:
from azureml.train.automl import AutoMLConfig

label = "totalAmount"

# Regression task trained on the in-memory training split; the remaining
# options come from automl_settings.
# Left disabled, as in the original cell:
#   compute_target = AMLCompute
#   label_column_name = label
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automl_errors_5.log',
    X=x_train,
    y=y_train,
    enable_onnx_compatible_models=True,
    **automl_settings,
)

In [17]:
from azureml.core.experiment import Experiment
# Create (or reuse) the named experiment in the workspace and submit the
# AutoML configuration, streaming progress to the cell output.
experiment = Experiment(workspace=ws, name="greentaxi-experiment1")
local_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_53e22dfc-ee81-4ac2-928b-44842721f960

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feature handling: https://aka.ms/AutomatedMLFeaturization

**************************************************************************************************

### Retrieve the best ONNX model


In [18]:
# Ask the finished AutoML run for its output with the ONNX model requested,
# then unpack the pair into the run and the model.
automl_output = local_run.get_output(return_onnx_model=True)
best_run, onnx_mdl = automl_output

In [19]:
from azureml.automl.runtime.onnx_convert.onnx_converter import OnnxConverter
# Local path the best ONNX model is written to.
onnx_fl_path = "./taxi_best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path) # save the best ONNX model

In [20]:
# Write the serialized model to taxi_best_model.onnx in the working directory.
# NOTE(review): this appears to target the same file as onnx_fl_path above —
# confirm whether both save steps are needed.
with open("taxi_best_model.onnx", "wb") as model_file:
    serialized_model = onnx_mdl.SerializeToString()
    model_file.write(serialized_model)

1604695

### Export ONNX model


In [21]:
from azure.storage.blob import BlobServiceClient
from azure.storage.blob import BlobClient

import os

# Read the storage connection string from the environment so the account key
# does not have to be pasted into (and saved with) the notebook. The original
# placeholder string remains the default when the variable is not set.
connection_string = os.getenv(
    "AZURE_STORAGE_CONNECTION_STRING",
    default="DefaultEndpointsProtocol=https;AccountName=<storage_account_name>;AccountKey=<key>;EndpointSuffix=core.windows.net",
)
service = BlobServiceClient.from_connection_string(conn_str=connection_string)

# Client for the destination blob; replace the <...> placeholders with the
# real container and folder names.
blob = BlobClient.from_connection_string(conn_str=connection_string, container_name="<container_name>", blob_name="<folder_name>/taxi_best_model.onnx")

# Upload the saved ONNX file, replacing any existing blob with the same name.
with open("./taxi_best_model.onnx", "rb") as data:
    blob.upload_blob(data,overwrite=True)

{'etag': '"0x8D838E4F321357F"', 'last_modified': datetime.datetime(2020, 8, 5, 2, 12, 5, tzinfo=datetime.timezone.utc), 'content_md5': bytearray(b'>\xea\xf0<\xb33\xc1\x88n\xb5\xe8\xabz\xce\xc7\xae'), 'client_request_id': '0ed3f72e-d6c1-11ea-af5a-000d3ac3c2f6', 'request_id': '8266be62-801e-008a-1bcd-6a2a6c000000', 'version': '2019-07-07', 'date': datetime.datetime(2020, 8, 5, 2, 12, 4, tzinfo=datetime.timezone.utc), 'request_server_encrypted': True, 'encryption_key_sha256': None, 'encryption_scope': None, 'error_code': None}