In [2]:
from azureml.core import Workspace, Dataset
import numpy as np 
import pandas as pd 

secret_subscription_id_value = ''
resource_group = ''
workspace_name = ''
workspace = Workspace(secret_subscription_id_value, resource_group, workspace_name)

In [3]:
import pandas as pd

df = pd.read_csv('../data/yellow_tripdata_2015-02.csv', engine='python')
df = df.sample(frac=0.2)
df_backup = df.copy(deep=True)

In [4]:
# Change Data type 
df[['tpep_pickup_datetime', 'tpep_dropoff_datetime']] = df[['tpep_pickup_datetime', 'tpep_dropoff_datetime']].apply(pd.to_datetime)
df[['VendorID','RateCodeID','payment_type', 'store_and_fwd_flag' ]] = df[['VendorID','RateCodeID','payment_type', 'store_and_fwd_flag' ]].astype(str)

df = df.drop_duplicates(keep='last')  # drop duplicates

In [5]:
# remove pickup and drops outside newyork city limits

df.drop(df.index[
        ~((df['pickup_latitude'].between(40.496115395170364, 40.91553277700258)) &
          (df['pickup_longitude'].between(-74.25559136315209, -73.7000090639354))) 
], inplace=True)

df.drop(df.index[
        ~((df['dropoff_latitude'].between(40.496115395170364, 40.91553277700258)) &
          (df['dropoff_longitude'].between(-74.25559136315209, -73.7000090639354))) 
], inplace=True)
df.shape

df = df.reset_index()

(2435102, 19)

In [6]:
# Remove the outliers - tip distance 0 to be removed

def computer_remove_outliers(df, x, remove_outlier=1):
    q1 = df[x].quantile(0.25)
    q3 = df[x].quantile(0.75)
    iqr = q3-q1
    ll = q1 - 1.5*(iqr)
    ul = q3 + 1.5*(iqr)
    print('lower limit of dist is {} and upper limit of dist is {}'.format(ll, ul))

    # remove all negative distances and distances greater than ul
    if remove_outlier==1:
        df = df[(df[x]>ll) & (df[x]<ul)]
        df.shape
        return df

df = computer_remove_outliers(df, 'trip_distance')
df = df[df['trip_distance'] > 0]

lower limit of dist is -2.1049999999999995 and upper limit of dist is 6.174999999999999


In [None]:
# Update datatype of datetime columns

df['time_diff_hours'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).values.astype('timedelta64[h]')
df['time_diff_hours'] = df['time_diff_hours'].astype(int)

In [8]:
# identify outliers in date columns - trips more than 2 hours could be outliers and to be removed

df['time_diff_hours'].value_counts().sort_values(ascending=False)
df = df[df["time_diff_hours"] < 2]

In [None]:
# remove cancelled rides

df['time_diff_min'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).values.astype('timedelta64[m]').astype(int)
df[df['time_diff_min']==0]
df = df[df["time_diff_min"] > 0]  

In [9]:
# remove uncommon passenger counts

df["passenger_count"].value_counts(normalize=True).sort_values(ascending=False) * 100
df = df[(df["passenger_count"] <=2) & (df["passenger_count"] >0)]

(1850959, 22)

In [10]:
# Remove outliers from cost / amount related columns

df = df[(df['total_amount'] > 0) & (df['total_amount'] < 500)]
df = df[df['extra'] >= 0]
computer_remove_outliers(df, 'total_amount', 0)
df = df[df['total_amount'] < 200]

lower limit of dist is -1.8000000000000007 and upper limit of dist is 24.76


(1850598, 22)

In [11]:
# Create clusters based on location

from sklearn.cluster import KMeans

df_location = df[['pickup_latitude', 'pickup_longitude']]
kmeans = KMeans(n_clusters = 10) 
df_location['cluster_label'] = kmeans.fit_predict(df_location)  # df_location has lat and long in a DF.
df_location['cluster_label'].value_counts().sort_values(ascending=False)  # check count per cluster

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_location['cluster_label'] = kmeans.fit_predict(df_location)  # df_location has lat and long in a DF.


cluster_label
1    419711
9    359259
2    284075
3    257172
6    201107
4    184058
7     87519
8     36611
0     18638
5      2448
Name: count, dtype: int64

In [12]:
# Merge location cluster with main dataframe

df_with_loc_clusters = pd.merge(df_location, df, on=['pickup_latitude', 'pickup_longitude'])
df_with_loc_clusters['pickup_hour'] = df_with_loc_clusters["tpep_pickup_datetime"].dt.hour
df_with_loc_clusters['pickup_dayofweek'] = df_with_loc_clusters["tpep_pickup_datetime"].dt.dayofweek

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_dayofweek,1679.0,3.0,2.001192,0.0,1.0,3.0,5.0,6.0
pickup_hour,1679.0,11.504467,6.923889,0.0,6.0,12.0,17.5,23.0
cluster_label,1679.0,4.499702,2.873967,0.0,2.0,4.0,7.0,9.0
demand,1679.0,21.169839,20.995945,0.02,2.585,13.03,35.52,100.0


In [None]:
# Create demand column which is target column for the demand prediction model

df_model = df_with_loc_clusters.groupby(['pickup_dayofweek','pickup_hour', 'cluster_label']).size().reset_index()
df_model = df_model.rename(columns={0:'demand'})
df_model['demand'] = df_model['demand']/df_model['demand'].max()
df_model['demand'] = round((df_model['demand'] * 100), 2)

In [13]:
# Model preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

X = df_model.drop(["demand"],axis=1)
y = df_model[['demand']]
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=100)

cat_columns = ['pickup_dayofweek', 'pickup_hour', 'cluster_label']
enc = OneHotEncoder()
enc.fit(X_train[cat_columns])

X_train_encoded = enc.transform(X_train[cat_columns]).toarray()
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=enc.get_feature_names_out(cat_columns))
X_train_model = pd.concat([X_train.reset_index(), X_train_encoded_df], axis=1).drop(['index'], axis=1)

X_test_encoded = enc.transform(X_test[cat_columns]).toarray()
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=enc.get_feature_names_out(cat_columns))
X_test_model = pd.concat([X_test.reset_index(), X_test_encoded_df], axis=1).drop(['index'], axis=1)

X_train_model = X_train_model.drop(cat_columns, axis=1)
X_test_model = X_test_model.drop(cat_columns, axis=1)

X_train_model[X_train_model.columns] = X_train_model[X_train_model.columns].astype(int)
X_test_model[X_test_model.columns] = X_test_model[X_test_model.columns].astype(int)

y_train_model = y_train.reset_index().drop(['index'], axis=1)
y_test_model = y_test.reset_index().drop(['index'], axis=1)

y_train_model = y_train_model['demand']
y_test_model = y_test_model['demand']

In [18]:
# Hpyerparameter tuning for Random Forest algorithm

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

rf_model = RandomForestRegressor()
param_dist = { 
    'n_estimators': [200, 500, 1000],  # trees in forest
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8]  
}
grid_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, cv = 3, 
                                   verbose=1, n_jobs=-1)
grid_search.fit(X_train_model, y_train_model)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=200)


In [19]:
# Training random forest using hyper parameters selected using randomized search CV.

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

params = {"max_depth": 8, "min_samples_split": 7, "n_estimators": 1000}
model_rf = RandomForestRegressor(**params)
model_rf.fit(X_train_model , y_train_model) 

prediction_train = model_rf.predict(X_train_model)
mse = mean_squared_error(prediction_train, y_train_model)
r2 = model_rf.score(X_train_model,y_train_model)
print('{}: MSE is {} and R2 is {}'.format('random forest on Train', round(mse,2), round(r2,2)))

prediction = model_rf.predict(X_test_model)
mse = mean_squared_error(prediction, y_test_model)
r2 = model_rf.score(X_test_model,y_test_model)
print('{}: MSE is {} and R2 is {}'.format('random forest on Test', round(mse,2), round(r2,2)))

random forest on Train: MSE is 74.34 and R2 is 0.83


random forest on Test: MSE is 87.39 and R2 is 0.8




In [20]:
# Save the model and its artifacts

import joblib

joblib.dump(kmeans, "../model_artifacts/kmeans.joblib")
joblib.dump(enc, "../model_artifacts/enc.bin", compress=True)
joblib.dump(model_rf, "../model_artifacts/taxi_demand_prediction.joblib")

['../model_artifacts/taxi_demand_prediction.joblib']

In [21]:
# Register model

from azureml.core.model import Model
import urllib.request

model = Model.register(workspace, model_name="taxi_demand_prediction", model_path="../model_artifacts/") 

Registering model taxi_demand_prediction


In [22]:
# Environment setup

from azureml.core import Environment
from azureml.core.model import InferenceConfig

env = Environment(name="taxi_demand_prediction")

python_packages = ['azure-ml-api-sdk','numpy', 'pandas', 'seaborn', 'matplotlib', 'scipy', 'scikit-learn', 'joblib','requests']
for package in python_packages:
    env.python.conda_dependencies.add_pip_package(package)

In [None]:
# Inference configuration setup

inference_config = InferenceConfig(
    environment=env,
    source_directory="../source_dir",
    entry_script="echo_score.py",
)

In [27]:
# Local Deployment

from azureml.core.webservice import LocalWebservice
from azureml.core.webservice import AciWebservice

deployment_config = LocalWebservice.deploy_configuration(port=6789)

service = Model.deploy(
    workspace,
    "taxidemandprediction",
    [model],
    inference_config,
    deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)

print(service.get_logs())

In [46]:
# Local Testing

import requests
import json

uri = service.scoring_uri
requests.get("http://localhost:6789")
headers = {"Content-Type": "application/json"}
data = {"pickup_latitude":"-73.980492","pickup_longitude":"40.777981","tpep_pickup_datetime":"2020-02-13 23:40:00"}
data = json.dumps(data)
response = requests.post(uri, data=data, headers=headers)
print(response.json())

demand percent is 61.33


In [1]:
# Local Testing Logs

service.get_logs()

In [47]:
# Remote Deployment

deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=0.5, memory_gb=1, auth_enabled=True
)

service = Model.deploy(
    workspace,
    "taxidemandprediction",
    [model],
    inference_config,
    deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)

print(service.get_logs())

In [50]:
# Testing Remote Test Calls

import requests
import json
from azureml.core import Webservice

service = Webservice(workspace=workspace, name="taxidemandprediction")
scoring_uri = service.scoring_uri

# If the service is authenticated, set the key or token
key, _ = service.get_keys()

# Set the appropriate headers
headers = {"Content-Type": "application/json"}
headers["Authorization"] = f"Bearer {key}"

# Make the request and display the response and logs
data = {"pickup_latitude":"-73.980492","pickup_longitude":"40.777981","tpep_pickup_datetime":"2020-02-13 23:40:00"}

data = json.dumps(data)
resp = requests.post(scoring_uri, data=data, headers=headers)
print(resp.text)

"demand percent is 61.33"
