In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import math 

In [2]:
print(pd.__version__)

1.5.1


In [3]:
from azureml.core import Workspace


ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

mlads-3
rg-mlads-3
eastus
d83b98a9-eaa6-475f-9ae6-1ef35394a1e5


In [4]:
model_metrics = pd.DataFrame(columns=["Model", "MSE", "RMSE", "R2"], dtype=object)


def metrics(model, y, y_hat):

    model_metrics.loc[-1] = {"Model" : model, 
              "MSE" : mean_squared_error(y, y_hat),
              "RMSE" : math.sqrt(mean_squared_error(y, y_hat)),
              "R2" : r2_score(y, y_hat)}

    model_metrics.index = model_metrics.index + 1
    return model_metrics 

#metrics("PCA Forest",y_test, epa_pca_y_pred)

In [5]:
epa = pd.read_csv('https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv')

In [6]:
epa.head(10)

Unnamed: 0,RowNumber,Represented.Test.Veh.Make,Model,Vehicle.Type,HorsePower,Cylinders,Tested.Transmission.Type.Code,Tested.Transmission.Type,Gears,Drive.System.Code,Weight,AxleRatio,Test.Procedure.Cd,Test.Procedure.Description,Test.Fuel.Type.Cd,Test.Fuel.Type.Description,FuelEcon
0,1,Aston Martin,Rapide S,Car,552,12,SA,Semi-Automatic,8,R,4750,2.73,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,17.3
1,2,Aston Martin,Vanquish,Car,568,12,SA,Semi-Automatic,8,R,4500,2.73,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,16.5
2,3,BENTLEY,Continental GT,Car,616,12,SA,Semi-Automatic,8,F,6000,2.85,90,US06,61,Tier 2 Cert Gasoline,17.4
3,4,BENTLEY,Continental GT,Car,616,12,SA,Semi-Automatic,8,F,6000,2.85,11,Cold CO,27,Cold CO Premium (Tier 2),13.6
4,5,BMW,230i Convertible,Car,248,4,SA,Semi-Automatic,8,R,4000,2.81,3,HWFE,61,Tier 2 Cert Gasoline,45.8
5,6,BMW,230i Coupe,Car,248,4,M,Manual,6,R,3625,3.91,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,26.4
6,7,BMW,230i Coupe,Car,248,4,SA,Semi-Automatic,8,R,3625,2.81,3,HWFE,61,Tier 2 Cert Gasoline,50.6
7,8,BMW,230i xDrive Convertible,Car,248,4,SA,Semi-Automatic,8,R,4000,2.81,31,Federal fuel 3-day exhaust,61,Tier 2 Cert Gasoline,29.6
8,9,BMW,230i xDrive Coupe,Car,248,4,SA,Semi-Automatic,8,R,3750,2.81,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,30.3
9,10,BMW,320i,Both,181,4,A,Automatic,8,R,3625,3.2,31,Federal fuel 3-day exhaust,61,Tier 2 Cert Gasoline,30.4


In [7]:
epa.shape

(1034, 17)

In [8]:
#Drop the row number
epa = epa.drop(epa.columns[[0]], axis=1)

#replace the "." in the column names with "_"
epa.columns = epa.columns.str.replace('.', '_')

# Drop the first three columns
epa = epa.drop(epa.columns[[0,1,2]], axis=1)

# drop descrition columns
epa = epa.drop(epa.columns[[3,9,11]], axis=1)
epa


  epa.columns = epa.columns.str.replace('.', '_')


Unnamed: 0,HorsePower,Cylinders,Tested_Transmission_Type_Code,Gears,Drive_System_Code,Weight,AxleRatio,Test_Procedure_Cd,Test_Fuel_Type_Cd,FuelEcon
0,552,12,SA,8,R,4750,2.73,21,61,17.3
1,568,12,SA,8,R,4500,2.73,21,61,16.5
2,616,12,SA,8,F,6000,2.85,90,61,17.4
3,616,12,SA,8,F,6000,2.85,11,27,13.6
4,248,4,SA,8,R,4000,2.81,3,61,45.8
...,...,...,...,...,...,...,...,...,...,...
1029,313,4,SA,8,A,5000,3.33,21,61,33.1
1030,250,4,SA,8,F,5000,3.33,31,61,27.4
1031,250,4,SA,8,F,4750,3.33,31,61,28.6
1032,316,4,SA,8,F,5000,3.33,3,61,37.9


In [9]:
epa['Tested_Transmission_Type_Code']= epa['Tested_Transmission_Type_Code'].astype('category')
    
epa['Drive_System_Code']= epa['Drive_System_Code'].astype('category')

In [10]:
#One hot encode categories
epa = pd.get_dummies(epa)

In [11]:
# Create the training dataset for scikit learn, you will need all varialbes except the labe you are trying to predict
epa_X = epa.iloc[:, epa.columns !='FuelEcon']
#epa_X = epa.iloc[:, epa.columns =='Weight']
epa_X



Unnamed: 0,HorsePower,Cylinders,Gears,Weight,AxleRatio,Test_Procedure_Cd,Test_Fuel_Type_Cd,Tested_Transmission_Type_Code_A,Tested_Transmission_Type_Code_AM,Tested_Transmission_Type_Code_AMS,Tested_Transmission_Type_Code_CVT,Tested_Transmission_Type_Code_M,Tested_Transmission_Type_Code_SA,Tested_Transmission_Type_Code_SCV,Drive_System_Code_4,Drive_System_Code_A,Drive_System_Code_F,Drive_System_Code_P,Drive_System_Code_R
0,552,12,8,4750,2.73,21,61,0,0,0,0,0,1,0,0,0,0,0,1
1,568,12,8,4500,2.73,21,61,0,0,0,0,0,1,0,0,0,0,0,1
2,616,12,8,6000,2.85,90,61,0,0,0,0,0,1,0,0,0,1,0,0
3,616,12,8,6000,2.85,11,27,0,0,0,0,0,1,0,0,0,1,0,0
4,248,4,8,4000,2.81,3,61,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,313,4,8,5000,3.33,21,61,0,0,0,0,0,1,0,0,1,0,0,0
1030,250,4,8,5000,3.33,31,61,0,0,0,0,0,1,0,0,0,1,0,0
1031,250,4,8,4750,3.33,31,61,0,0,0,0,0,1,0,0,0,1,0,0
1032,316,4,8,5000,3.33,3,61,0,0,0,0,0,1,0,0,0,1,0,0


In [12]:
epa_y = epa.iloc[:, epa.columns =='FuelEcon']

In [13]:
# Split the training and test set 
X_train, X_test, y_train, y_test = train_test_split(epa_X, epa_y, test_size=0.20)

In [14]:
epa_forest = RandomForestRegressor()
epa_forest.fit(X_train, y_train)

  epa_forest.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [15]:
forest_y_hat = epa_forest.predict(X_test)

In [16]:
metrics("RandomForestRegressor",y_test, forest_y_hat)

Unnamed: 0,Model,MSE,RMSE,R2
0,RandomForestRegressor,14.56965,3.817021,0.82704


In [17]:
joblib.dump(epa_forest, 'EPA_sklearn_regression_model.pkl')

['EPA_sklearn_regression_model.pkl']

In [None]:
import numpy as np

from azureml.core import Dataset


np.savetxt('EPA_features.csv', epa_X, delimiter=',')
np.savetxt('EPA_labels.csv', epa_y, delimiter=',')

datastore = ws.get_default_datastore()
datastore.upload_files(files=['./EPA_features.csv', './EPA_labels.csv'],
                       target_path='EPA_sklearn_regression/',
                       overwrite=True)

input_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'EPA_sklearn_regression/EPA_features.csv')])
output_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'EPA_sklearn_regression/EPA_labels.csv')])

"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 2 files
Uploading ./EPA_features.csv
Uploaded ./EPA_features.csv, 1 files out of an estimated total of 2
Uploading ./EPA_labels.csv
Uploaded ./EPA_labels.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [None]:
import sklearn

from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration


model = Model.register(workspace=ws,
                       model_name='EPA-sklearn-model-R-Forest',                # Name of the registered model in your workspace.
                       model_path='./EPA_sklearn_regression_model.pkl',  # Local file to upload and register as a model.
                       model_framework=Model.Framework.SCIKITLEARN,  # Framework used to create the model.
                       model_framework_version=sklearn.__version__,  # Version of scikit-learn used to create the model.
                       sample_input_dataset=input_dataset,
                       sample_output_dataset=output_dataset,
                       resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
                       description='SKLearn Random Forest to predict EPA MPG.',
                       tags={'area': 'EPA', 'type': 'regression'})

print('Name:', model.name)
print('Version:', model.version)

In [None]:
service_name = 'epa-sklearn-forest-service'

service = Model.deploy(ws, service_name, [model], overwrite=True)
service.wait_for_deployment(show_output=True)

In [None]:
X_test[0:1].to_numpy().tolist()

In [None]:
import json

input_payload = json.dumps({
    'data': X_test[64:65].to_numpy().tolist(),
    'method': 'predict'  # If you have a classification model, you can get probabilities by changing this to 'predict_proba'.
})

output = service.run(input_payload)

print(output)

In [None]:
#service.delete()