In [1]:
# @hidden_cell
# The project token is an authorization token used to access project resources
# (data sources, connections) and by the platform APIs.
# NOTE(review): both identifiers below are embedded in the notebook source; keep this
# cell hidden when sharing and consider loading them from the environment instead.
from projectnb import ProjectContext, ProjectUtil

project_credentials = (
    'd10b8cd6-7023-414e-8a9f-39d168e46181',
    'p-18797feb3787c5fcfe7f09cc1935c08ab41c7cc1',
)
pc = ProjectContext.ProjectContext(sc, *project_credentials)

In [3]:
# Load the 'rbweather_filtered.csv' project data file through the project context.
weather_file = "rbweather_filtered.csv"
df = ProjectUtil.load_dataframe_from_file(pc, weather_file)

# Peek at the first two rows.
df.take(2)

[Row(name=u'C.Newton', playerid=u'00-0027939', position=u'QB', rushing_att=7, rushing_yds=43, rushing_tds=0, receiving_yds=0, home=True, teamid=u'CAR', team=u'Carolina Panthers', teamloc=u'Carolina', stadium_city=u'Bank of America Stadium', capacity=u'75,419', surface=u'Voyager Bermuda Grass', roof=u'Open', home_score=30, home_team=u'CAR', away_score=20, away_team=u'ARI', gameid=2016103009, week=8, year=2016, month=10, day=30, time=u'4:25', ampm=u'PM', weather=u'80,80,8,None,80,55,29.31,10', temp=80, hi=80, wspd=8, precip_total=None, feels_like=80, dewPt=55, pressure=29, vis=10),
 Row(name=u'J.Stewart', playerid=u'00-0026153', position=u'RB', rushing_att=25, rushing_yds=95, rushing_tds=2, receiving_yds=25, home=True, teamid=u'CAR', team=u'Carolina Panthers', teamloc=u'Carolina', stadium_city=u'Bank of America Stadium', capacity=u'75,419', surface=u'Voyager Bermuda Grass', roof=u'Open', home_score=30, home_team=u'CAR', away_score=20, away_team=u'ARI', gameid=2016103009, week=8, year=201

In [4]:
# Number of rows in the frame as loaded (the cells that filter it come later in the notebook).
df.count()

4435

In [5]:
# Drop rows that have no temperature reading.
df = df.filter("not temp is Null")

In [6]:
# Add 'ishome': the 'home' column cast to integer.
home_as_int = df['home'].cast('integer')
df = df.withColumn('ishome', home_as_int)

In [7]:
# Register the positions that occur in fewer than 10 rows as temp view "lowPosCount".
position_counts = df.groupBy("position").count()
rare_positions = position_counts.where('count < 10')
rare_positions.createOrReplaceTempView("lowPosCount")

In [8]:
# Register the player names that occur in fewer than 10 rows as temp view "lowNameCount".
name_counts = df.groupBy("name").count()
rare_names = name_counts.where('count < 10')
rare_names.createOrReplaceTempView("lowNameCount")

In [9]:
# Expose the current frame to Spark SQL under the name "allRows".
df.createOrReplaceTempView("allRows")

In [16]:
from pyspark.sql import SparkSession

# Reuse the running Spark session (or start one) so the temp views can be queried.
spark = SparkSession.builder.getOrCreate()

# Keep only rows whose player name and position are absent from the
# low-count views "lowNameCount" and "lowPosCount".
frequent_rows_query = """select * from allRows where 
                                            name not in (select name from lowNameCount) and
                                            position not in (select position from lowPosCount)"""
morePlayers = spark.sql(frequent_rows_query)

In [None]:
# Features: temp, wspd, roof, surface, home, stadium_city, away_team, name, position, teamid
# Label: rushing_yds

In [20]:
from pyspark.ml.feature import StringIndexer


def _string_indexer(source_col, indexed_col):
    """Return a StringIndexer reading `source_col` and writing `indexed_col`."""
    return StringIndexer(inputCol=source_col, outputCol=indexed_col)


# One indexer per string-valued feature column.
nameInd = _string_indexer("name", "nameInd")
posInd = _string_indexer("position", "posInd")
roofInd = _string_indexer("roof", "roofInd")
surfInd = _string_indexer("surface", "surfInd")
stadInd = _string_indexer("stadium_city", "stadInd")
teamInd = _string_indexer("teamid", "teamInd")
oppInd = _string_indexer("away_team", "oppInd")

In [21]:
from pyspark.ml.feature import OneHotEncoder


def _one_hot_encoder(indexed_col, encoded_col):
    """Return a OneHotEncoder reading `indexed_col` and writing `encoded_col`."""
    return OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col)


# One encoder per indexed column produced by the string indexers.
nameEnc = _one_hot_encoder("nameInd", "nameEnc")
posEnc = _one_hot_encoder("posInd", "posEnc")
roofEnc = _one_hot_encoder("roofInd", "roofEnc")
surfEnc = _one_hot_encoder("surfInd", "surfEnc")
stadEnc = _one_hot_encoder("stadInd", "stadEnc")
teamEnc = _one_hot_encoder("teamInd", "teamEnc")
oppEnc = _one_hot_encoder("oppInd", "oppEnc")

In [17]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Encoded categorical columns plus the numeric columns, in the order they are
# packed into the single "features" vector.
feature_columns = [
    "nameEnc", "posEnc", "roofEnc", "surfEnc", "stadEnc",
    "temp", "wspd", "ishome",
    "teamEnc", "oppEnc",
]
vecAss = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [18]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor predicting "rushing_yds" from the assembled "features" vector.
# A fixed seed makes the per-tree sampling repeatable, so re-running the notebook
# yields the same forest for the same input data.
rf = RandomForestRegressor(featuresCol="features", labelCol="rushing_yds", seed=42)

In [22]:
from pyspark.ml import Pipeline

# Each string column is indexed and then one-hot encoded; the assembler and the
# regressor come last.
pipeline_stages = [
    nameInd, nameEnc,
    posInd, posEnc,
    roofInd, roofEnc,
    surfInd, surfEnc,
    stadInd, stadEnc,
    teamInd, teamEnc,
    oppInd, oppEnc,
    vecAss,
    rf,
]
pipeline = Pipeline(stages=pipeline_stages)

In [17]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
import datetime

# Hyper-parameter grid for the random forest: 3 x 3 x 3 = 27 combinations.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxBins, [40, 60, 80])
             .addGrid(rf.maxDepth, [16, 20, 25])
             .addGrid(rf.numTrees, [80, 100, 120])
             .build())

# Candidates are ranked by RMSE between "rushing_yds" and "prediction".
evaluator = RegressionEvaluator(labelCol="rushing_yds", predictionCol="prediction", metricName="rmse")

# 5-fold cross-validation over the whole pipeline. The seed fixes the fold
# assignment so the search is repeatable across runs.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

# Print wall-clock timestamps around the fit; every grid point is trained once per fold.
print(datetime.datetime.now())
cvModel = crossval.fit(morePlayers)
print(datetime.datetime.now())

2017-08-30 16:27:50.974257
2017-08-30 18:32:24.184197


In [18]:
# Score the best cross-validated pipeline on `morePlayers`, the same frame it was
# fitted on. This is an in-sample (training) error, not a held-out test error,
# so it is an optimistic estimate of how the model generalises.
rmse = evaluator.evaluate(cvModel.bestModel.transform(morePlayers))
print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)
# 28.8953 -- figure noted by the original author; its provenance is not shown here (TODO confirm).

Root Mean Squared Error (RMSE) on test data = 22.527


In [19]:
# The last pipeline stage is the fitted random forest; read the selected
# hyper-parameters from its underlying JVM object.
best_rf_java = cvModel.bestModel.stages[-1]._java_obj
print(best_rf_java.getMaxBins())   # 60!
print(best_rf_java.getMaxDepth())  # 25-->  (presumably: the largest value in the grid)
print(best_rf_java.getNumTrees())  # <--80  (presumably: the smallest value in the grid)

60
25
80


In [23]:
# Keep only the model inputs and the label.
# Features: temp, wspd, roof, surface, ishome, stadium_city, away_team, name, position, teamid
# Label: rushing_yds
model_columns = ['temp', 'wspd', 'roof', 'surface', 'ishome', 'stadium_city',
                 'away_team', 'name', 'position', 'teamid', 'rushing_yds']
df = df.select(*model_columns)

In [66]:
# Show column names, types and nullability of the current frame.
df.printSchema()

root
 |-- temp: integer (nullable = true)
 |-- wspd: integer (nullable = true)
 |-- roof: string (nullable = true)
 |-- surface: string (nullable = true)
 |-- ishome: integer (nullable = true)
 |-- stadium_city: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- name: string (nullable = true)
 |-- position: string (nullable = true)
 |-- teamid: string (nullable = true)
 |-- rushing_yds: integer (nullable = true)



In [24]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Final regressor with fixed hyper-parameters (maxBins=60, maxDepth=25, numTrees=80).
# The seed makes the per-tree sampling repeatable, so re-running this cell on the
# same data produces the same model.
rf = RandomForestRegressor(featuresCol="features", labelCol="rushing_yds",
                           maxBins=60, maxDepth=25, numTrees=80, seed=42)

# Same stage order as before: index + one-hot encode each string column,
# assemble the feature vector, then fit the forest.
pipeline = Pipeline(stages=[nameInd, nameEnc, posInd, posEnc, roofInd, roofEnc, surfInd, surfEnc,
                            stadInd, stadEnc, teamInd, teamEnc, oppInd, oppEnc, vecAss, rf])

# NOTE(review): this fits on `df`, not on the `morePlayers` subset used for the
# cross-validated search -- confirm that is intended.
model = pipeline.fit(df)

In [25]:
# Run the fitted pipeline over the same frame it was trained on and show
# the first ten (actual, predicted) pairs.
predictions = model.transform(df)
actual_vs_predicted = predictions.select("rushing_yds", "prediction")
actual_vs_predicted.take(10)

[Row(rushing_yds=43, prediction=35.29881331699347),
 Row(rushing_yds=95, prediction=73.93927505216902),
 Row(rushing_yds=2, prediction=5.734204333043673),
 Row(rushing_yds=1, prediction=29.297084212238115),
 Row(rushing_yds=43, prediction=34.98976840357355),
 Row(rushing_yds=46, prediction=49.27254255597158),
 Row(rushing_yds=37, prediction=32.93142857142857),
 Row(rushing_yds=9, prediction=30.32950617310642),
 Row(rushing_yds=5, prediction=6.005865968084333),
 Row(rushing_yds=25, prediction=14.34627268905398)]

In [29]:
from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact

ml_repository_client = MLRepositoryClient()

# Wrap the fitted pipeline, together with the frame it was trained on,
# as a named artifact and save it through the repository client.
artifact_name = "Rushing Yards Prediction Model"
model_artifact = MLRepositoryArtifact(model, training_data=df, name=artifact_name)

saved_model = ml_repository_client.models.save(model_artifact)

In [30]:
# Print the metadata property names of the saved model, a blank line,
# then a few selected properties.
model_meta = saved_model.meta
print(model_meta.available_props())
print("")
print("modelType: " + model_meta.prop("modelType"))
print("trainingDataSchema: " + str(model_meta.prop("trainingDataSchema")))
print("creationTime: " + str(model_meta.prop("creationTime")))
print("modelVersionHref: " + model_meta.prop("modelVersionHref"))
print("label: " + model_meta.prop("label"))

['inputDataSchema', 'evaluationMetrics', 'pipelineVersionHref', 'modelVersionHref', 'trainingDataRef', 'pipelineType', 'creationTime', 'lastUpdated', 'label', 'authorEmail', 'trainingDataSchema', 'authorName', 'version', 'modelType', 'runtime', 'evaluationMethod']

modelType: sparkml-model-2.0
trainingDataSchema: {u'fields': [{u'nullable': True, u'type': u'integer', u'name': u'temp', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'wspd', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'roof', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'surface', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'ishome', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'stadium_city', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'away_team', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'name', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'n

In [31]:
# Fetch the artifact back from the repository by the saved model's uid
# and print its name.
saved_uid = saved_model.uid
loadedModelArtifact = ml_repository_client.models.get(saved_uid)

print(str(loadedModelArtifact.name))

Rushing Yards Prediction Model


In [47]:
# Imported in this cell so it runs on a fresh kernel without depending on
# another cell having imported `requests` first.
import requests

# List the deployments exposed by the service.
service_path = "https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443"
online_path = service_path + "/v2/deployments"

response = requests.get(online_path)

# Print the response object first, then the raw body.
print(response)
print(response.text)

<Response [200]>
{"resources":[{"metadata":{"guid":"79a22c1b-8d60-4e6a-bd4b-59a760f8716d","href":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d","createdAt":"2017-09-06T20:46:46.155Z"},"entity":{"name":"Rushing","artifactVersion":{"guid":"e4379816-c9b3-4ea8-8d00-67418ca9e896","href":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/artifacts/models/164e74d7-a829-40a0-9791-333cbd8e59f6/versions/e4379816-c9b3-4ea8-8d00-67418ca9e896"},"predictionEndpoints":{"online":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/scoring/online/79a22c1b-8d60-4e6a-bd4b-59a760f8716d","stream":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d/streams","batch":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d/batch_jobs"},"modelType":"sparkml-mode

In [80]:
# Scratch cell: GETs the deployments collection, then POSTs a one-record scoring
# payload to the same URL. The commented-out lines are earlier attempts kept by the author.
import requests

# Earlier candidate endpoints (disabled).
#scoring_url = 'https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/artifacts/models/164e74d7-a829-40a0-9791-333cbd8e59f6/versions/e4379816-c9b3-4ea8-8d00-67418ca9e896'
#scoring_url = 'https://169.60.7.74:12443/v2/artifacts/models/164e74d7-a829-40a0-9791-333cbd8e59f6/versions/e4379816-c9b3-4ea8-8d00-67418ca9e896'
#scoring_url = 'https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/scoring/online/79a22c1b-8d60-4e6a-bd4b-59a760f8716d'
#scoring_url = 'https://169.60.7.74:12443/v2/identity/token/scoring/online/79a22c1b-8d60-4e6a-bd4b-59a760f8716d'
# NOTE(review): despite its name, scoring_url below is the deployments collection
# ("/v2/deployments"), not a ".../scoring/online/<guid>" endpoint.
service_path = "https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443"
scoring_url = service_path + "/v2/deployments"

# Earlier payload shape (disabled): used "values" and the boolean 'home' column.
#payload_scoring = {"fields": ['temp','wspd','roof','surface','home','stadium_city','away_team','name','position','teamid'],
#                    "values": [[60,10,"Open","Voyager Bermuda Grass",True,'Bank of America Stadium','ARI','J.Stewart','QB','CAR']]}
# Active payload: a single record whose values line up positionally with 'fields';
# uses the integer 'ishome' column.
payload_scoring = {'fields': ['temp','wspd','roof','surface','ishome','stadium_city','away_team','name','position','teamid'],
                    'records': [[60,10,'Open','Voyager Bermuda Grass',1,'Bank of America Stadium','ARI','J.Stewart','QB','CAR']]}


# `requests` is imported a second time here; urllib3 is referenced only by the disabled code below.
import urllib3, requests, json

# NOTE(review): the disabled lines below look like service credentials committed to
# the notebook -- confirm, rotate them if real, and remove them from the source.
#service_path = 'https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443'
#instance_id = 'e43f71e5-f451-4b26-8476-2c42e336d67e'
#username = '7efdfbec-da70-43cf-b4ce-3332be17d48d'
#password = '37e5c543-837f-4f90-a854-1a388b62b27d'

# Disabled token-based authentication flow.
#headers = urllib3.util.make_headers(basic_auth='{}:{}'.format("username", "password"))
#url = '{}/v2/identity/token'.format(service_path)

#response = requests.get(scoring_url, headers=headers)
# Unauthenticated GET of the deployments collection; the body is printed as-is.
response = requests.get(scoring_url)
print response.text
#mltoken = json.loads(response.text).get('token')

#print mltoken
# NOTE(review): "name" is sent as an HTTP header here; the Authorization header is disabled.
header = {'Content-Type': 'application/json',"name":"Rushing"}#, 'Authorization': 'Bearer ' + mltoken}

# NOTE(review): this POSTs the scoring payload to the deployments collection URL,
# whereas the notebook's later scoring call targets ".../v2/scoring/online/<guid>".
# What this request is meant to do cannot be determined from here -- confirm or remove.
response_scoring = requests.post(scoring_url, json=payload_scoring, headers=header)

print response_scoring.text

{"resources":[{"metadata":{"guid":"79a22c1b-8d60-4e6a-bd4b-59a760f8716d","href":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d","createdAt":"2017-09-06T20:46:46.155Z"},"entity":{"name":"Rushing","artifactVersion":{"guid":"e4379816-c9b3-4ea8-8d00-67418ca9e896","href":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/artifacts/models/164e74d7-a829-40a0-9791-333cbd8e59f6/versions/e4379816-c9b3-4ea8-8d00-67418ca9e896"},"predictionEndpoints":{"online":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/scoring/online/79a22c1b-8d60-4e6a-bd4b-59a760f8716d","stream":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d/streams","batch":"https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/deployments/79a22c1b-8d60-4e6a-bd4b-59a760f8716d/batch_jobs"},"modelType":"sparkml-model-2.0","runtimeEn

In [82]:
# POST the scoring payload to the deployment's online scoring endpoint
# and print the raw response body.
scoring_href = 'https://internal-nginx-svc.ibm-private-cloud.svc.cluster.local:12443/v2/scoring/online/79a22c1b-8d60-4e6a-bd4b-59a760f8716d'
header = {
    'Content-Type': 'application/json',
    "name": "Rushing",
}
# JSON text of the payload; the request below does not use it (it passes the dict via `json=`).
new = json.dumps(payload_scoring)
res = requests.post(scoring_href, json=payload_scoring, headers=header)

print(res.text)

{
  "fields": ["temp", "wspd", "roof", "surface", "ishome", "stadium_city", "away_team", "name", "position", "teamid", "nameInd", "nameEnc", "posInd", "posEnc", "roofInd", "roofEnc", "surfInd", "surfEnc", "stadInd", "stadEnc", "teamInd", "teamEnc", "oppInd", "oppEnc", "features", "prediction"],
  "records": [[60, 10, "Open", "Voyager Bermuda Grass", 1, "Bank of America Stadium", "ARI", "J.Stewart", "QB", "CAR", 22.0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0