<img src="http://milepro.com/wp-content/uploads/2014/01/Travel-Credit-Cards-1024x606.jpg" style="width:200px; float: left; padding-right: 10px"/>
<h2 style="font-face: verdana; font-size: 32px;">Predict credit card customer churn<br>with IBM Watson Machine Learning</h2>
<h3 style="font-face: verdana; font-size: 16px;">Part 2: Deploy Churn Model</h3>




### 1. Load the data into a dataframe ##
-------------------------------------
<p>In this section you will load the data as an Apache® Spark DataFrame and perform a basic exploration.</p>
<p>Load the data to the Spark DataFrame by using wget to upload the data to gpfs and then read method.</p>

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

<p>The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.</p>

In [3]:
# @hidden_cell
from projectnb import ProjectContext, ProjectUtil
pc = ProjectContext.ProjectContext(sc, "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", "p-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

<div class="alert alert-block alert-info"> Note: When creating the project context, use the local spark context (scl) created above instead of the default spark context (sc).</div> 
Generate a token using the Insert Token option (click vertical ellipses button), hten replace the credentials in the cell above)



### 1.1 Load TEST_SUM.csv from IBM Bluemix Object Store ###

In [4]:
from io import StringIO
import requests
import json
import pandas as pd
import numpy as np

In [5]:
import ibmos2spark

# @hidden_cell
credentials = {
    'auth_url': 'https://identity.open.softlayer.com',
    'project_id': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
    'region': 'dallas',
    'user_id': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
    'username': 'member_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
    'password': 'xxxxxxxxxxxxxxxx'
}

configuration_name = 'os_22d1583b68424f2e9e8cbd924062b554_configs'
bmos = ibmos2spark.bluemix(sc, credentials, configuration_name)

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true').option('inferSchema','true')\
  .load(bmos.url('BankingChurn', 'TEST_SUM.csv'))
df.take(5)

[Row(CUST_ID=1009530860, SEX=u'F', AGE=84, EDUCATION=2, INVESTMENT=114368, INCOME=3852862, ACTIVITY=5, CHURN=0, YRLY_AMT=700259.0, AVG_DAILY_TX=0.917808, YRLY_TX=335, AVG_TX_AMT=2090.32, NEGTWEETS=3, STATE=u'TX', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=False, INSERT_TIME=datetime.datetime(2017, 2, 9, 11, 0)),
 Row(CUST_ID=1009544000, SEX=u'F', AGE=44, EDUCATION=2, INVESTMENT=90298, INCOME=3849843, ACTIVITY=1, CHURN=0, YRLY_AMT=726977.0, AVG_DAILY_TX=0.950685, YRLY_TX=347, AVG_TX_AMT=2095.04, NEGTWEETS=10, STATE=u'CA', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=False, INSERT_TIME=datetime.datetime(2017, 2, 9, 11, 0)),
 Row(CUST_ID=1009534260, SEX=u'F', AGE=23, EDUCATION=2, INVESTMENT=94881, INCOME=3217364, ACTIVITY=1, CHURN=1, YRLY_AMT=579084.0, AVG_DAILY_TX=0.920548, YRLY_TX=336, AVG_TX_AMT=1723.46, NEGTWEETS=5, STATE=u'CA', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=True, INSERT_TIME=datetime.datetime(2017, 2, 9, 11, 0)),
 Ro



### 1.2 Select Churn Data for the Model ###
<p>Select AGE, ACTIVITY, EDUCATION, SEX, STATE, NEGTWEETS, INCOME, CHURN from the churnDataRaw dataframe.</p>

In [6]:
churnData = df.select('AGE', 'ACTIVITY', 'EDUCATION', 'SEX', 'STATE', 'NEGTWEETS', 'INCOME', 'CHURN')
churnData.head()

Row(AGE=84, ACTIVITY=5, EDUCATION=2, SEX=u'F', STATE=u'TX', NEGTWEETS=3, INCOME=3852862, CHURN=0)




## 2. Create an Apache Spark machine learning model ##
-------------------------------------
<p>Prepare data, create an Apache Spark machine learning pipeline, and train a model.</p>




### 2.1 Prepare the Data ###
<p>In this subsection you will split your data into: train, test and predict datasets.</p>

In [7]:
trainDF, validateDF, testDF = df.randomSplit([.7,.15,.15])
trainDF.head()

Row(CUST_ID=1009520370, SEX=u'F', AGE=63, EDUCATION=4, INVESTMENT=0, INCOME=13035, ACTIVITY=3, CHURN=0, YRLY_AMT=6339.0703, AVG_DAILY_TX=0.663014, YRLY_TX=242, AVG_TX_AMT=26.194504, NEGTWEETS=2, STATE=u'MD', EDUCATION_GROUP=u'Associate degree', TwitterID=0, CHURN_LABEL=False, INSERT_TIME=datetime.datetime(2017, 2, 9, 16, 30))

In [8]:
testDF.head()

Row(CUST_ID=1009520430, SEX=u'M', AGE=48, EDUCATION=2, INVESTMENT=11258, INCOME=20142, ACTIVITY=1, CHURN=1, YRLY_AMT=8800.02, AVG_DAILY_TX=0.654795, YRLY_TX=239, AVG_TX_AMT=36.820206, NEGTWEETS=10, STATE=u'MD', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=True, INSERT_TIME=datetime.datetime(2017, 2, 9, 14, 0))

In [9]:
validateDF.head()

Row(CUST_ID=1009520540, SEX=u'M', AGE=57, EDUCATION=2, INVESTMENT=5111, INCOME=22438, ACTIVITY=1, CHURN=1, YRLY_AMT=10375.398, AVG_DAILY_TX=0.687671, YRLY_TX=251, AVG_TX_AMT=41.336395, NEGTWEETS=13, STATE=u'ID', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=True, INSERT_TIME=datetime.datetime(2017, 2, 9, 13, 0))




### 2.2 Create pipeline and train a model ###
<p>In this section you will create an Apache® Spark machine learning pipeline and then train the model.</p>

In [10]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [11]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

genderIndexer = StringIndexer(inputCol="SEX", outputCol="gender_code")
stateIndexer = StringIndexer(inputCol="STATE", outputCol="state_code")
featuresAssembler = VectorAssembler(
    inputCols=["AGE", "ACTIVITY","EDUCATION","NEGTWEETS","INCOME","gender_code","state_code"],
    outputCol="features")

<p>Next, define estimators you want to use for classification. Logistics Regression is used in the following example.</p>

In [12]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, labelCol="CHURN", featuresCol="features")

<p>Setup a Cognitive Assistant for Data Scientists - predict model performance based on sampled data</p>

In [13]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [14]:
pipeline = Pipeline(stages=[genderIndexer, stateIndexer, featuresAssembler, lr])
model = pipeline.fit(trainDF)

<p>You can check your model accuracy now. To evaluate the model, use test data.</p>

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(testDF)
evaluatorRF = MulticlassClassificationEvaluator(labelCol="CHURN", predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions)
print "Accuracy = " + str(accuracy)
print "Test Error = " + str(1.0 - accuracy)

Accuracy = 0.932222222222
Test Error = 0.0677777777778


In [16]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

In [17]:
predictionsAndLabels = predictions.rdd.map(lambda r: (float(r.prediction),float(r['CHURN'])))
predictionsAndLabels.take(2)

[(1.0, 1.0), (0.0, 0.0)]

In [18]:
# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionsAndLabels)

In [19]:
# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under PR = 0.886685141641
Area under ROC = 0.908466158466



<p>Create a true positive and false positive rate</p>

In [20]:
import numpy as np
pR = [(1.0,1.0)]
for i in np.arange(0.0, 1.01, 0.01):
    preds = predictions.rdd.map(lambda r: (float(r['probability'][1]>i),float(r['CHURN']),r['probability']))
    tp = float(preds.filter(lambda r: r[0] == 1.0 and r[1] == 1.0).count())
    fp = float(preds.filter(lambda r: r[0] == 1.0 and r[1] == 0.0).count())
    fn = float(preds.filter(lambda r: r[0] == 0.0 and r[1] == 1.0).count())
    tn = float(preds.filter(lambda r: r[0] == 0.0 and r[1] == 0.0).count())
    if tp == 0.0 and fp == 0.0:
        pR.append((0.0,0.0))
    else:
        recall = tp/(tp+fn)
        falsePosRate = fp/(fp+tn)
        pR.append((falsePosRate,recall))
pR.append((0.0,0.0))


<p>Load the rocCurve into a dataframe</p>

In [21]:
import seaborn as sns
import pandas as pd
%matplotlib inline
rocDF = pd.DataFrame(pR)
rocDF = rocDF.rename(index=str, columns={0: "FalsePositiveRate", 1: "TruePositiveRate"})

<p>Display the models ROC curve on the Brunel chart setting the "False Positive Rate" and "True Positive Rate"</p>

In [22]:
import brunel
%brunel data('rocDF') x(FalsePositiveRate) y(TruePositiveRate) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid) title('ROC')

<IPython.core.display.Javascript object>




## 3. Watson Machine Learning - Use Repository service to save model. ##
-------------------------------------


### 3.1 Set credentials to the Watson Machine Learning Deployments ##
-------------------------------------
<p>service_path = "https://ibm-watson-ml.mybluemix.net"<br>
instance_id = "xxxx"<br>
username = "xxxx"<br>
password = "xxxx"</p>

In [23]:
from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact
from repository.mlrepository import MetaProps, MetaNames

In [24]:
# @hidden_cell
service_path = "https://ibm-watson-ml.mybluemix.net"
instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
username = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
password = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

<p>Secure a connection to the repository and add author information for model</p>

In [25]:
ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

In [26]:
model_artifact = MLRepositoryArtifact(model, training_data=trainDF, name="Credit Card Churn Model")

saved_model = ml_repository_client.models.save(model_artifact)

print saved_model.meta.available_props()
print
print "modelType: " + saved_model.meta.prop("modelType")
print "trainingDataSchema: " + str(saved_model.meta.prop("trainingDataSchema"))
print "creationTime: " + str(saved_model.meta.prop("creationTime"))
print "modelVersionHref: " + saved_model.meta.prop("modelVersionHref")
print "label: " + saved_model.meta.prop("label")

['inputDataSchema', 'evaluationMetrics', 'pipelineVersionHref', 'modelVersionHref', 'trainingDataRef', 'pipelineType', 'creationTime', 'lastUpdated', 'label', 'authorEmail', 'trainingDataSchema', 'authorName', 'version', 'modelType', 'runtime', 'evaluationMethod']

modelType: sparkml-model-2.1
trainingDataSchema: {u'fields': [{u'nullable': True, u'type': u'integer', u'name': u'CUST_ID', u'metadata': {}}, {u'nullable': True, u'type': u'string', u'name': u'SEX', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'AGE', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'EDUCATION', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'INVESTMENT', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'INCOME', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'ACTIVITY', u'metadata': {}}, {u'nullable': True, u'type': u'integer', u'name': u'CHURN', u'metadata': {}}, {u'nullable': True, u'type': u'double'

<div class="alert alert-block alert-info"> Tip: modelVersionHref is our model unique indentifier in the Watson Machine Learning repository.</div> 

In [27]:
saved_model = ml_repository_client.models.save(model_artifact)

<p>Get saved model metadata from Watson Machine Learning.</p>

In [28]:
saved_model.meta.available_props()

['inputDataSchema',
 'evaluationMetrics',
 'pipelineVersionHref',
 'modelVersionHref',
 'trainingDataRef',
 'pipelineType',
 'creationTime',
 'lastUpdated',
 'label',
 'authorEmail',
 'trainingDataSchema',
 'authorName',
 'version',
 'modelType',
 'runtime',
 'evaluationMethod']




### 3.2 Load model and make predictions ##
-------------------------------------

In [29]:
loadedModelArtifact = ml_repository_client.models.get(saved_model.uid)

In [30]:
print(loadedModelArtifact.name)
print(saved_model.uid)

Credit Card Churn Model
e8c6bd36-6635-401e-a5f1-f40d9ad111af


In [32]:
predictions = loadedModelArtifact.model_instance().transform(validateDF)


In [34]:
predictions.select("AGE","ACTIVITY","EDUCATION","SEX","STATE","NEGTWEETS","INCOME","CHURN").show()


+---+--------+---------+---+-----+---------+------+-----+
|AGE|ACTIVITY|EDUCATION|SEX|STATE|NEGTWEETS|INCOME|CHURN|
+---+--------+---------+---+-----+---------+------+-----+
| 57|       1|        2|  M|   ID|       13| 22438|    1|
| 41|       2|        4|  M|   KS|        4| 17312|    0|
| 53|       3|        4|  F|   ID|       10| 25854|    0|
| 66|       4|        1|  M|   VA|        3| 19061|    0|
| 35|       0|        2|  F|   MI|       11| 20414|    1|
| 70|       1|        1|  F|   NY|        4| 13770|    0|
| 73|       1|        2|  F|   OR|        5|114646|    1|
| 27|       1|        4|  F|   WV|        3| 17830|    0|
| 42|       4|        4|  F|   IN|        4| 45292|    0|
| 27|       3|        2|  M|   WV|        2| 95416|    0|
| 38|       2|        4|  F|   FL|       10| 21987|    0|
| 75|       0|        1|  M|   MD|       10| 19014|    0|
| 68|       0|        1|  F|   AR|       10| 13616|    0|
| 64|       2|        1|  F|   DE|        3| 14679|    0|
| 30|       2|


<p>Generate an access token to work with the Watson Machine Learning API</p>

In [38]:
import urllib3, requests, json

headers = urllib3.util.make_headers(basic_auth='{}:{}'.format(username, password))
url = '{}/v3/identity/token'.format(service_path)
response = requests.get(url, headers=headers)
wmltoken = 'Bearer ' + json.loads(response.text).get('token')




### 3.3 Get a WML response instance from Watson Machine Learning API ##
-------------------------------------

In [39]:
endpoint_instance = service_path + "/v3/wml_instances/" + instance_id
header = {'Content-Type': 'application/json', 'Authorization': wmltoken}

response_get_instance = requests.get(endpoint_instance, headers=header)
print response_get_instance
print response_get_instance.text

<Response [200]>
{"metadata":{"guid":"ac911531-f2b4-448d-8a97-e62041d451ee","url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee","created_at":"2017-08-02T17:29:49.584Z","modified_at":"2017-10-18T20:52:25.568Z"},"entity":{"source":"Bluemix","published_models":{"url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models"},"usage":{"expiration_date":"2017-11-01T00:00:00.000Z","computation_time":{"limit":18000,"current":0},"model_count":{"limit":200,"current":1},"prediction_count":{"limit":5000,"current":0},"deployment_count":{"limit":5,"current":-1}},"plan_id":"3f6acf43-ede8-413a-ac69-f8af3bb0cbfe","status":"Active","organization_guid":"17fcb73f-f48a-461d-bc35-3be8ff80d7ca","region":"us-south","account":{"id":"ef1045312d844c056ae7045756aafc47","name":"IBM","type":"PAYG"},"owner":{"user_id":"756ddd32-7fa8-4fba-a7c5-b25949939ac4","email":"natop@us.ibm.com","country_code":"USA","beta_user":true


<p>Find the deployed Models and create an access url</p>

In [40]:
endpoint_published_models = json.loads(response_get_instance.text).get('entity').get('published_models').get('url')
print endpoint_published_models


https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models


<p>Get a list of the published wml models</p>

In [42]:
header = {'Content-Type': 'application/json', 'Authorization': wmltoken}

response_get = requests.get(endpoint_published_models, headers=header)
print response_get
print response_get.text

<Response [200]>
{"count":5,"resources":[{"metadata":{"guid":"505c71c7-9a78-4f04-970c-f25c2afe1a6f","url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models/505c71c7-9a78-4f04-970c-f25c2afe1a6f","created_at":"2017-10-18T19:48:48.993Z","modified_at":"2017-10-18T19:48:49.243Z"},"entity":{"runtime_environment":"spark-2.0","author":{},"name":"Credit Card Churn Model","label_col":"CHURN","training_data_schema":{"fields":[{"metadata":{},"type":"integer","name":"CUST_ID","nullable":true},{"metadata":{},"type":"string","name":"SEX","nullable":true},{"metadata":{},"type":"integer","name":"AGE","nullable":true},{"metadata":{},"type":"integer","name":"EDUCATION","nullable":true},{"metadata":{},"type":"integer","name":"INVESTMENT","nullable":true},{"metadata":{},"type":"integer","name":"INCOME","nullable":true},{"metadata":{},"type":"integer","name":"ACTIVITY","nullable":true},{"metadata":{},"type":"integer","name":"CHURN","nullable":true},{

<p>Get the published model deployment URL</p>

In [None]:
[endpoint_deployments] = [x.get('entity').get('deployments').get('url') for x in json.loads(response_get.text).get('resources') if x.get('metadata').get('guid') == saved_model.uid]

print endpoint_deployments




### 3.4 Create an Online Deployment for the Model ##
-------------------------------------

In [45]:
payload_online = {"name": "Product Line Prediction", "description": "My Cool Deployment", "type": "online"}
response_online = requests.post(endpoint_deployments, json=payload_online, headers=header)

print response_online
print response_online.text

<Response [201]>
{"metadata":{"guid":"43671c6d-0e65-4a1e-aa4b-eacc99c789ee","url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models/e8c6bd36-6635-401e-a5f1-f40d9ad111af/deployments/43671c6d-0e65-4a1e-aa4b-eacc99c789ee","created_at":"2017-10-18T21:03:32.792Z","modified_at":"2017-10-18T21:03:34.213Z"},"entity":{"runtime_environment":"spark-2.1","name":"Product Line Prediction","scoring_url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models/e8c6bd36-6635-401e-a5f1-f40d9ad111af/deployments/43671c6d-0e65-4a1e-aa4b-eacc99c789ee/online","description":"My Cool Deployment","published_model":{"author":{},"name":"Credit Card Churn Model","url":"https://ibm-watson-ml.mybluemix.net/v3/wml_instances/ac911531-f2b4-448d-8a97-e62041d451ee/published_models/e8c6bd36-6635-401e-a5f1-f40d9ad111af","guid":"e8c6bd36-6635-401e-a5f1-f40d9ad111af","created_at":"2017-10-18T21:03:32.765Z"},"model_typ

In [None]:
scoring_url = json.loads(response_online.text).get('entity').get('scoring_url')
print scoring_url




## 4. Create online scoring endpoint ##
-------------------------------------

<p>Create the playload_scoring json for the model</p>

In [51]:
payload_scoring = {"fields": ["AGE","ACTIVITY","EDUCATION","SEX","STATE","NEGTWEETS","INCOME","CHURN"],"values": [[41,1,4,"M","TX",4,200000,0]]}

print payload_scoring

{'fields': ['AGE', 'ACTIVITY', 'EDUCATION', 'SEX', 'STATE', 'NEGTWEETS', 'INCOME', 'CHURN'], 'values': [[41, 1, 4, 'M', 'TX', 4, 200000, 0]]}


In [50]:
response_scoring = requests.post(scoring_url, json=payload_scoring, headers=header)

print response_scoring.text

{
  "fields": ["AGE", "ACTIVITY", "EDUCATION", "SEX", "STATE", "NEGTWEETS", "INCOME", "CHURN", "gender_code", "state_code", "features", "rawPrediction", "probability", "prediction"],
  "values": [[41, 1, 4, "M", "TX", 4, 200000, 0, 0.0, 14.0, [41.0, 1.0, 4.0, 4.0, 200000.0, 0.0, 14.0], [1.9272283528574985, -1.9272283528574985], [0.8729423229726789, 0.12705767702732115], 0.0]]
}
