# Process leading to model evaluation

## Since splitting process in separate notebooks for ETL, feature creation, model definition, model train and model evaluation would reduce readability of the code, they will be presented in one notebook with comments explaining which part of the process is currently presented.



 




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
import pyspark.sql.functions as psqlf
from pyspark.ml.feature import VectorAssembler, Normalizer, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact
from repository.mlrepository import MetaProps
import urllib3, requests, json, base64, ibm_boto3
from ibm_botocore.client import Config 

Using TensorFlow backend.


### Data load

In [2]:
df_1 = pd.read_csv('~/notebook/work/PS_20174392719_1491204439457_log.csv')

In [3]:
len(df_1)

6362620

Function to make stratified sample of dataset which is too big for free IBM cloud instance

In [4]:
def fold_ctrl (fun_df, folds_no):
    fun_no_of_rows = len(fun_df)
    np.random.seed(23434)
    vect = np.random.choice(range(folds_no), size = fun_no_of_rows , p=np.full((1, folds_no), 1/folds_no)[0])
    return pd.DataFrame(vect)

In [5]:
df_1 = pd.concat([df_1.groupby(['isFraud', 'step' , 'type']).apply(lambda x:fold_ctrl (x , 120  ) ).reset_index(drop=True).rename({0:'folds'}, axis=1) ,
    df_1], axis=1)

### Features engineering

Following data exploration, following features are proposed.

Features are created before load to Spark, because it looks like Keras2DML is not supporting multi-layer perception neural networks (question asked in the forum without feedback at the time of submission), therefore neural network will be trained and evaluated locally


In [6]:
df_1 = df_1.assign(hour= df_1.step % 24)\
    .assign(origNameChar= df_1["nameOrig"].str.get(0)) \
    .assign(origNameLen= df_1["nameOrig"].str.len()) \
    .assign(destNameChar= df_1["nameDest"].str.get(0)) \
    .assign(destNameLen= df_1["nameDest"].str.len()) \
    .assign(destbalDiff= df_1["newbalanceDest"] - df_1["oldbalanceDest"] - df_1["amount"]) \
    .assign(orgbalDiff= df_1["oldbalanceOrg"] - df_1["newbalanceOrig"] - df_1["amount"])#.head()#groupby('hour').count()

In [7]:
df_1.head()

Unnamed: 0,folds,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,hour,origNameChar,origNameLen,destNameChar,destNameLen,destbalDiff,orgbalDiff
0,110,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1,C,11,M,11,-9839.64,1.455192e-11
1,88,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1,C,11,M,11,-1864.28,-1.136868e-12
2,20,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,1,C,11,C,10,-181.0,0.0
3,110,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,1,C,10,C,9,-21363.0,0.0
4,6,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1,C,11,M,11,-11668.14,0.0


### Start and load data to Spark

In [8]:
spark = SparkSession\
    .builder\
    .getOrCreate()

In [9]:
spark

<pyspark.sql.session.SparkSession at 0x7f30324cf588>

In [10]:
data_subset_ctr_string = "folds < 4"
train_set_ctrl_string = "folds < 3"
test_set_ctrl_string = "folds == 3"
spark_df1 = spark.createDataFrame(df_1.query(data_subset_ctr_string))

In [11]:
spark_df1

DataFrame[folds: bigint, step: bigint, type: string, amount: double, nameOrig: string, oldbalanceOrg: double, newbalanceOrig: double, nameDest: string, oldbalanceDest: double, newbalanceDest: double, isFraud: bigint, isFlaggedFraud: bigint, hour: bigint, origNameChar: string, origNameLen: bigint, destNameChar: string, destNameLen: bigint, destbalDiff: double, orgbalDiff: double]

In [12]:
spark_df1.take(2)

[Row(folds=0, step=1, type='PAYMENT', amount=4206.84, nameOrig='C215078753', oldbalanceOrg=0.0, newbalanceOrig=0.0, nameDest='M1757317128', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0, hour=1, origNameChar='C', origNameLen=10, destNameChar='M', destNameLen=11, destbalDiff=-4206.84, orgbalDiff=-4206.84),
 Row(folds=0, step=1, type='PAYMENT', amount=24213.67, nameOrig='C1238616099', oldbalanceOrg=0.0, newbalanceOrig=0.0, nameDest='M70695990', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0, hour=1, origNameChar='C', origNameLen=11, destNameChar='M', destNameLen=9, destbalDiff=-24213.67, orgbalDiff=-24213.67)]

### Spark ETL pipeline

In [17]:
stringIndexer = StringIndexer(inputCol="type", outputCol="type_indexed")

In [18]:
ohe = OneHotEncoder(inputCol="type_indexed", outputCol="type_ohe")

In [19]:
vectorAssembler = VectorAssembler(inputCols=["type_ohe", "amount", "oldbalanceOrg","newbalanceOrig", "oldbalanceDest", "newbalanceDest"],
                                  outputCol="features")

In [20]:
normalizer = Normalizer(p=2.0, inputCol="features", outputCol="features_norm")

In [21]:
pipeline = Pipeline(stages=[stringIndexer, ohe, vectorAssembler, normalizer])
model_pipeline = pipeline.fit(spark_df1)

In [22]:
transformedData  = model_pipeline.transform(spark_df1).drop('type_indexed','type_ohe')
transformedData.show()

+-----+----+--------+----------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+----+------------+-----------+------------+-----------+------------------+--------------------+--------------------+--------------------+
|folds|step|    type|    amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|hour|origNameChar|origNameLen|destNameChar|destNameLen|       destbalDiff|          orgbalDiff|            features|       features_norm|
+-----+----+--------+----------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+----+------------+-----------+------------+-----------+------------------+--------------------+--------------------+--------------------+
|    0|   1| PAYMENT|   4206.84| C215078753|          0.0|           0.0|M1757317128|           0.0|           0.0|      0|             0|   1|           C|         10|    

In [24]:
transformedData.take(2)

[Row(folds=0, step=1, type='PAYMENT', amount=4206.84, nameOrig='C215078753', oldbalanceOrg=0.0, newbalanceOrig=0.0, nameDest='M1757317128', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0, hour=1, origNameChar='C', origNameLen=10, destNameChar='M', destNameLen=11, destbalDiff=-4206.84, orgbalDiff=-4206.84, features=SparseVector(9, {1: 1.0, 4: 4206.84}), features_norm=SparseVector(9, {1: 0.0002, 4: 1.0})),
 Row(folds=0, step=1, type='PAYMENT', amount=24213.67, nameOrig='C1238616099', oldbalanceOrg=0.0, newbalanceOrig=0.0, nameDest='M70695990', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0, hour=1, origNameChar='C', origNameLen=11, destNameChar='M', destNameLen=9, destbalDiff=-24213.67, orgbalDiff=-24213.67, features=SparseVector(9, {1: 1.0, 4: 24213.67}), features_norm=SparseVector(9, {1: 0.0, 4: 1.0}))]

### Model definition – Gradient Boosting Trees

In [26]:
gbt = GBTClassifier(maxIter=10, maxDepth=3, labelCol="isFraud", featuresCol = "features_norm", seed=42)

### Model train – Gradient Boosting Trees

In [27]:
model_gbt = gbt.fit(transformedData.filter(train_set_ctrl_string))

### Model prediction – Gradient Boosting Trees

Train set

In [28]:
model_gbt.transform(transformedData.filter(train_set_ctrl_string)).groupBy('prediction','isFraud').count().show()

+----------+-------+------+
|prediction|isFraud| count|
+----------+-------+------+
|       1.0|      1|    91|
|       0.0|      1|    85|
|       0.0|      0|152856|
+----------+-------+------+



Test set

In [29]:
model_gbt.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().show()

+----------+-------+-----+
|prediction|isFraud|count|
+----------+-------+-----+
|       1.0|      1|   49|
|       0.0|      1|   43|
|       1.0|      0|    1|
|       0.0|      0|62812|
+----------+-------+-----+



### Model evaluation – Gradient Boosting Trees

In [149]:
def displayModelPerformance(tmp_df):
    positives = tmp_df[tmp_df.isFraud ==1]['count'].sum()
    negatives = tmp_df[tmp_df.isFraud ==0]['count'].sum()
    TP = tmp_df.query('(isFraud ==1) & (prediction ==1)')['count'].sum()
    TN = tmp_df.query('(isFraud ==0) & (prediction ==0)')['count'].sum()
    print("Sensitivity: {:6.4f} pct".format(100*TP/positives) )
    print("Specificity: {:6.4f} pct".format(100*TN/negatives) )
    print("Accuracy: {:6.4f} pct".format(100*(TN+TP)/(positives+negatives) ))

In [85]:
evaluation = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='isFraud', metricName='accuracy')

In [86]:
print ("Test set accuracy: {}".format(evaluation.evaluate(model_gbt.transform(transformedData.filter(test_set_ctrl_string)))))

Test set accuracy: 0.9993005325490819


In [81]:
displayModelPerformance(model_gbt.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 53.2609 pct
Specificity: 99.9984 pct
Accuracy: 99.9301 pct


### Model definition – RandomForest

In [83]:
rf = RandomForestClassifier(numTrees=100, maxDepth=5,  featuresCol = "features_norm", labelCol="isFraud", seed=42)

### Model train – RandomForest

In [84]:
model_rf = rf.fit(transformedData.filter(train_set_ctrl_string))

### Model evaluation – RandomForest

In [87]:
print ("Test set accuracy: {}".format(evaluation.evaluate(model_rf.transform(transformedData.filter(test_set_ctrl_string)))))

Test set accuracy: 0.9993641204991655


In [88]:
displayModelPerformance(model_rf.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 56.5217 pct
Specificity: 100.0000 pct
Accuracy: 99.9364 pct


### Parameters tuning and cross-validation - Gradient Boosting Trees

Defining parameters grid and cross-validation settings 

In [91]:
grid_gbt = ParamGridBuilder().addGrid(gbt.maxIter, [ 10 , 20, 25]).build()

In [92]:
cv_gbt = CrossValidator(estimator=gbt, estimatorParamMaps=grid_gbt, evaluator=evaluation, seed=747) #default no of folds used

In [93]:
cvModel_gbt = cv_gbt.fit(transformedData.filter(train_set_ctrl_string))

In [96]:
print ("Test set accuracy: {}".format(evaluation.evaluate(cvModel_gbt.transform(transformedData.filter(test_set_ctrl_string)))))

Test set accuracy: 0.9993800174866863


In [97]:
displayModelPerformance(cvModel_gbt.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 58.6957 pct
Specificity: 99.9984 pct
Accuracy: 99.9380 pct


In [94]:
[ x for x in cvModel_gbt.avgMetrics]

[0.9993926208820942, 0.9994317208019394, 0.9994382635489771]

In [98]:
cvModel_gbt.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().show()

+----------+-------+-----+
|prediction|isFraud|count|
+----------+-------+-----+
|       1.0|      1|   54|
|       0.0|      1|   38|
|       1.0|      0|    1|
|       0.0|      0|62812|
+----------+-------+-----+



### Parameters tuning and cross-validation - RandomForest

In [100]:
grid_rf = ParamGridBuilder().addGrid(rf.numTrees, [ 150 , 75]).addGrid(rf.maxDepth, [ 3 , 7]).build()   

In [101]:
cv_rf = CrossValidator(estimator=rf, estimatorParamMaps=grid_rf, evaluator=evaluation, seed=747)

In [102]:
cvModel_rf = cv_rf.fit(transformedData.filter(train_set_ctrl_string))

In [103]:
print ("Test set accuracy: {}".format(evaluation.evaluate(cvModel_rf.transform(transformedData.filter(test_set_ctrl_string)))))

Test set accuracy: 0.999523090374374


In [104]:
displayModelPerformance(cvModel_rf.transform(transformedData.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 67.3913 pct
Specificity: 100.0000 pct
Accuracy: 99.9523 pct


### Features engineering  - Spark ETL

In [106]:
stringIndexer_destNameChar = StringIndexer(inputCol="destNameChar", outputCol="destNameChar_indexed")
ohe_destNameChar = OneHotEncoder(inputCol="destNameChar_indexed", outputCol="destNameChar_ohe")
vectorAssemblerFeatEng = VectorAssembler(inputCols=["type_ohe", "amount", "oldbalanceOrg","newbalanceOrig", "oldbalanceDest", 
                                                    "newbalanceDest","destNameChar_ohe","hour","origNameLen","destNameLen", "destbalDiff", "orgbalDiff"],
                                  outputCol="features")

In [107]:
pipelineFeatEng = Pipeline(stages=[stringIndexer,stringIndexer_destNameChar, ohe,  ohe_destNameChar, vectorAssemblerFeatEng, normalizer])
model_pipelineFeatEng = pipelineFeatEng.fit(spark_df1)

In [111]:
transformedDataFeatEng  = model_pipelineFeatEng.transform(spark_df1).drop('type_indexed','type_ohe','isFlaggedFraud','destNameChar_indexed','destNameChar_ohe','nameOrig','nameDest','origNameChar')
transformedDataFeatEng.show(5)

+-----+----+-------+--------+-------------+--------------+--------------+--------------+-------+----+-----------+------------+-----------+-----------+--------------------+--------------------+--------------------+
|folds|step|   type|  amount|oldbalanceOrg|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|hour|origNameLen|destNameChar|destNameLen|destbalDiff|          orgbalDiff|            features|       features_norm|
+-----+----+-------+--------+-------------+--------------+--------------+--------------+-------+----+-----------+------------+-----------+-----------+--------------------+--------------------+--------------------+
|    0|   1|PAYMENT| 4206.84|          0.0|           0.0|           0.0|           0.0|      0|   1|         10|           M|         11|   -4206.84|            -4206.84|(15,[1,4,10,11,12...|(15,[1,4,10,11,12...|
|    0|   1|PAYMENT|24213.67|          0.0|           0.0|           0.0|           0.0|      0|   1|         11|           M|          9|  -242

In [110]:
transformedDataFeatEng.take(2)

[Row(folds=0, step=1, type='PAYMENT', amount=4206.84, oldbalanceOrg=0.0, newbalanceOrig=0.0, oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, hour=1, origNameLen=10, destNameChar='M', destNameLen=11, destbalDiff=-4206.84, orgbalDiff=-4206.84, features=SparseVector(15, {1: 1.0, 4: 4206.84, 10: 1.0, 11: 10.0, 12: 11.0, 13: -4206.84, 14: -4206.84}), features_norm=SparseVector(15, {1: 0.0001, 4: 0.5773, 10: 0.0001, 11: 0.0014, 12: 0.0015, 13: -0.5773, 14: -0.5773})),
 Row(folds=0, step=1, type='PAYMENT', amount=24213.67, oldbalanceOrg=0.0, newbalanceOrig=0.0, oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, hour=1, origNameLen=11, destNameChar='M', destNameLen=9, destbalDiff=-24213.67, orgbalDiff=-24213.67, features=SparseVector(15, {1: 1.0, 4: 24213.67, 10: 1.0, 11: 11.0, 12: 9.0, 13: -24213.67, 14: -24213.67}), features_norm=SparseVector(15, {1: 0.0, 4: 0.5774, 10: 0.0, 11: 0.0003, 12: 0.0002, 13: -0.5774, 14: -0.5774}))]

### Features engineering  - Gradient Boosting Trees

In [112]:
grid_gbt_FeatEng = ParamGridBuilder().addGrid(gbt.maxIter, [ 10 , 20]).build()

In [113]:
cv_gbt_FeatEng = CrossValidator(estimator=gbt, estimatorParamMaps=grid_gbt_FeatEng, evaluator=evaluation, seed=747)

In [114]:
cvModel_gbt_FeatEng = cv_gbt_FeatEng.fit(transformedDataFeatEng.filter(train_set_ctrl_string))

In [116]:
print ("Test set accuracy: {}".format(evaluation.evaluate(cvModel_gbt_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)))))

Test set accuracy: 0.9999205150623957


In [118]:
displayModelPerformance(cvModel_gbt_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 94.5652 pct
Specificity: 100.0000 pct
Accuracy: 99.9921 pct


In [119]:
[ x for x in cvModel_gbt_FeatEng.avgMetrics]

[0.9999608184591238, 0.9999608184591238]

In [120]:
cvModel_gbt_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().show()

+----------+-------+-----+
|prediction|isFraud|count|
+----------+-------+-----+
|       1.0|      1|   87|
|       0.0|      1|    5|
|       0.0|      0|62813|
+----------+-------+-----+



### Features engineering  - RandomForest

In [121]:
grid_rf_FeatEng = ParamGridBuilder().addGrid(rf.numTrees, [ 100 , 75]).addGrid(rf.maxDepth, [ 3 , 7]).build()   

In [122]:
cv_rf_FeatEng = CrossValidator(estimator=rf, estimatorParamMaps=grid_rf_FeatEng, evaluator=evaluation, seed=747)

In [123]:
cvModel_rf_FeatEng = cv_rf_FeatEng.fit(transformedDataFeatEng.filter(train_set_ctrl_string))

In [124]:
print ("Test set accuracy: {}".format(evaluation.evaluate(cvModel_rf_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)))))

Test set accuracy: 0.9999841030124791


In [125]:
displayModelPerformance(cvModel_rf_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().toPandas())

Sensitivity: 98.9130 pct
Specificity: 100.0000 pct
Accuracy: 99.9984 pct


In [126]:
[ x for x in cvModel_rf_FeatEng.avgMetrics]

[0.9994317208019394,
 0.9999803961826967,
 0.9994382652188973,
 0.9999478129161591]

In [127]:
cvModel_rf_FeatEng.transform(transformedDataFeatEng.filter(test_set_ctrl_string)).groupBy('prediction','isFraud').count().show()

+----------+-------+-----+
|prediction|isFraud|count|
+----------+-------+-----+
|       1.0|      1|   91|
|       0.0|      1|    1|
|       0.0|      0|62813|
+----------+-------+-----+



### Parameters tuning and cross-validation - Keras

In [128]:
def scaleData(data):
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(data)

Structure data in a way suitable for neural network:

In [133]:
keras_df = scaleData(np.concatenate(\
    (df_1.query(data_subset_ctr_string)[["amount", "oldbalanceOrg","newbalanceOrig", "oldbalanceDest", "newbalanceDest"]].values,  \
     df_1.query(data_subset_ctr_string)["type"].str.get_dummies().values ), axis=1))

keras_df_index = df_1.query(data_subset_ctr_string)[["folds", "isFraud"]].reset_index(drop=True)

Helper function for tuning neural network’s architecture

In [136]:
def create_keras_model(params_dict, input_dim):
    model_keras_tmp = Sequential()
    model_keras_tmp.add(Dense(params_dict['lay_1'], input_dim=input_dim, activation='relu'))
    model_keras_tmp.add(Dropout(params_dict['lay_1_dropout']))
    model_keras_tmp.add(Dense(params_dict['lay_2'], activation='relu'))
    model_keras_tmp.add(Dropout(params_dict['lay_2_dropout']))
    model_keras_tmp.add(Dense(params_dict['lay_3'], activation='relu'))
    model_keras_tmp.add(Dropout(params_dict['lay_3_dropout']))
    model_keras_tmp.add(Dense(1, activation='sigmoid'))

    model_keras_tmp.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model_keras_tmp

Grid of neural network’s parameters to search through 

In [139]:
grid_search_params_keras = pd.DataFrame(list(product([1024],
                                    np.linspace(0.1, 0.2,1),
                                    [512],
                                    np.linspace(0.1, 0.2,1),
                                    [256,128],
                                    np.linspace(0.1, 0.6,1))), 
                       columns = ['lay_1','lay_1_dropout', 
                                  'lay_2', 'lay_2_dropout',
                                  'lay_3', 'lay_3_dropout'
                                 ]).astype(object).to_dict('records')
grid_search_params_keras

[{'lay_1': 1024,
  'lay_1_dropout': 0.1,
  'lay_2': 512,
  'lay_2_dropout': 0.1,
  'lay_3': 256,
  'lay_3_dropout': 0.1},
 {'lay_1': 1024,
  'lay_1_dropout': 0.1,
  'lay_2': 512,
  'lay_2_dropout': 0.1,
  'lay_3': 128,
  'lay_3_dropout': 0.1}]

Train model

In [156]:
results_cv_keras = list()
for list_pos, param_set in enumerate(grid_search_params_keras) : #range(1)
    tmp_keras_df = pd.DataFrame()
    tmp_keras_df_predictions = pd.DataFrame()
    tmp_keras_test = np.empty( [len(keras_df_index.query(test_set_ctrl_string)) , 3])
    tmp_keras_df_predictions_test = pd.DataFrame()
    for fold_cnt in range(3):
        tmp_keras_model = create_keras_model(param_set, 10)
        tmp_keras_model.fit(keras_df[keras_df_index.query('folds != @fold_cnt and ' + train_set_ctrl_string).index, ], keras_df_index.query('folds != @fold_cnt and '  + train_set_ctrl_string)[["isFraud"]].values,
              epochs=12,
              batch_size=128)
        tmp_keras_df = tmp_keras_df.append( pd.DataFrame(tmp_keras_model.predict(keras_df[keras_df_index.query('folds == @fold_cnt').index, ], batch_size=128, verbose=1) , index = keras_df_index.query('folds == @fold_cnt').index )  ).sort_index()
        tmp_keras_test[:, fold_cnt] = tmp_keras_model.predict(keras_df[keras_df_index.query(test_set_ctrl_string).index, ], batch_size=128, verbose=1)[:,0]
    tmp_keras_df_predictions = np.array( np.concatenate((keras_df_index.query(train_set_ctrl_string)[["isFraud"]].values ,tmp_keras_df.values) , axis=1 ) >=0.5, dtype = np.int32)
    tmp_keras_df_predictions_test = np.array( np.concatenate((keras_df_index.query(test_set_ctrl_string)[["isFraud"]].values ,tmp_keras_test.mean(axis=1).reshape(len(keras_df_index.query(test_set_ctrl_string)),1)) , axis=1 ) >=0.5, dtype = np.int32)
    results_cv_keras.insert(list_pos, (sum(tmp_keras_df_predictions[:,0] == tmp_keras_df_predictions[:,1]), 
                                       sum(tmp_keras_df_predictions[:,0] != tmp_keras_df_predictions[:,1]),
                                       sum((tmp_keras_df_predictions_test[:,0] == 0) & (tmp_keras_df_predictions_test[:,1] ==0)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 0) & (tmp_keras_df_predictions_test[:,1] ==1)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 1) & (tmp_keras_df_predictions_test[:,1] ==0)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 1) & (tmp_keras_df_predictions_test[:,1] ==1))
                                      ) )
results_cv_keras

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

[(152911, 121, 62812, 1, 54, 38), (152905, 127, 62813, 0, 53, 39)]

In [160]:
def displayModelPerformanceKeras(tmp_list):
    tmp_index = [x[2]+x[5] for x in tmp_list].index(max([x[2]+x[5]  for x in tmp_list]))    
    positives = tmp_list[tmp_index][4] + tmp_list[tmp_index][5]
    negatives = tmp_list[tmp_index][2] + tmp_list[tmp_index][3]
    TP = tmp_list[tmp_index][5]
    TN = tmp_list[tmp_index][2]
    print("Sensitivity: {:6.4f} pct".format(100*TP/positives) )
    print("Specificity: {:6.4f} pct".format(100*TN/negatives) )
    print("Accuracy: {:6.4f} pct".format(100*(TN+TP)/(positives+negatives) ))
    #return tmp_index

In [161]:
displayModelPerformanceKeras(results_cv_keras)

Sensitivity: 42.3913 pct
Specificity: 100.0000 pct
Accuracy: 99.9157 pct


### Features engineering - Keras

In [165]:
keras_df_FeatEng = scaleData(np.concatenate(\
    (df_1.query(data_subset_ctr_string)[["amount", "oldbalanceOrg","newbalanceOrig", "oldbalanceDest", "newbalanceDest", "hour","origNameLen","destNameLen", "destbalDiff", "orgbalDiff"]].values,  \
     pd.get_dummies(df_1.query(data_subset_ctr_string)[["type","destNameChar"]]).values ), axis=1))

In [169]:
grid_search_params_keras_FeatEng = pd.DataFrame(list(product([1024],
                                    np.linspace(0.1, 0.2,1),
                                    [512],
                                    np.linspace(0.1, 0.2,1),
                                    [256, 128],
                                    np.linspace(0.1, 0.6,1))), 
                       columns = ['lay_1','lay_1_dropout', 
                                  'lay_2', 'lay_2_dropout',
                                  'lay_3', 'lay_3_dropout'
                                 ]).astype(object).to_dict('records')
grid_search_params_keras_FeatEng

[{'lay_1': 1024,
  'lay_1_dropout': 0.1,
  'lay_2': 512,
  'lay_2_dropout': 0.1,
  'lay_3': 256,
  'lay_3_dropout': 0.1},
 {'lay_1': 1024,
  'lay_1_dropout': 0.1,
  'lay_2': 512,
  'lay_2_dropout': 0.1,
  'lay_3': 128,
  'lay_3_dropout': 0.1}]

In [170]:
results_cv_keras_FeatEng = list()
for list_pos, param_set in enumerate(grid_search_params_keras_FeatEng) : #range(1)
    tmp_keras_df = pd.DataFrame()
    tmp_keras_df_predictions = pd.DataFrame()
    tmp_keras_test = np.empty( [len(keras_df_index.query(test_set_ctrl_string)) , 3])
    tmp_keras_df_predictions_test = pd.DataFrame()
    for fold_cnt in range(3):
        tmp_keras_model = create_keras_model(param_set, 17)
        tmp_keras_model.fit(keras_df_FeatEng[keras_df_index.query('folds != @fold_cnt and ' + train_set_ctrl_string).index, ], keras_df_index.query('folds != @fold_cnt and '  + train_set_ctrl_string)[["isFraud"]].values,
              epochs=12,
              batch_size=128)
        tmp_keras_df = tmp_keras_df.append( pd.DataFrame(tmp_keras_model.predict(keras_df_FeatEng[keras_df_index.query('folds == @fold_cnt').index, ], batch_size=128, verbose=1) , index = keras_df_index.query('folds == @fold_cnt').index )  ).sort_index()
        tmp_keras_test[:, fold_cnt] = tmp_keras_model.predict(keras_df_FeatEng[keras_df_index.query(test_set_ctrl_string).index, ], batch_size=128, verbose=1)[:,0]
    tmp_keras_df_predictions = np.array( np.concatenate((keras_df_index.query(train_set_ctrl_string)[["isFraud"]].values ,tmp_keras_df.values) , axis=1 ) >=0.5, dtype = np.int32)
    tmp_keras_df_predictions_test = np.array( np.concatenate((keras_df_index.query(test_set_ctrl_string)[["isFraud"]].values ,tmp_keras_test.mean(axis=1).reshape(len(keras_df_index.query(test_set_ctrl_string)),1)) , axis=1 ) >=0.5, dtype = np.int32)
    results_cv_keras_FeatEng.insert(list_pos, (sum(tmp_keras_df_predictions[:,0] == tmp_keras_df_predictions[:,1]), 
                                       sum(tmp_keras_df_predictions[:,0] != tmp_keras_df_predictions[:,1]),
                                       sum((tmp_keras_df_predictions_test[:,0] == 0) & (tmp_keras_df_predictions_test[:,1] ==0)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 0) & (tmp_keras_df_predictions_test[:,1] ==1)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 1) & (tmp_keras_df_predictions_test[:,1] ==0)),
                                       sum((tmp_keras_df_predictions_test[:,0] == 1) & (tmp_keras_df_predictions_test[:,1] ==1))
                                      ) )
results_cv_keras_FeatEng

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

[(152885, 147, 62813, 0, 69, 23), (152884, 148, 62813, 0, 68, 24)]

In [171]:
displayModelPerformanceKeras(results_cv_keras_FeatEng)

Sensitivity: 26.0870 pct
Specificity: 100.0000 pct
Accuracy: 99.8919 pct


### Publishing model

Save data for scoring to Object Store

In [173]:
# @hidden_cell
cos_credentials = {
  "apikey": "by2MqrK_KQvvFwNryGGaF3vMKnyHEoqf4zXj4I2RfBHs",
  "cos_hmac_keys": {
    "access_key_id": "ce23d1002c6645cb8fc17b5dbb477e33",
    "secret_access_key": "3a200e8fb87ba25cbc3a534d75a5eb6256c2ac15c12411d7"
  },
  "endpoints": "https://cos-service.bluemix.net/endpoints",
  "iam_apikey_description": "Auto generated apikey during resource-key operation for Instance - crn:v1:bluemix:public:cloud-object-storage:global:a/a25876fc4dfd4284aa7fa16810798644:0a0616e2-2cbf-43fc-8905-08e39898d3e7::",
  "iam_apikey_name": "auto-generated-apikey-ce23d100-2c66-45cb-8fc1-7b5dbb477e33",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/a25876fc4dfd4284aa7fa16810798644::serviceid:ServiceId-2725cd4a-70f1-44fb-b9d5-1bffe97559aa",
  "resource_instance_id": "crn:v1:bluemix:public:cloud-object-storage:global:a/a25876fc4dfd4284aa7fa16810798644:0a0616e2-2cbf-43fc-8905-08e39898d3e7::"
}

In [174]:
auth_endpoint = 'https://iam.bluemix.net/oidc/token'
service_endpoint = 'https://s3-api.us-geo.objectstorage.softlayer.net'
cos = ibm_boto3.client('s3',
                         ibm_api_key_id=cos_credentials['apikey'],
                         ibm_service_instance_id=cos_credentials['resource_instance_id'],
                         ibm_auth_endpoint=auth_endpoint,
                         config=Config(signature_version='oauth'),
                         endpoint_url=service_endpoint)

In [175]:
connection_apikey = cos_credentials['apikey']
connection_resource_instance_id = cos_credentials["resource_instance_id"]
connection_access_key_id = cos_credentials['cos_hmac_keys']['access_key_id']
connection_secret_access_key = cos_credentials['cos_hmac_keys']['secret_access_key']

In [239]:
df_1.query(test_set_ctrl_string).iloc[:400,].to_csv('data_to_score.csv', index=False)

In [None]:
cos.upload_file('data_to_score.csv','modelscorebucket','data_to_score.csv')

In [181]:
#for bucket in cos.list_buckets(IBMServiceInstanceId=cos_credentials['resource_instance_id'])['Buckets']:#
 #   print(bucket)

In [182]:
#cos.list_objects(
 #   Bucket='wertreyrtuuu-donotdelete-pr-quohl4uchw7jmz')

In [None]:
#cos.download_file(Bucket='wertreyrtuuu-donotdelete-pr-quohl4uchw7jmz',Key='WA_Fn-UseC_-Telco-Customer-Churn.csv',Filename='telco.csv')

Publish model

In [183]:
# @hidden_cell
wml_credentials = {
  "apikey": "RTClG03LkiAX5Smoi2JChk31vEYd6hKfwiz3bbsilBGK",
  "iam_apikey_description": "Auto generated apikey during resource-key operation for Instance - crn:v1:bluemix:public:pm-20:us-south:a/a25876fc4dfd4284aa7fa16810798644:ad345490-866c-4fcd-bbbc-a4d81c00b8bf::",
  "iam_apikey_name": "auto-generated-apikey-989e787a-6cd2-4d3f-93b9-891ca7239b8e",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/a25876fc4dfd4284aa7fa16810798644::serviceid:ServiceId-a3c740ff-ceb5-41e4-8e27-48540cfd6d8a",
  "instance_id": "ad345490-866c-4fcd-bbbc-a4d81c00b8bf",
  "password": "5efdcb6d-a110-40ae-b068-58bc7e2ca421",
  "url": "https://us-south.ml.cloud.ibm.com",
  "username": "989e787a-6cd2-4d3f-93b9-891ca7239b8e"
}

In [184]:
ml_repository_client = MLRepositoryClient(wml_credentials['url'])

In [185]:
ml_repository_client.authorize(wml_credentials['username'], wml_credentials['password'])

In [193]:
pipeline_to_repository = Pipeline(stages=[stringIndexer,stringIndexer_destNameChar, ohe,  ohe_destNameChar, vectorAssemblerFeatEng, normalizer, rf])

In [194]:
model_pipeline_to_repository = pipeline_to_repository.fit(spark_df1.filter(train_set_ctrl_string))

In [195]:
pipeline_artifact = MLRepositoryArtifact(pipeline_to_repository, name="pipeline")

In [196]:
model_artifact = MLRepositoryArtifact(model_pipeline_to_repository, training_data=spark_df1.filter(train_set_ctrl_string), name="Fraudalent elec transactions", pipeline_artifact=pipeline_artifact)

In [197]:
saved_model = ml_repository_client.models.save(model_artifact)

In [198]:
print(saved_model.meta.available_props())

dict_keys(['trainingDataSchema', 'lastUpdated', 'inputDataSchema', 'evaluationMetrics', 'version', 'pipelineType', 'modelVersionHref', 'evaluationMethod', 'trainingDataRef', 'label', 'pipelineVersionHref', 'authorEmail', 'creationTime', 'modelType', 'authorName', 'runtime'])


In [199]:
print("creationTime: " + str(saved_model.meta.prop("creationTime")))
print("modelType: " + saved_model.meta.prop("modelType"))
print("pipelineType: " + saved_model.meta.prop("pipelineType"))
print("runtime: " + str(saved_model.meta.prop("runtime")))
print("label: " + saved_model.meta.prop("label"))
print("modelVersionHref: " + saved_model.meta.prop("modelVersionHref"))

creationTime: 2018-09-01 09:39:22.776000+00:00
modelType: sparkml-model-2.1
pipelineType: sparkml-pipeline-2.1
runtime: spark-2.1
label: isFraud
modelVersionHref: https://us-south.ml.cloud.ibm.com/v2/artifacts/models/34b10bb0-108b-4d20-961b-188fb56f51b1/versions/04f5d336-61cc-4dab-9b77-b2c7b64f1661


In [200]:
print(saved_model.meta.prop("trainingDataSchema"))

{'type': 'struct', 'fields': [{'metadata': {}, 'nullable': True, 'type': 'long', 'name': 'folds'}, {'metadata': {}, 'nullable': True, 'type': 'long', 'name': 'step'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'type'}, {'metadata': {}, 'nullable': True, 'type': 'double', 'name': 'amount'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'nameOrig'}, {'metadata': {}, 'nullable': True, 'type': 'double', 'name': 'oldbalanceOrg'}, {'metadata': {}, 'nullable': True, 'type': 'double', 'name': 'newbalanceOrig'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'nameDest'}, {'metadata': {}, 'nullable': True, 'type': 'double', 'name': 'oldbalanceDest'}, {'metadata': {}, 'nullable': True, 'type': 'double', 'name': 'newbalanceDest'}, {'metadata': {}, 'nullable': True, 'type': 'long', 'name': 'isFraud'}, {'metadata': {}, 'nullable': True, 'type': 'long', 'name': 'isFlaggedFraud'}, {'metadata': {}, 'nullable': True, 'type': 'long', 'name': 'hour'}, {'metad

In [201]:
published_model_ID = saved_model.uid

print("Model Id: " + str(published_model_ID))

Model Id: 34b10bb0-108b-4d20-961b-188fb56f51b1


In [202]:
loadedModelArtifact = ml_repository_client.models.get(saved_model.uid)

In [203]:
print(str(loadedModelArtifact.name))

Fraudalent elec transactions


Model deployment

In [204]:
headers = urllib3.util.make_headers(basic_auth='{username}:{password}'.format(username=wml_credentials['username'], password=wml_credentials['password']))
url = '{url}/v3/identity/token'.format(url=wml_credentials['url'])
response = requests.get(url, headers=headers)
mltoken = json.loads(response.text).get('token')

In [205]:
endpoint_instance = wml_credentials['url'] + "/v3/wml_instances/" + wml_credentials['instance_id']
header = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + mltoken}

response_get_instance = requests.get(endpoint_instance, headers=header)

print(response_get_instance)
print(json.dumps(response_get_instance.text, indent=2))

<Response [200]>
"{\n  \"metadata\": {\n    \"guid\": \"ad345490-866c-4fcd-bbbc-a4d81c00b8bf\",\n    \"url\": \"https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf\",\n    \"created_at\": \"2018-08-31T15:35:40.038Z\",\n    \"modified_at\": \"2018-09-01T09:39:23.808Z\"\n  },\n  \"entity\": {\n    \"source\": \"Bluemix\",\n    \"published_models\": {\n      \"url\": \"https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models\"\n    },\n    \"usage\": {\n      \"expiration_date\": \"2018-10-01T00:00:00.000Z\",\n      \"computation_time\": {\n        \"limit\": 180000,\n        \"current\": 0\n      },\n      \"model_count\": {\n        \"limit\": 200,\n        \"current\": 2\n      },\n      \"prediction_count\": {\n        \"limit\": 5000,\n        \"current\": 0\n      },\n      \"gpu_count\": {\n        \"limit\": 8,\n        \"current\": 0\n      },\n      \"capacity_units\": {\n        \"limit\": 18000

In [206]:
endpoint_published_models = json.loads(response_get_instance.text).get('entity').get('published_models').get('url')

print(endpoint_published_models)

https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models


In [207]:
json.loads(response_get_instance.text).get('entity').get('deployments').get('url')

'https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/deployments'

In [None]:
#######################
header = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + mltoken}
response_get_dep = requests.get("https://us-south.ml.cloud.ibm.com/v3/wml_instances/03442d9b-1e0e-4d6d-a4ae-a133ce579e1f/published_models/b714ce13-801b-4a9f-8c69-ab68de94ab9c/deployments", headers=header)

print(response_get_dep)
print(json.dumps(response_get_dep.text, indent=2))

In [None]:
################
response_get_depdel = requests.delete("https://us-south.ml.cloud.ibm.com/v3/wml_instances/03442d9b-1e0e-4d6d-a4ae-a133ce579e1f/deployments/c7b06e46-d0d2-4ff3-9aaf-d58e43dea89d", headers=header)



In [None]:
############
print(response_get_depdel)
print(json.dumps(response_get_depdel.text, indent=2))

In [208]:
header = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + mltoken}
response_get = requests.get(endpoint_published_models, headers=header)

print(response_get)
print(json.dumps(response_get.text, indent=2))

<Response [200]>
"{\n  \"limit\": 1000,\n  \"resources\": [{\n    \"metadata\": {\n      \"guid\": \"ffa62ad1-8721-4d93-a49c-7f5df33d6e5c\",\n      \"url\": \"https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/ffa62ad1-8721-4d93-a49c-7f5df33d6e5c\",\n      \"created_at\": \"2018-08-31T15:38:29.041Z\",\n      \"modified_at\": \"2018-08-31T15:45:02.698Z\"\n    },\n    \"entity\": {\n      \"runtime_environment\": \"spark-2.1\",\n      \"learning_configuration_url\": \"https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/ffa62ad1-8721-4d93-a49c-7f5df33d6e5c/learning_configuration\",\n      \"name\": \"Fraudalent elec transactions\",\n      \"label_col\": \"isFraud\",\n      \"learning_iterations_url\": \"https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/ffa62ad1-8721-4d93-a49c-7f5df33d6e5c/learning_iterations\",\n      \"training_

In [209]:
[x.get('entity').get('deployments').get('url') for x in json.loads(response_get.text).get('resources')]# if x.get('metadata').get('guid') == saved_model.uid]


['https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/ffa62ad1-8721-4d93-a49c-7f5df33d6e5c/deployments',
 'https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/34b10bb0-108b-4d20-961b-188fb56f51b1/deployments']

In [221]:
saved_model_uid = 'https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/34b10bb0-108b-4d20-961b-188fb56f51b1'

In [222]:
[endpoint_deployments] = [x.get('entity').get('deployments').get('url') for x in json.loads(response_get.text).get('resources') if x.get('metadata').get('url') == saved_model_uid]

print(endpoint_deployments)

https://us-south.ml.cloud.ibm.com/v3/wml_instances/ad345490-866c-4fcd-bbbc-a4d81c00b8bf/published_models/34b10bb0-108b-4d20-961b-188fb56f51b1/deployments


In [212]:
[x.get('entity').get('deployments').get('count') for x in json.loads(response_get.text).get('resources')]# if x.get('metadata').get('guid') == saved_model.uid]


[1, 0]

Create batch deployment for published model

In [213]:
# @hidden_cell
spark_credentials = {"tenant_id": "s895-0265e0bc4ba2c2-7809e8a30329",
  "tenant_id_full": "9d7fe4b3-df43-47eb-8895-0265e0bc4ba2_9e5cbf9c-ee75-4d56-80c2-7809e8a30329",
  "cluster_master_url": "https://spark.bluemix.net",
  "tenant_secret": "b87c64ae-3cbe-43e4-8c2e-f404fc5ff37d",
  "instance_id": "9d7fe4b3-df43-47eb-8895-0265e0bc4ba2",
  "plan": "ibm.SparkService.PayGoPersonal"}

In [216]:
encoded_spark_credentials_header = base64.b64encode(json.dumps({'credentials': spark_credentials, 'version': "2.1"}).encode())
result_filename = "scores.csv"

In [217]:
header_batch = {'Content-Type': 'application/json', 'Authorization': "Bearer " + mltoken, 'X-Spark-Service-Instance': encoded_spark_credentials_header}

In [218]:
payload_batch = {"type":"batch", "name": "Fraudalent Transactions Prediction", "description": "Batch Deployment", "input": {"source": { "bucket": "modelscorebucket", \
"filename": "data_to_score.csv", "inferschema":1,  "fileformat": "csv", "type": "cloudobjectstorage"}, \
"connection": {"access_key": connection_access_key_id, "secret_key": connection_secret_access_key, "url": service_endpoint}}, \
                 "output": { "target": { "bucket": "modelscorebucket", "filename": result_filename, "fileformat": "csv", "firstlineheader":"true", \
            "type": "cloudobjectstorage"}, "connection": {"access_key": connection_access_key_id, "secret_key": connection_secret_access_key, "url": service_endpoint}}}


In [223]:
response_batch = requests.post(endpoint_deployments, json=payload_batch, headers=header_batch)

print(response_batch)
print(json.loads(response_batch.text).get("entity").get("status"))

<Response [201]>
INITIALIZING


In [268]:
header_monitor_job = {'Content-Type': 'application/json', 'Authorization': "Bearer " + mltoken, 'X-Spark-Service-Instance': encoded_spark_credentials_header}

response_monitor_job = requests.get(endpoint_deployments, headers=header_monitor_job)

print(response_monitor_job)
json.loads(response_monitor_job.text).get("resources")[0]['entity']['status_details']
#print(json.dumps(response_monitor_job.text, indent=2))

<Response [200]>


{'completionCode': '200',
 'logFile': 'SubmissionID = driver-20180901044652-0488-a1c508b5-e683-449b-a014-0f457af921e5',
 'message': 'SUCCESS',
 'queueDateTime': '2018-09-01T09:46:52Z',
 'startDateTime': '2018-09-01T09:46:52Z',
 'status': 'COMPLETED'}