In [1]:
import kfp
import typing

In [2]:
## Read Data

from typing import NamedTuple
from kfp.components import *

def read_data(file_name: str, df_churn_op: OutputPath()):
    """Load the raw churn CSV and pass it on unchanged as this step's output.

    Args:
        file_name: Location of the source CSV, handed straight to
            ``pandas.read_csv``.
        df_churn_op: File path the table is written to as CSV, without the
            index column.
    """
    # Dependencies are imported inside the function body so the function is
    # self-contained when it is wrapped with func_to_container_op.
    import numpy as np
    import pandas as pd

    pd.read_csv(file_name).to_csv(df_churn_op, index=False)

In [3]:
# Wrap read_data as a pipeline component and save its definition to
# ./read-data-func.yaml.
kfp_read_data = kfp.components.func_to_container_op(
    func=read_data,
    output_component_file='./read-data-func.yaml',
    packages_to_install=['numpy==1.17.2', 'pandas==1.0.3'],
)


In [4]:
from typing import NamedTuple
from kfp.components import *

def one_hot_encode(df_churn_ip: InputPath(), df_one_hot: OutputPath()):
    """Clean the churn table and convert its categorical columns to numbers.

    Steps, in order: cells holding a single space become NaN, ``customerID``
    is dropped, rows containing any NaN are removed, Yes/No and Male/Female
    columns are mapped to 1/0, and every multi-valued categorical column is
    replaced by one indicator column per value, named ``COLUMN#VALUE``.

    Args:
        df_churn_ip: Path of the input CSV.
        df_one_hot: Path the encoded CSV is written to, without the index
            column.
    """
    import numpy as np
    import pandas as pd

    yes_no = {"Yes": 1, "No": 0}

    frame = pd.read_csv(df_churn_ip)

    # Treat a cell that holds just one space as a missing value.
    blank_checked = [
        'customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn',
    ]
    for column in blank_checked:
        frame[column] = frame[column].replace(" ", np.nan)

    # The identifier is removed before dropping incomplete rows.
    frame = frame.drop(columns=['customerID']).dropna()

    for column in ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling'):
        frame[column] = frame[column].replace(yes_no)

    frame['gender'] = frame['gender'].replace({"Male": 1, "Female": 0})

    multi_valued = (
        'PaymentMethod', 'MultipleLines', 'InternetService', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    )
    for column in multi_valued:
        # Indicator columns share the frame's index, so join() lines them up row by row.
        indicators = pd.get_dummies(frame[column], drop_first=False).add_prefix(column + "#")
        frame = frame.drop(columns=column).join(indicators)

    frame['Churn'] = frame['Churn'].replace(yes_no)

    frame.to_csv(df_one_hot, index=False)


In [5]:
# Wrap one_hot_encode as a pipeline component and save its definition to
# ./one-hot-encode-func.yaml.
kfp_one_hot_encode = kfp.components.func_to_container_op(
    func=one_hot_encode,
    output_component_file='./one-hot-encode-func.yaml',
    packages_to_install=[
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'imbalanced-learn==0.6.2',
    ],
)

In [6]:
from typing import NamedTuple
from kfp.components import *


def rf_model(df_churn_ip: InputPath(), n_estimators: int, conf_matr: OutputPath(), metadata_out: OutputPath(),
             metrics_out: OutputPath()):
    """Train a random forest on the encoded churn data and write evaluation artifacts.

    The rows are split with ``train_test_split`` (default proportions,
    ``random_state=0``); the forest is fitted on the training part and
    evaluated on the held-out part.

    Args:
        df_churn_ip: Path of the encoded CSV; it must contain a ``Churn`` column.
        n_estimators: Number of trees in the forest.
        conf_matr: Output path for the confusion matrix, written as headerless
            CSV rows of ``target,predicted,count``.
        metadata_out: Output path for the JSON that describes the confusion
            matrix file (type, schema, source, labels).
        metrics_out: Output path for the JSON holding the accuracy score.
    """
    import json

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.model_selection import train_test_split

    df_churn = pd.read_csv(df_churn_ip).dropna()

    y1 = df_churn['Churn']
    X1 = df_churn.drop(['Churn'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

    # Honour the n_estimators argument instead of a fixed tree count.
    rfc_best = RandomForestClassifier(random_state=42, max_features='auto', n_estimators=n_estimators,
                                      max_depth=8, criterion='gini')
    rfc_best.fit(X_train, y_train)
    y_test_pred = rfc_best.predict(X_test)

    # Labels are taken in order of first appearance in y_test; the same order is
    # used for the matrix rows/columns and for the 'labels' metadata entry.
    vocab = list(y_test.unique())
    cm = confusion_matrix(y_test, y_test_pred, labels=vocab)
    print(cm)

    # Flatten the matrix into (target, predicted, count) rows.
    data = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    print(df_cm)

    # The matrix is written once, to the output path supplied for this step.
    df_cm.to_csv(conf_matr, columns=['target', 'predicted', 'count'], header=False, index=False)

    # NOTE(review): the KFP v1 UI is documented to look for outputs named
    # mlpipeline_ui_metadata / mlpipeline_metrics; confirm that outputs named
    # metadata_out / metrics_out are picked up as intended.
    metadata = {
        'outputs': [{
            'type': 'confusion_matrix',
            'format': 'csv',
            'schema': [
                {'name': 'target', 'type': 'CATEGORY'},
                {'name': 'predicted', 'type': 'CATEGORY'},
                {'name': 'count', 'type': 'NUMBER'},
            ],
            'source': conf_matr,
            # Convert vocab to strings so the labels match the text stored in the CSV.
            'labels': list(map(str, vocab)),
        }]
    }
    with open(metadata_out, 'w') as metadata_file:
        json.dump(metadata, metadata_file)

    accuracy = accuracy_score(y_test, y_test_pred)
    metrics = {
        'metrics': [{
            'name': 'accuracy-score',
            'numberValue': accuracy,
            'format': "PERCENTAGE",
        }]
    }
    with open(metrics_out, 'w') as metrics_file:
        json.dump(metrics, metrics_file)

In [7]:
# Wrap rf_model as a pipeline component and save its definition to
# ./rf-model-func.yaml.
kfp_rf_model = kfp.components.func_to_container_op(
    func=rf_model,
    output_component_file='./rf-model-func.yaml',
    packages_to_install=[
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'imbalanced-learn==0.6.2',
    ],
)

In [None]:
# A program to generate ROC data out of prediction results.
# Usage:
# python roc.py  \
#   --predictions=gs://bradley-playground/sfpd/predictions/part-* \
#   --trueclass=ACTION \
#   --output=gs://bradley-playground/sfpd/roc/


import argparse
import json
import os
from urllib.parse import urlparse
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score
from tensorflow.python.lib.io import file_io


def main(argv=None):
    """Compute ROC curve data from prediction files and write the result files.

    Column names are read from ``schema.json`` in the directory of
    ``--predictions``; every file matching the pattern is loaded, a binary
    ``target`` column is derived, and the ROC points are written to ``roc.csv``
    under ``--output``. ``/mlpipeline-ui-metadata.json`` and
    ``/mlpipeline-metrics.json`` are written as well.

    Args:
        argv: Optional list of command-line arguments. ``None`` makes argparse
            read ``sys.argv[1:]``.

    Raises:
        ValueError: If there is no ``target`` column and no ``--target_lambda``,
            if the score column is missing from the schema, or if no file
            matches ``--predictions``.
    """
    # Imported here so the call below does not depend on how the module-level
    # name `urlparse` happens to be bound (function vs. module).
    from urllib.parse import urlparse

    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions', type=str, help='GCS path of prediction file pattern.')
    parser.add_argument('--trueclass', type=str, default='true',
                        help='The name of the class as true value. If missing, assuming it is ' +
                             'binary classification and default to "true".')
    parser.add_argument('--true_score_column', type=str, default='true',
                        help='The name of the column for positive prob. If missing, assuming it is ' +
                             'binary classification and defaults to "true".')
    parser.add_argument('--target_lambda', type=str,
                        help='a lambda function as a string to determine positive or negative.' +
                             'For example, "lambda x: x[\'a\'] and x[\'b\']". If missing, ' +
                             'input must have a "target" column.')
    parser.add_argument('--output', type=str, help='GCS path of the output directory.')
    args = parser.parse_args(argv)

    # A URL scheme (e.g. gs://) means remote storage; only local output
    # directories need to be created.
    on_cloud = bool(urlparse(args.output).scheme)
    if not on_cloud:
        os.makedirs(args.output, exist_ok=True)

    schema_file = os.path.join(os.path.dirname(args.predictions), 'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]

    if not args.target_lambda and 'target' not in names:
        raise ValueError('There is no "target" column, and target_lambda is not provided.')

    if args.true_score_column not in names:
        raise ValueError('Cannot find column name "%s"' % args.true_score_column)

    files = file_io.get_matching_files(args.predictions)
    if not files:
        raise ValueError('No prediction files match "%s"' % args.predictions)

    dfs = []
    for file in files:
        with file_io.FileIO(file, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))

    df = pd.concat(dfs)

    if args.target_lambda:
        # SECURITY: eval() runs arbitrary code taken from the command line;
        # only pass expressions from a trusted source.
        df['target'] = df.apply(eval(args.target_lambda), axis=1)
    else:
        df['target'] = df['target'].apply(lambda x: 1 if x == args.trueclass else 0)

    fpr, tpr, thresholds = roc_curve(df['target'], df[args.true_score_column])
    roc_auc = roc_auc_score(df['target'], df[args.true_score_column])
    df_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    roc_file = os.path.join(args.output, 'roc.csv')

    with file_io.FileIO(roc_file, 'w') as f:
        df_roc.to_csv(f, columns=['fpr', 'tpr', 'thresholds'], header=False, index=False)

    metadata = {
        'outputs': [{
            'type': 'roc',
            'format': 'csv',
            'schema': [
                {'name': 'fpr', 'type': 'NUMBER'},
                {'name': 'tpr', 'type': 'NUMBER'},
                {'name': 'thresholds', 'type': 'NUMBER'},
            ],
            'source': roc_file
        }]
    }

    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    metrics = {
        'metrics': [{
            'name': 'roc-auc-score',
            'numberValue': roc_auc,
        }]
    }

    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)


if __name__ == "__main__":
    main()

In [None]:
# A program to generate confusion matrix data out of prediction results.
# Usage:
# python confusion_matrix.py  \
#   --predictions=gs://bradley-playground/sfpd/predictions/part-* \
#   --output=gs://bradley-playground/sfpd/cm/ \
#   --target=resolution \
#   --analysis=gs://bradley-playground/sfpd/analysis


import argparse
import json
import os

try:
    import urlparse  # Python 2 module name
except ImportError:
    # Python 3 moved this module to urllib.parse; alias it so that
    # `urlparse.urlparse(...)` keeps working.
    from urllib import parse as urlparse

import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.python.lib.io import file_io


def main(argv=None):
    """Build confusion-matrix data from prediction files and write the result files.

    Column names are read from ``schema.json`` in the directory of
    ``--predictions``; every file matching the pattern is loaded and the
    ``target`` and ``predicted`` columns are compared. The matrix is written
    to ``confusion_matrix.csv`` under ``--output`` as headerless rows of
    ``target,predicted,count``. ``/mlpipeline-ui-metadata.json`` and
    ``/mlpipeline-metrics.json`` are written as well.

    Args:
        argv: Optional list of command-line arguments. ``None`` makes argparse
            read ``sys.argv[1:]``.

    Raises:
        ValueError: If there is no ``target`` column and no ``--target_lambda``,
            or if no file matches ``--predictions``.
    """
    # Imported here so the function works on Python 3, where the standalone
    # `urlparse` module no longer exists.
    from urllib.parse import urlparse

    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions', type=str, help='GCS path of prediction file pattern.')
    parser.add_argument('--output', type=str, help='GCS path of the output directory.')
    parser.add_argument('--target_lambda', type=str,
                        help='a lambda function as a string to compute target.' +
                             'For example, "lambda x: x[\'a\'] + x[\'b\']"' +
                             'If not set, the input must include a "target" column.')

    args = parser.parse_args(argv)

    # A URL scheme (e.g. gs://) means remote storage; only local output
    # directories need to be created.
    on_cloud = bool(urlparse(args.output).scheme)
    if not on_cloud:
        os.makedirs(args.output, exist_ok=True)

    schema_file = os.path.join(os.path.dirname(args.predictions), 'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]

    # Fail early with a clear message rather than a KeyError further down.
    if not args.target_lambda and 'target' not in names:
        raise ValueError('There is no "target" column, and target_lambda is not provided.')

    files = file_io.get_matching_files(args.predictions)
    if not files:
        raise ValueError('No prediction files match "%s"' % args.predictions)

    dfs = []
    for file in files:
        with file_io.FileIO(file, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))

    df = pd.concat(dfs)

    if args.target_lambda:
        # SECURITY: eval() runs arbitrary code taken from the command line;
        # only pass expressions from a trusted source.
        df['target'] = df.apply(eval(args.target_lambda), axis=1)

    # Labels are taken in order of first appearance in the target column.
    vocab = list(df['target'].unique())
    cm = confusion_matrix(df['target'], df['predicted'], labels=vocab)

    # Flatten the matrix into (target, predicted, count) rows.
    data = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]

    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    cm_file = os.path.join(args.output, 'confusion_matrix.csv')

    with file_io.FileIO(cm_file, 'w') as f:
        df_cm.to_csv(f, columns=['target', 'predicted', 'count'], header=False, index=False)

    metadata = {
        'outputs': [{
            'type': 'confusion_matrix',
            'format': 'csv',
            'schema': [
                {'name': 'target', 'type': 'CATEGORY'},
                {'name': 'predicted', 'type': 'CATEGORY'},
                {'name': 'count', 'type': 'NUMBER'},
            ],
            'source': cm_file,
            # Convert vocab to strings so that boolean labels read "True"/"False",
            # matching the text stored in the CSV.
            'labels': list(map(str, vocab)),
        }]
    }

    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    accuracy = accuracy_score(df['target'], df['predicted'])
    metrics = {
        'metrics': [{
            'name': 'accuracy-score',
            'numberValue': accuracy,
            'format': "PERCENTAGE",
        }]
    }

    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)


if __name__ == "__main__":
    main()


In [8]:
from typing import NamedTuple
from kfp.components import *

def xgb_model(df_churn_ip: InputPath(), n_estimators: int):
    """Train an XGBoost churn classifier and print its confusion matrix.

    The rows are split with ``train_test_split`` (default proportions,
    ``random_state=0``). SMOTE oversampling is applied to the training split
    only; the confusion matrix is computed on the untouched test split so that
    it reflects real rows rather than synthetic ones.

    Args:
        df_churn_ip: Path of the encoded CSV; it must contain a ``Churn`` column.
        n_estimators: Number of boosting rounds passed to ``XGBClassifier``.
    """
    import pandas as pd
    import xgboost as xgb
    from imblearn.over_sampling import SMOTE
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split

    df_churn = pd.read_csv(df_churn_ip).dropna()

    y1 = df_churn['Churn']
    X1 = df_churn.drop(['Churn'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

    # Balance the classes in the training data only. fit_resample is the
    # current name of the deprecated fit_sample alias.
    # NOTE(review): relies on imbalanced-learn >= 0.6 returning a DataFrame for
    # DataFrame input, so feature names match X_test at predict time - confirm
    # if the pinned version changes.
    X_train_res, y_train_res = SMOTE(random_state=0).fit_resample(X_train, y_train)

    # Honour the n_estimators argument instead of silently using the default.
    clfxg = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, max_depth=2, eta=1, silent=0,
                              n_estimators=n_estimators)
    clfxg.fit(X_train_res, y_train_res)

    y_test_pred = clfxg.predict(X_test)
    conf = confusion_matrix(y_test, y_test_pred)
    print(conf)

In [9]:
# Wrap xgb_model as a pipeline component and save its definition to
# ./xgb-model-func.yaml.
kfp_xgb_model = kfp.components.func_to_container_op(
    func=xgb_model,
    output_component_file='./xgb-model-func.yaml',
    packages_to_install=[
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'imbalanced-learn==0.6.2',
        'xgboost==1.0.2',
    ],
)

In [10]:
import kfp.dsl as dsl

@dsl.pipeline(
    name='ML Pipeline',
    description='Churn predictions using Random Forest and XG Boost Algorithms',
)
def TChurn_func(f_n="https://raw.githubusercontent.com/rujual/telco_churn_pipeline/master/Data1.csv",
                n_estimators=100):
    """Wire the churn steps together: read, encode, then train both models.

    Args:
        f_n: Location of the raw churn CSV, passed to the read step.
        n_estimators: Value passed to both model steps.
    """
    # The pipeline parameter f_n feeds the read step; its output file feeds the encoder.
    raw_csv = kfp_read_data(file_name=f_n).outputs['df_churn_op']
    encoded_csv = kfp_one_hot_encode(raw_csv).outputs['df_one_hot']

    # Both model steps consume the same encoded output and do not depend on each other.
    kfp_rf_model(encoded_csv, n_estimators)
    kfp_xgb_model(encoded_csv, n_estimators)

# For an operation with a single return value, the output reference can be accessed with either `task.output` or `task.outputs['output_name']`.
# For an operation with multiple return values, each output reference is accessed with `task.outputs['output_name']`.

In [11]:
import kfp.compiler as comp

# Compile the pipeline function into an archive named after the function.
pipeline_func = TChurn_func
pipeline_filename = '{}.pipeline.tar.gz'.format(pipeline_func.__name__)
comp.Compiler().compile(pipeline_func, pipeline_filename)

