In [6]:
## Import Required Libraries

import kfp
import typing

In [7]:
from typing import NamedTuple
from kfp.components import *


def rf_model(file_name: str, n_estimators: int) -> NamedTuple('Outputs', [('Cf1', int), ('Cf2', int),
                                                                                     ('Cf3', int), ('Cf4', int)]):
    """Train an XGBoost churn classifier and return its confusion matrix.

    Downloads the churn CSV, cleans and encodes it, balances the training
    split with SMOTE, fits an ``XGBClassifier`` and scores it on the
    untouched hold-out split.

    Args:
        file_name: URL of the CSV file to download. An empty value falls back
            to the Telco churn data set hosted on GitHub.
        n_estimators: Number of boosting rounds used by the classifier.

    Returns:
        The four cells of the 2x2 confusion matrix computed with label order
        ``[0, 1]``, as plain Python ints:
        ``(conf[0][0], conf[0][1], conf[1][0], conf[1][1])``.

    Raises:
        ValueError: If the downloaded file contains no lines at all.
    """
    # All imports stay inside the function: it is packaged with
    # kfp.components.func_to_container_op, which requires a self-contained function.
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    import numpy as np
    import pandas as pd
    import xgboost as xgb
    from imblearn.over_sampling import SMOTE
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split

    default_url = "https://raw.githubusercontent.com/rujual/telco_churn/master/Data.csv"
    url = file_name or default_url

    # The last comma-separated field of every line is discarded.
    # NOTE(review): presumably the file carries one extra trailing field after
    # 'Churn' -- confirm against the data before switching to pd.read_csv.
    with urllib.request.urlopen(url) as response:
        rows = [raw_line.decode().split(',')[:-1] for raw_line in response]
    if not rows:
        raise ValueError("No data could be read from {!r}".format(url))
    df_churn = pd.DataFrame(rows[1:], columns=rows[0])

    # A single blank marks a missing value in these columns.
    blank_checked_cols = ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
                          'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
                          'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                          'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                          'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
    df_churn[blank_checked_cols] = df_churn[blank_checked_cols].replace(" ", np.nan)
    df_churn = df_churn.drop(['customerID'], axis=1).dropna()

    # Encode the Yes/No columns (including the target) and 'gender' as 0/1.
    yes_no = {"Yes": 1, "No": 0}
    for col in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']:
        df_churn[col] = df_churn[col].replace(yes_no)
    df_churn['gender'] = df_churn['gender'].replace({"Male": 1, "Female": 0})

    # One-hot encode the categorical columns, remembering the generated names so the
    # later type conversion does not depend on one particular category being present.
    category_cols = ['PaymentMethod', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                     'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract']
    dummy_cols = []
    for col in category_cols:
        dummies = pd.get_dummies(df_churn[col], drop_first=False).add_prefix("{}#".format(col))
        dummy_cols.extend(dummies.columns)
        df_churn = df_churn.drop(col, axis=1).join(dummies)

    # Everything was read as text: convert to numbers. A list (not a set) keeps the
    # feature order deterministic from run to run.
    float_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
    int_cols = [col for col in df_churn.loc[:, :'Churn'].columns if col not in float_cols]
    df_churn = (
        df_churn[int_cols].astype(int)
        .join(df_churn[float_cols].astype(float))
        .join(df_churn[dummy_cols].astype(int))
        .dropna()
    )

    y = df_churn['Churn']
    X = df_churn.drop(['Churn'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Oversample the training split only; the test split must stay untouched so the
    # confusion matrix is computed on real samples rather than synthetic ones.
    X_train_res, y_train_res = SMOTE(random_state=0).fit_resample(X_train, y_train)
    # Keep the training features labelled exactly like the test features.
    X_train_res = pd.DataFrame(X_train_res, columns=X_train.columns)

    clfxg = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, max_depth=2, eta=1, silent=0,
                              n_estimators=int(n_estimators))
    clfxg.fit(X_train_res, y_train_res)

    y_test_pred = clfxg.predict(X_test)
    # Fixing the label order guarantees a 2x2 matrix even if a class is missing from a split.
    conf = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

    # Cast to built-in int: the component outputs are declared as `int`, and
    # numpy.int64 is not an instance of Python's int.
    return (int(conf[0][0]), int(conf[0][1]), int(conf[1][0]), int(conf[1][1]))

In [8]:
# Wrap rf_model as a Kubeflow Pipelines component; the generated component
# definition is also written to ./rf-model-func.yaml.
# NOTE(review): rf_model uses the standard-library urllib, not urllib3, so the
# urllib3 pin looks unnecessary -- confirm before removing it.
kfp_rf_model = kfp.components.func_to_container_op(
    func=rf_model,
    output_component_file='./rf-model-func.yaml',
    packages_to_install=[
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'imbalanced-learn==0.6.2',
        'urllib3==1.24.2',
        'xgboost==1.0.2',
    ],
)

In [10]:
import kfp.dsl as dsl

@dsl.pipeline(name='Read-Pipeline',description='ions using Random Forest Algorithm')
def Justread_func(file_name = "https://raw.githubusercontent.com/rujual/telco_churn/master/Data.csv", 
                n_estimators = 100):
    """Single-step pipeline that runs the ``kfp_rf_model`` component.

    Args:
        file_name: URL of the CSV data set handed to the component.
        n_estimators: Estimator count handed to the component.
    """
    # Forward both pipeline parameters so that values supplied at run time reach
    # the component (previously a constant 100 was passed for n_estimators).
    rf_model_task = kfp_rf_model(file_name, n_estimators=n_estimators)

#For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax
#For an operation with multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax

In [11]:
import kfp.compiler as comp

# Compile the pipeline into an archive named after the pipeline function.
pipeline_func = Justread_func
pipeline_filename = pipeline_func.__name__ + '.pipeline.tar.gz'
comp.Compiler().compile(pipeline_func=pipeline_func, package_path=pipeline_filename)