In [1]:
# Telco Churn Pipeline

import kfp

In [2]:
## Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import typing

In [3]:
## Read Data

from typing import NamedTuple
from kfp.components import *

def read_data(file_name: InputPath('CSV')) -> 'pd.DataFrame':
    """Load the raw Telco churn CSV and return a cleaned DataFrame.

    Cells holding a single space are treated as missing values, the
    identifier columns are removed, and every row that still contains a
    missing value is dropped.

    Args:
        file_name: Location of the raw CSV, passed straight to
            ``pandas.read_csv``.

    Returns:
        The cleaned DataFrame: no ``customerID`` column (and no
        ``cluster number`` column when the file has one) and no rows
        containing NaN.

    Raises:
        KeyError: If one of the expected Telco columns is absent.
    """
    # Imports stay inside the function so it remains self-contained when it
    # is packaged as a pipeline component.
    import numpy as np
    import pandas as pd

    # NOTE(review): the file must be readable from inside the component's
    # container, not only on the notebook host -- confirm how the CSV is
    # delivered to this step.
    df_churn = pd.read_csv(file_name)

    expected_cols = ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
                     'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
                     'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                     'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                     'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

    # Turn single-space placeholders into NaN so dropna() can remove them.
    for col in expected_cols:
        df_churn[col] = df_churn[col].replace(" ", np.nan)

    # Identifier columns carry no predictive signal. They are dropped before
    # dropna() so a gap in one of them cannot discard an otherwise valid row.
    # 'cluster number' is not among the expected columns, so its absence is
    # tolerated.
    df_churn = df_churn.drop(columns=['customerID', 'cluster number'], errors='ignore')
    df_churn = df_churn.dropna()

    return df_churn


In [4]:
# Wrap read_data as a KFP component and write its spec to a YAML file.
kfp_read_data = kfp.components.create_component_from_func(
    func=read_data,
    output_component_file='./read-data-func.yaml',
    base_image='fastgenomics/sklearn',
    packages_to_install=['pandas', 'matplotlib', 'numpy', 'scikit-learn'],
)

# Instantiate the component against the raw dataset hosted on GitHub.
read_data_task = kfp_read_data(
    file_name='https://raw.githubusercontent.com/rujual/telco_churn/master/Data.csv',
)

In [8]:
## One-Hot-Encode

from typing import NamedTuple
from kfp.components import *

def one_hot_encode(input_df: 'pd.DataFrame') -> 'pd.DataFrame':
    """Encode the categorical Telco churn columns as numbers.

    Yes/No columns (including the ``Churn`` target) become 1/0, ``gender``
    becomes 1 for "Male" and 0 for "Female", and every multi-valued
    categorical column is replaced by one indicator column per category,
    named ``<column>#<category>``.

    Args:
        input_df: Cleaned churn data. It is not modified; the encoding is
            applied to a copy.

    Returns:
        A new DataFrame with the encoded columns. The indicator columns are
        appended after the remaining original columns.

    Raises:
        KeyError: If one of the columns to encode is absent.
    """
    # Imports stay inside the function so it remains self-contained when it
    # is packaged as a pipeline component.
    import pandas as pd

    yes_no = {"Yes": 1, "No": 0}
    binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
    category_cols = ['PaymentMethod', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                     'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract']

    # Work on a copy: the original code aliased input_df and therefore
    # rewrote the caller's frame as a side effect.
    df_churn = input_df.copy()

    for col in binary_cols + ['Churn']:
        df_churn[col] = df_churn[col].replace(yes_no)

    df_churn['gender'] = df_churn['gender'].replace({"Male": 1, "Female": 0})

    for col in category_cols:
        dummies = pd.get_dummies(df_churn[col], drop_first=False)
        dummies = dummies.add_prefix("{}#".format(col))
        # Swap the source column for its indicator columns.
        df_churn = df_churn.drop(columns=col).join(dummies)

    return df_churn

In [11]:
# Wrap one_hot_encode as a KFP component and write its spec to a YAML file.
kfp_one_hot_encode = kfp.components.create_component_from_func(
    func=one_hot_encode,
    output_component_file='./one-hot-encode-func.yaml',
    base_image='fastgenomics/sklearn',
    packages_to_install=['pandas', 'matplotlib', 'numpy', 'scikit-learn'],
)

# Feed the single output of the read step into the encoder. `.output` is the
# one output reference; `.outputs` is the whole name -> reference mapping and
# would be passed along as an opaque constant instead of wiring the steps.
one_hot_encode_task = kfp_one_hot_encode(read_data_task.output)

In [12]:
## Random Forest Model
import numpy as np
from typing import NamedTuple
def rf_model(input_df: 'pd.DataFrame', n_estimators: int = 100) -> NamedTuple('Outputs', [('Cf1', int), ('Cf2', int),
                                                                                     ('Cf3', int), ('Cf4', int)]):
    """Train a random forest churn classifier and report its confusion matrix.

    The data is split into train and test parts, the training part is
    balanced with SMOTE, a random forest is fitted on it, and the model is
    evaluated on the untouched test part.

    Args:
        input_df: Fully numeric feature table that includes the target column
            ``Churn`` encoded as 0/1.
        n_estimators: Number of trees in the forest.

    Returns:
        The 2x2 confusion matrix flattened row by row as plain ints:
        ``(Cf1, Cf2, Cf3, Cf4)`` = (true 0 / predicted 0, true 0 / predicted 1,
        true 1 / predicted 0, true 1 / predicted 1).

    Raises:
        KeyError: If ``input_df`` has no ``Churn`` column.
    """
    # Imports stay inside the function so it remains self-contained when it
    # is packaged as a pipeline component.
    from imblearn.over_sampling import SMOTE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split

    y1 = input_df['Churn']
    X1 = input_df.drop(columns=['Churn'])

    X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

    # Oversample the minority class in the training split only; the test
    # split keeps its natural class balance so the metrics stay honest.
    sm = SMOTE(random_state=0)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # max_depth/criterion/max_features are the previously chosen settings
    # ('sqrt' is what 'auto' meant for classifiers); the tree count now
    # follows the n_estimators argument instead of being ignored.
    rfc_best = RandomForestClassifier(random_state=42, max_features='sqrt',
                                      n_estimators=n_estimators, max_depth=8,
                                      criterion='gini')
    rfc_best.fit(X_train_res, y_train_res)

    y_test_pred = rfc_best.predict(X_test)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is missing
    # from the test split.
    conf = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

    # Cast numpy integers to built-in ints to match the declared output types.
    return (int(conf[0][0]), int(conf[0][1]), int(conf[1][0]), int(conf[1][1]))

In [13]:
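    # TODO: persist the trained model with joblib. This call only works inside
    # rf_model, where the fitted model is in scope, and needs an output-path
    # argument such as `modelopfile`.
    #joblib.dump(rfc_best, modelopfile)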

In [15]:
# Wrap rf_model as a KFP component and write its spec to a YAML file.
kfp_rf_model = kfp.components.create_component_from_func(
    func=rf_model,
    output_component_file='./rf-model-func.yaml',
    base_image='fastgenomics/sklearn',
    packages_to_install=['pandas', 'matplotlib', 'numpy', 'scikit-learn', 'imbalanced-learn'],
)

# Feed the encoder's single output into the model step. `.output` is the one
# output reference; `.outputs` is the whole name -> reference mapping and
# would be passed along as an opaque constant instead of wiring the steps.
rf_model_task = kfp_rf_model(one_hot_encode_task.output, 100)

In [26]:
import kfp.dsl as dsl

@dsl.pipeline(name='Telco-Churn-Pipeline',description='A pipeline that processes Telco Churn dataset from Kaggle and performs ML-Predictions using Random Forest Algorithm')
def Telco_Churn(file_name = "https://raw.githubusercontent.com/rujual/telco_churn/master/Data.csv", 
                n_estimators = 100):
    """Chain the three components: read data -> one-hot encode -> random forest.

    Args:
        file_name: Location of the raw churn CSV handed to the read step.
        n_estimators: Number of trees for the random forest step.
    """
    read_data_task = kfp_read_data(file_name)
    one_hot_encode_task = kfp_one_hot_encode(read_data_task.output)
    # Forward the pipeline parameter; it used to be shadowed by a literal 100,
    # so changing it at run time had no effect.
    rf_model_task = kfp_rf_model(one_hot_encode_task.output, n_estimators = n_estimators)
    


In [27]:
pipeline_func = Telco_Churn
# The compiler chooses the package format from the file extension, so the
# name must end in a supported one (.zip, .tar.gz, .tgz, .yaml or .yml).
# The previous name ended in a bare '.' and was rejected.
pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'

import kfp.compiler as comp
comp.Compiler().compile(pipeline_func, pipeline_filename)