# Telco Churn Pipeline

In [1]:
import kfp
## Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import typing

## Read Data

In [9]:
from typing import NamedTuple

def add_multiply_two_numbers(a: float, b: float) -> NamedTuple('Outputs', [('sum', float), ('product', float)]):
    """Returns sum and product of two arguments"""
    return (a + b, a * b)

add_multiply_op = create_component_from_func(add_multiply_two_numbers)

# The component function can be called with arguments to create a task:
add_multiply_task = add_multiply_op(1, 3)

In [11]:
print(add_multiply_op(1, 3))

TaskSpec(component_ref=ComponentReference(name=None, digest=None, tag=None, url=None, spec=ComponentSpec(name='Add multiply two numbers', description='Returns sum and product of two arguments', metadata=None, inputs=[InputSpec(name='a', type='Float', description=None, default=None, optional=False), InputSpec(name='b', type='Float', description=None, default=None, optional=False)], outputs=[OutputSpec(name='sum', type='Float', description=None), OutputSpec(name='product', type='Float', description=None)], implementation=ContainerImplementation(container=ContainerSpec(image='tensorflow/tensorflow:1.13.2-py3', command=['python3', '-u', '-c', 'def add_multiply_two_numbers(a , b )      :\n    """Returns sum and product of two arguments"""\n    return (a + b, a * b)\n\ndef _serialize_float(float_value: float) -> str:\n    if isinstance(float_value, str):\n        return float_value\n    if not isinstance(float_value, (float, int)):\n        raise TypeError(\'Value "{}" has type "{}" instead 

In [15]:
from typing import NamedTuple
from kfp.components import *

def read_data_func(file_name: InputPath('CSV')-> OutputPath('CSV'):
                   #-> NamedTuple('Outputs',
                    #                                                                       [('Cols_drop', int),
                    #                                                                        ('Cols_retained', int)]):


    ## Import Required Libraries
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import sklearn
    
    #This line may cause problems as file is on the system and not inside container

    df = pd.read_csv(file_name)
    col1 = len(df.columns)
    df=df.drop(columns=['merchant_number','visa_interchange_level','avg_cb_case_type_others','avg_cb_status_others',
     'cross_border_trans',
     'domestic_trans',
     'clean_trans',
     'error_trans',
     'purchase_trans',
     'refund_trans',
     'other_trans',
     'avg_amt_per_tran',
     'avg_amt_per_day',
     'avg_amt_per_month',
     'avg_trans_per_day',
     'avg_trans_per_month',
     'avg_chargeback_amount',
     'avg_cb_received_days',
     'avg_cb_resolve_days'])
    
    col2 = len(df.columns)
    df.to_csv(out_file_name)
    
    return (col1-col2, col2)


In [16]:
kfp_read_data = kfp.components.create_component_from_func(func = read_data_func, 
                                                          output_component_file = './read-data-func.yaml',
                                                          base_image = 'fastgenomics/sklearn',
                                                          packages_to_install = ['pandas','matplotlib','numpy','sklearn'])

read_data_task = kfp_read_data('Merchant_Attrition_Data.csv','Cleaned_data.csv')

## One-Hot-Encode

In [0]:
from typing import NamedTuple

def one_hot_encode(file_name: InputPath('CSV'), out_file1_name: OutputPath('CSV'), 
                   out_file2_name: OutputPath('CSV')) -> None:
    ## Import Required Libraries
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import sklearn
    
    X = pd.read_csv(file_name)  
    y = X['churn_flag']
    X = X.drop(['churn_flag'],axis=1)
    X_dummies = pd.get_dummies(X,drop_first=True)
    X1 = X_dummies.apply(pd.to_numeric, errors='coerce')
    y1 = y.apply(pd.to_numeric, errors='coerce')
    
    #saving files may need a PV allocation to container
    
    X1.to_csv(out_file1_name, mode='w+')
    y1.to_csv(out_file2_name, mode='w+')


In [0]:
kfp_one_hot_encode = kfp.components.create_component_from_func(func = one_hot_encode, 
                                                          output_component_file = './one-hot-encode-func.yaml',
                                                          base_image = 'fastgenomics/sklearn',
                                                          packages_to_install = ['pandas','matplotlib','numpy','sklearn'])
one_hot_encode_task = kfp_one_hot_encode('Cleaned_data.csv','One_Hot_encoded_data.csv','Churn_flags.csv')

## Random Forest Model

In [0]:
from typing import NamedTuple
def rf_model(ip_file1: InputPath('CSV'), ip_file2: InputPath('CSV'), modelopfile: OutputPath('joblib'))-> None:
    from sklearn.ensemble import RandomForestClassifier
    import joblib
    X1 = pd.read_csv(ip_file1)
    y1 = pd.read_csv(ip_file2)
    
    rf = RandomForestClassifier()
    rf.fit(X1, y1)
    joblib.dump(rf, modelopfile)


In [0]:
kfp_rf_model = kfp.components.create_component_from_func(func = rf_model_train, 
                                                          output_component_file = './rf-model-func.yaml',
                                                          base_image = 'fastgenomics/sklearn',
                                                          packages_to_install = ['pandas','matplotlib','numpy','sklearn','joblib'])

rf_model_task = kfp_rf_model('One_Hot_encoded_data.csv','Churn_flags.csv','model.joblib')