In [None]:
# Upgrade pip itself in the user site-packages before installing anything else.
!python -m pip install --user --upgrade pip

# Install pinned versions of the analysis libraries into the user site-packages.
# NOTE(review): `!pip3` may resolve to a different interpreter than the running
# kernel; confirm the packages land in the kernel's environment.
!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22  --user

In [None]:
# Install (or upgrade to) the latest Kubeflow Pipelines SDK in the user site-packages.
# NOTE(review): the version is not pinned, so re-running this cell later may pull
# a release with a different API than the one this notebook was written against.
!pip3 install kfp --upgrade --user

In [None]:
# Sanity check: print the location of the `dsl-compile` executable if it is on PATH
# (presumably provided by the kfp install above; an empty result means it was not found).
!which dsl-compile

In [None]:
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [None]:
# Directory used when running train()/predict() locally in this notebook.
# NOTE(review): hard-coded absolute path; adjust it to your own environment.
out_dir = "/home/jovyan/Artificial-Neural-Network/data/out/"

In [1]:
def train(data_path, model_file):
    """Fit a logistic-regression party classifier and persist it with its test split.

    The function is deliberately self-contained (all imports are local and the
    required packages are installed at call time) so that it can be wrapped
    with ``comp.func_to_container_op`` and executed inside a bare Python image.

    Args:
        data_path: Directory where the outputs are written. Created if missing.
        model_file: File name (relative to ``data_path``) for the pickled model.

    Side effects:
        * Downloads the election CSV over HTTPS.
        * Writes ``<data_path>/<model_file>`` (pickled ``LogisticRegression``).
        * Writes ``<data_path>/test_data`` (pickled ``(X_test_scaled, y_test)``
          tuple) for the prediction step.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install the third-party dependencies before importing them; fail loudly
    # instead of continuing into a confusing ImportError.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install',
         'pandas==0.23.4', 'scikit-learn==0.22'],
        check=True,
    )

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # Use the raw-content URL: the github.com/.../blob/... page is HTML, not CSV.
    csv_url = ('https://raw.githubusercontent.com/sophiabj/'
               '03-presidential-election/master/data/president-1976-2016.csv')
    df = pd.read_csv(csv_url)

    # Keep only the two major parties, then drop the columns that are not used
    # as features. Building a new frame (rather than dropping in place on a
    # filtered slice) avoids pandas' SettingWithCopyWarning.
    is_major_party = df['party'].isin(['republican', 'democrat'])
    new_dataframe = df.loc[is_major_party].drop(
        columns=['state_ic', 'notes', 'state', 'state_po',
                 'office', 'writein', 'candidate'],
    )

    X = new_dataframe.drop('party', axis=1)
    y = new_dataframe['party']

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no information leaks from test to train.
    scaler = MinMaxScaler()
    normalised_train_df = pd.DataFrame(
        scaler.fit_transform(x_train), columns=x_train.columns)
    normalised_test_df = pd.DataFrame(
        scaler.transform(x_test), columns=x_test.columns)

    log_reg = LogisticRegression()
    log_reg.fit(normalised_train_df, y_train)

    # scikit-learn estimators have no Keras-style evaluate(); compute the
    # accuracy from the predictions instead.
    test_acc = accuracy_score(y_test, log_reg.predict(normalised_test_df))
    print('Test accuracy:', test_acc)

    os.makedirs(data_path, exist_ok=True)

    # Save the model to the designated location. scikit-learn estimators have
    # no save() method; pickle is used for persistence.
    with open(os.path.join(data_path, model_file), 'wb') as f:
        pickle.dump(log_reg, f)

    # Save the (already scaled) test data as a pickle file for the prediction step.
    with open(os.path.join(data_path, 'test_data'), 'wb') as f:
        pickle.dump((normalised_test_df, y_test), f)

In [None]:
# Run the training step locally. train() has no return value (it writes the
# model and test data under out_dir), so nothing is bound here.
train(out_dir, "model")

In [12]:
def predict(data_path, model_file):
    """Score the pickled test split with the pickled model and save the result.

    The function is self-contained (local imports, dependencies installed at
    call time) so it can be wrapped with ``comp.func_to_container_op`` and run
    inside a bare Python image.

    Args:
        data_path: Directory containing the model file and ``test_data``;
            ``result.txt`` is written here as well.
        model_file: File name (relative to ``data_path``) of the pickled,
            already fitted scikit-learn estimator.

    Side effects:
        Prints the accuracy and confusion matrix and writes
        ``<data_path>/result.txt``.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
        FileNotFoundError: If the model or test-data file is missing.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Unpickling the model and the test DataFrame needs the same library
    # versions that produced them, so install them before importing.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install',
         'pandas==0.23.4', 'scikit-learn==0.22'],
        check=True,
    )

    import numpy as np
    from sklearn.metrics import accuracy_score, confusion_matrix

    # SECURITY: pickle.load can execute arbitrary code. Only point this at
    # files produced by a trusted training run.
    with open(os.path.join(data_path, model_file), 'rb') as f:
        log_reg = pickle.load(f)

    # Load and unpack the test data: (scaled features, true labels).
    with open(os.path.join(data_path, 'test_data'), 'rb') as f:
        normalised_test_df, y_test = pickle.load(f)

    # predict() already returns class labels, so no probability threshold is
    # applied.
    new_predictions = log_reg.predict(normalised_test_df)

    cnf_mat = confusion_matrix(y_test, new_predictions)
    accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)

    # Accuracy as a percentage rounded to two decimals.
    print('Accuracy: {}'.format(round(accuracy * 100, 2)))
    print(cnf_mat)

    # Write the predicted and actual labels side by side.
    with open(os.path.join(data_path, 'result.txt'), 'w') as result:
        result.write(" Prediction: {}, Actual: {} ".format(
            new_predictions, np.asarray(y_test)))

    print('Prediction has been saved successfully!')
    

[[ 70 101]
 [ 79  87]]


In [None]:
# Run the prediction step locally against the files found under out_dir.
predict(out_dir, "model")

In [None]:
# Turn the two Python functions into pipeline components that run on the
# python:3.7-slim image.
train_op = comp.func_to_container_op(
    train,
    base_image="python:3.7-slim",
)
predict_op = comp.func_to_container_op(
    predict,
    base_image="python:3.7-slim",
)

In [None]:
# Kubeflow Pipelines API client used later to submit the run.
# NOTE(review): no host is passed, so this relies on kfp's default endpoint
# discovery — presumably the in-cluster service; confirm for your deployment.
client = kfp.Client()

In [None]:
# Pipeline definition: create a volume, train, predict, then print the result.
# `data_path` and `model_file` are the pipeline's run-time parameters.
@dsl.pipeline(
    name='Presidential Elections Pipeline',
    description='An ML pipeline that predicts outcome of presidential elections.',
)
def presidential_election_pipeline(data_path: str, model_file: str):

    # Volume mounted at `data_path` in every step so the steps can exchange files.
    shared_volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Training step: mounts the freshly created volume.
    training_step = (
        train_op(data_path, model_file)
        .add_pvolumes({data_path: shared_volume_op.volume})
    )

    # Prediction step: mounts the volume as left by the training step, which
    # also orders it after training.
    prediction_step = (
        predict_op(data_path, model_file)
        .add_pvolumes({data_path: training_step.pvolume})
    )

    # Final step: print the result file written by the prediction step.
    print_result_step = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: prediction_step.pvolume},
        arguments=['cat', f'{data_path}/result.txt'],
    )

In [None]:
# Value passed as the pipeline's `data_path` argument: the location where the
# shared volume is mounted in each step.
DATA_PATH = "/mnt"
# Value passed as the pipeline's `model_file` argument (a file name under DATA_PATH).
# NOTE(review): the ".h5" suffix suggests an HDF5/Keras file, while the training
# step fits a scikit-learn LogisticRegression — confirm the intended extension.
MODEL_PATH = "presidential_elections_model.h5"

In [None]:
# Pipeline function to compile and run. The function defined in this notebook
# is `presidential_election_pipeline`; the previous name did not exist and
# raised NameError.
pipeline_func = presidential_election_pipeline

In [None]:
# Experiment under which the run is grouped; also used as the base name of the
# compiled pipeline archive. It must be defined before it is used below.
experiment_name = 'presidential_elections_kubeflow'
run_name = pipeline_func.__name__ + ' run'

# Values for the pipeline's two parameters.
arguments = {"data_path": DATA_PATH,
             "model_file": MODEL_PATH}

# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func,
                                '{}.zip'.format(experiment_name))

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(pipeline_func,
                                                  experiment_name=experiment_name,
                                                  run_name=run_name,
                                                  arguments=arguments)