<a href="https://colab.research.google.com/github/sophiabj/03-presidential-election/blob/master/Presidential%20Elections%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip for the current user, then install pinned versions of the
# analysis libraries into the user site-packages.
# The pandas and scikit-learn pins are the same versions that train()
# installs inside its own container.
!python -m pip install --user --upgrade pip

!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22  --user

In [None]:
# Install or upgrade the kfp package for the current user.
# NOTE(review): no version is pinned here, unlike the libraries above.
!pip3 install kfp --upgrade --user

In [None]:
# Print the location of the dsl-compile executable if it is on PATH.
# NOTE(review): presumably a sanity check that the kfp install above succeeded — confirm.
!which dsl-compile

In [None]:
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [None]:
# Absolute output directory path.
# NOTE(review): hard-coded, and no cell shown in this notebook reads out_dir —
# confirm it is still needed.
out_dir = "/home/jovyan/Artificial-Neural-Network/data/out/"

In [None]:
def train(data_path, model_file):
    """Train a logistic-regression party classifier and save it with its data splits.

    This function is converted into a pipeline step with
    ``func_to_container_op``, so it is written to be self-contained: every
    import lives inside the body and the third-party packages are installed
    at run time.

    Steps: download the 1976-2016 presidential results CSV, keep only the
    republican/democrat rows, drop the columns listed below, split 70/30 with
    a fixed seed, min-max scale the features (scaler fitted on the training
    split only) and fit ``LogisticRegression`` to predict ``party``.

    Args:
        data_path: Directory that receives the output files.
        model_file: File name, inside ``data_path``, for the pickled model.

    Side effects:
        Writes ``<data_path>/preprocessed-data.npz`` (arrays ``xtrain``,
        ``xtest``, ``ytrain``, ``ytest``) and ``<data_path>/<model_file>``.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    import os
    import pickle
    import subprocess
    import sys

    # check=True stops the step right here when pip fails, instead of
    # surfacing later as an ImportError.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install',
         'pandas==0.23.4', 'scikit-learn==0.22'],
        check=True,
    )

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # raw.githubusercontent.com serves the CSV bytes; the github.com
    # ".../blob/..." address returns the HTML viewer page, which is not CSV.
    csv_url = ('https://raw.githubusercontent.com/sophiabj/'
               '03-presidential-election/master/data/president-1976-2016.csv')
    df = pd.read_csv(csv_url)

    # Keep only the two major parties; drop() returns a new frame, so the
    # filtered slice is never mutated in place.
    major_parties = df[df['party'].isin(['republican', 'democrat'])]
    model_frame = major_parties.drop(
        columns=['state_ic', 'notes', 'state', 'state_po',
                 'office', 'writein', 'candidate'])

    X = model_frame.drop(columns='party')
    y = model_frame['party']

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test statistics leak into training.
    scaler = MinMaxScaler()
    normalised_train_df = pd.DataFrame(
        scaler.fit_transform(x_train), columns=x_train.columns)
    normalised_test_df = pd.DataFrame(
        scaler.transform(x_test), columns=x_train.columns)

    log_reg = LogisticRegression()
    log_reg.fit(normalised_train_df, y_train)

    os.makedirs(data_path, exist_ok=True)

    # Labels are stored as fixed-width string arrays rather than object
    # arrays, so the archive can be read back without unpickling.
    np.savez_compressed(
        os.path.join(data_path, 'preprocessed-data.npz'),
        xtrain=normalised_train_df.values,
        xtest=normalised_test_df.values,
        ytrain=y_train.values.astype(str),
        ytest=y_test.values.astype(str))
    print("Preprocessing Done")

    # Save the fitted model under the requested file name.
    with open(os.path.join(data_path, model_file), 'wb') as file:
        pickle.dump(log_reg, file)

    print("Model Trained")

In [None]:
def predict(data_path, model_file):
    """Evaluate the saved model on the saved test split and write the predictions.

    This function is converted into a pipeline step with
    ``func_to_container_op``, so it is written to be self-contained: every
    import lives inside the body and the third-party packages are installed
    at run time.

    Args:
        data_path: Directory holding ``preprocessed-data.npz`` and the model
            file; also receives ``model_result.txt``.
        model_file: File name, inside ``data_path``, of the pickled model.

    Side effects:
        Prints accuracy and F1, and writes ``<data_path>/model_result.txt``.

    Raises:
        subprocess.CalledProcessError: If installing the dependencies fails.
    """
    import pickle
    import subprocess
    import sys

    # Same scikit-learn pin as train(), so the pickled estimator is loaded by
    # the version that wrote it. check=True stops the step if pip fails.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install',
         'numpy', 'scikit-learn==0.22'],
        check=True,
    )

    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score

    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on a model file produced by this pipeline's own training step.
    with open(f'{data_path}/{model_file}', 'rb') as file:
        log_reg = pickle.load(file)

    # Load and unpack the test data. allow_pickle=True keeps archives whose
    # label arrays were stored with object dtype readable; the same trust
    # caveat as for the model file applies.
    with np.load(f'{data_path}/preprocessed-data.npz',
                 allow_pickle=True) as preprocessed_data:
        x_test = preprocessed_data['xtest']
        y_test = preprocessed_data['ytest']

    # Evaluate the model and print results.
    log_reg_pred = log_reg.predict(x_test)

    # The labels are strings, so the default pos_label=1 would raise; use the
    # second entry of classes_ as the positive class.
    positive_label = log_reg.classes_[1]
    print('Model \nAccuracy score = {} \nF1_score = {}' .format(
        accuracy_score(y_test, log_reg_pred),
        f1_score(y_test, log_reg_pred, pos_label=positive_label)))

    with open(f'{data_path}/model_result.txt', 'w') as result:
        result.write(" Prediction: {},\nActual: {} ".format(log_reg_pred, y_test))

    print('Prediction has been saved successfully!')
    

[[ 70 101]
 [ 79  87]]


In [None]:
# Wrap each step function as a pipeline component; both use the same
# slim Python 3.7 base image.
train_op = comp.func_to_container_op(
    func=train,
    base_image="python:3.7-slim",
)
predict_op = comp.func_to_container_op(
    func=predict,
    base_image="python:3.7-slim",
)

In [None]:
# Create the Kubeflow Pipelines client that the last cell uses to submit the run.
# NOTE(review): no host is passed, so connection settings come from kfp's
# defaults — confirm this matches the target cluster.
client = kfp.Client()

In [None]:
# Define the pipeline
@dsl.pipeline(
    name='Presidential Elections Pipeline',
    description='An ML pipeline that predicts outcome of presidential elections.'
)
def presidential_election_pipeline(
    data_path: str,
    log_reg_file: str
):
    """Train the model, evaluate it, then print the saved predictions.

    Args:
        data_path: Mount point of the shared volume inside every step; all
            files are read from and written to this directory.
        log_reg_file: File name, inside ``data_path``, of the pickled model.
    """
    # Define volume to share data between components.
    vop = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO)

    # Create presidential elections training component.
    presidential_elections_training_container = train_op(data_path, log_reg_file) \
        .add_pvolumes({data_path: vop.volume})

    # Create presidential elections prediction component. It mounts the
    # volume as left by the training step.
    presidential_elections_predict_container = predict_op(data_path, log_reg_file) \
        .add_pvolumes({data_path: presidential_elections_training_container.pvolume})

    # Print the result of the prediction. The file name must match the one
    # predict() writes (model_result.txt).
    presidential_elections_result_container = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: presidential_elections_predict_container.pvolume},
        arguments=['cat', f'{data_path}/model_result.txt']
    )

In [None]:
# Run-time values passed to the pipeline as arguments in the submission cell.
# DATA_PATH is the directory the steps read from and write to.
DATA_PATH = '/mnt'
# File name for the saved model.
# NOTE(review): train() writes this file with pickle, so the .h5 extension is
# only a name — consider renaming to avoid implying HDF5.
MODEL_PATH='presidential_elections_model.h5'

In [None]:
# Pipeline function handed to the compiler and to the client when the run is
# submitted. It must name the function defined above with @dsl.pipeline.
pipeline_func = presidential_election_pipeline

In [None]:
# Name of the experiment the run is filed under; also used as the base name
# of the compiled pipeline archive.
experiment_name = 'presidential_elections_kubeflow'
run_name = pipeline_func.__name__ + ' run'

# Keys must match the pipeline function's parameter names
# (data_path, log_reg_file).
arguments = {"data_path": DATA_PATH,
             "log_reg_file": MODEL_PATH}

# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func,
  '{}.zip'.format(experiment_name))

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(pipeline_func,
                                                  experiment_name=experiment_name,
                                                  run_name=run_name,
                                                  arguments=arguments)