<a href="https://colab.research.google.com/github/HamoyeHQ/g05-used-cars/blob/master/Preprocessing_and_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip3 install kfp --upgrade --user



In [16]:
# Upgrade pip itself (into the user site) before installing anything else.
!python -m pip install --user --upgrade pip

# Install the data libraries into the notebook environment.
# pandas and scikit-learn are pinned; scipy is left unpinned.
!pip3 install pandas==0.23.4 scipy scikit-learn==0.22



In [1]:
# Output directory: handed as `data_path` to the local preprocess / train /
# predict calls, so their artifacts land in the current working directory.
output_dir = "."

In [2]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [3]:
#Preprocessing component
def preprocess (data_path):
  """Download the used-cars CSV, split it, standardize it and save the arrays.

  Every column except the last is treated as a feature and the last column
  as the label. The rows are split 70/30 into train and test sets
  (random_state=42). The features are then standardized with statistics
  computed from the training rows only, so nothing about the test rows
  leaks into the data the model is fitted on.

  All imports and installs live inside the function because it is also
  packaged as a self-contained pipeline component.

  Args:
    data_path: Directory in which `preprocessed-data.npz` is written. The
      archive holds the arrays `X_train`, `X_test`, `y_train`, `y_test`.
  """
  # import libraries
  import sys, subprocess

  # Install the pinned dependencies at run time. A failed install is reported
  # instead of being silently ignored; execution continues so that an
  # environment which already provides the packages keeps working.
  for requirement in ('pandas==0.23.4', 'scikit-learn==0.22', 'numpy==1.17.1'):
    completed = subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])
    if completed.returncode != 0:
      print(f"WARNING: 'pip install {requirement}' exited with code {completed.returncode}",
            file=sys.stderr)

  import pandas as pd
  import numpy as np
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler

  # reading the dataset from the csv file
  df_new = pd.read_csv("https://raw.githubusercontent.com/sophiabj/g05-used-cars/master/data/new_vehicle.csv")
  # selecting features, X
  X = df_new.iloc[:, :-1].values.astype(float)
  # selecting labels, y
  y = df_new.iloc[:, -1].values

  # split into train and test BEFORE scaling; the split only depends on the
  # number of rows and random_state, so the same rows end up in each set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  # normalize the data: fit on the training rows only, then apply the same
  # transformation to the held-out rows
  scaler = StandardScaler().fit(X_train)
  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  #Save output file to path
  np.savez_compressed(f'{data_path}/preprocessed-data.npz',
                     X_train=X_train,
                     X_test=X_test,
                     y_train=y_train,
                     y_test=y_test)
  print("Done preprocessing..")


In [4]:
# Run the preprocessing step locally; it writes preprocessed-data.npz into output_dir.
preprocess(output_dir)

Done preprocessing..


In [28]:
#Training component
def train(data_path):
  """Fit an ExtraTreesRegressor on the saved training split and pickle it.

  Reads `X_train` and `y_train` from `<data_path>/preprocessed-data.npz`,
  fits an ExtraTreesRegressor (400 trees, random_state=42) and writes the
  fitted estimator with pickle to `<data_path>/model`.

  All imports and installs live inside the function because it is also
  packaged as a self-contained pipeline component.

  Args:
    data_path: Directory that contains `preprocessed-data.npz` and receives
      the `model` file.
  """
  import pickle
  import sys
  import subprocess

  # Install dependencies at run time, in the original order (numpy is pinned
  # last). A failed install is reported instead of being silently ignored;
  # execution continues so a pre-provisioned environment keeps working.
  for requirement in ('scipy', 'scikit-learn', 'numpy==1.16.1'):
    completed = subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])
    if completed.returncode != 0:
      print(f"WARNING: 'pip install {requirement}' exited with code {completed.returncode}",
            file=sys.stderr)

  import numpy as np
  from sklearn.ensemble import ExtraTreesRegressor

  # np.load on an .npz returns a lazily-read archive backed by an open file;
  # the context manager closes it once the arrays have been extracted.
  with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
    X_train = preprocessed_data['X_train']
    y_train = preprocessed_data['y_train']

  r = ExtraTreesRegressor(n_estimators=400, random_state=42)
  # ravel() hands the labels to fit() as a flat 1-D array
  r.fit(X_train, y_train.ravel())

  with open(f'{data_path}/model', 'wb') as f:
    pickle.dump(r, f)

  print("Done training")
    
 

In [None]:
# Run the training step locally; it reads preprocessed-data.npz from output_dir
# and writes the pickled model next to it.
train(output_dir)

In [None]:
#Prediction component
def predict (data_path):
  """Score the pickled model on the saved test split and store the predictions.

  Loads the estimator from `<data_path>/model` and the arrays `X_test` and
  `y_test` from `<data_path>/preprocessed-data.npz`. It prints the mean
  absolute error, mean squared error and R2 score (rounded to 3 decimals)
  and pickles the predictions to `<data_path>/model_result`.

  All imports and installs live inside the function because it is also
  packaged as a self-contained pipeline component.

  Args:
    data_path: Directory that contains `model` and `preprocessed-data.npz`
      and receives the `model_result` file.
  """
  import pickle
  import sys, subprocess

  # Install dependencies at run time. A failed install is reported instead of
  # being silently ignored; execution continues so a pre-provisioned
  # environment keeps working.
  completed = subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn', 'numpy'])
  if completed.returncode != 0:
    print(f"WARNING: 'pip install scikit-learn numpy' exited with code {completed.returncode}",
          file=sys.stderr)

  import numpy as np
  from sklearn import metrics
  from sklearn.metrics import r2_score

  #Load saved model
  # NOTE(review): pickle.load executes arbitrary code from the file it reads.
  # This is only safe while `data_path` holds the file written by the train
  # step; never point it at an untrusted location.
  with open(f'{data_path}/model','rb') as file:
    model = pickle.load(file)

  # np.load on an .npz returns an archive backed by an open file; the context
  # manager closes it once the arrays have been extracted.
  with np.load(f'{data_path}/preprocessed-data.npz') as preprocessed_data:
    X_test = preprocessed_data['X_test']
    y_test = preprocessed_data['y_test']

  y_predET = model.predict(X_test)

  print('Mean Absolute Error: ', round(metrics.mean_absolute_error(y_test, y_predET), 3))
  print('Mean Squared Error: ', round(metrics.mean_squared_error(y_test, y_predET), 3))
  print('R2 score: ', round(r2_score(y_test, y_predET), 3))

  #save result
  with open(f'{data_path}/model_result', 'wb') as result:
    pickle.dump(y_predET, result)

  print("Prediction saved!")


In [None]:
# Run the prediction step locally; it reads the model and test split from
# output_dir, prints the metrics and writes model_result there.
predict(output_dir)

In [None]:
# Packaging components: wrap each step function as a lightweight KFP
# component that runs on the python:3.7-slim image.
preprocess_op, train_op, predict_op = (
    comp.func_to_container_op(step_fn, base_image="python:3.7-slim")
    for step_fn in (preprocess, train, predict)
)

In [None]:
#connecting to kfp
import os
import kfp

# Endpoint of the Kubeflow Pipelines deployment. Set the KFP_HOST environment
# variable to target a different deployment; the default is the endpoint this
# notebook was originally written against.
KFP_HOST = os.environ.get(
    'KFP_HOST',
    '2c688dd9c0eb2f26-dot-us-central2.pipelines.googleusercontent.com',
)
client = kfp.Client(host=KFP_HOST)

In [None]:
# Pipeline definition: preprocess -> train -> predict -> print, with every
# step mounting the same persistent volume at `data_path`.
@dsl.pipeline(
   name='Used Cars Pipeline',
   description='An ML pipeline that predicts price of used cars.'
)
# Pipeline parameters. `model_file` is accepted but is not read anywhere in
# the body below.
def used_cars_container_pipeline(
    data_path: str,
    model_file: str
):
    # Persistent volume through which the steps hand files to one another.
    shared_volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="10Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Used-cars preprocessing step: mounts the freshly created volume.
    preprocess_task = preprocess_op(data_path).add_pvolumes(
        {data_path: shared_volume_op.volume}
    )

    # Used-cars training step: mounts the volume as left by preprocessing.
    train_task = train_op(data_path).add_pvolumes(
        {data_path: preprocess_task.pvolume}
    )

    # Used-cars prediction step: mounts the volume as left by training.
    predict_task = predict_op(data_path).add_pvolumes(
        {data_path: train_task.pvolume}
    )

    # Final step: run `head` on the prediction file written by the predict step.
    print_task = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: predict_task.pvolume},
        arguments=['head', f'{data_path}/model_result']
    )

In [None]:
# Passed as the pipeline's `data_path` argument; the pipeline uses it as the
# path at which each step mounts the shared volume.
DATA_PATH = '/mnt'
# Passed as the pipeline's `model_file` argument.
# NOTE(review): the pipeline body shown in this notebook never reads
# `model_file` (the train step writes to f'{data_path}/model') — confirm
# whether this value is still needed.
MODEL_PATH='used_cars_model.h5'

In [None]:
# Alias for the pipeline function; the compile/submit code refers to this name.
pipeline_func = used_cars_container_pipeline

In [None]:
experiment_name = 'used_cars_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# Values bound to the pipeline's parameters for this run.
arguments = {
    "data_path": DATA_PATH,
    "model_file": MODEL_PATH,
}

# Compile the pipeline into a zipped workflow definition named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)