In [None]:
#!python -m pip install --user --upgrade pip
#!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.23.1 tensorflow==2.0 keras==1.2.2 --user

In [None]:
# install kubeflow pipeline sdk
#!pip3 install kfp --user
#!pip3 install imblearn

In [None]:
# import libraries for pipeline
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [None]:
# Directory that receives the intermediate artefacts when the steps are run
# in the notebook. This cell only defines the path; it does not create it.
output_dir = "/home/jovyan/data/"

In [None]:
# create preprocessing function

def preprocess(data_path):
    """Download, merge and clean the 2016 road-safety CSVs, then pickle the result.

    The three CSV files are inner-joined on ``Accident_Index``. Values equal
    to ``-1`` are treated as missing and every row with a missing value is
    dropped. ``Month`` and ``Hour_of_the_day`` are derived from ``Date`` and
    ``Time``, which are then removed. The resulting DataFrame is pickled to
    ``{data_path}/clean_data``.

    Imports are kept inside the function because it is also wrapped as a
    standalone component with ``comp.func_to_container_op``.

    Args:
        data_path: Directory that receives the ``clean_data`` pickle. It is
            created if it does not exist yet.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Best-effort version pin: a failed install is reported but is not fatal,
    # so an already-installed pandas can still be used.
    pip_outcome = subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    if pip_outcome.returncode != 0:
        print('Warning: could not install pandas==0.23.4; using the installed pandas.',
              file=sys.stderr)
    import pandas as pd

    # Load the three source tables.
    base_url = 'https://raw.githubusercontent.com/HamoyeHQ/07-road-safety/master/data/'
    cas_df = pd.read_csv(base_url + 'Cas.csv')
    accidents_df = pd.read_csv(base_url + 'dftRoadSafety_Accidents_2016.csv')
    make_model_df = pd.read_csv(base_url + 'MakeModel2016.csv')

    # Join on the shared key, then discard the key itself.
    dataset = (
        cas_df
        .merge(accidents_df, on='Accident_Index', how='inner')
        .merge(make_model_df, on='Accident_Index', how='inner')
        .drop(columns='Accident_Index')
    )

    # Only affects how frames are printed; kept so the kernel's display
    # settings end up the same as before.
    pd.set_option('display.expand_frame_repr', False)

    # -1 marks a missing value: turn it into NaN and drop incomplete rows.
    dataset = dataset.mask(dataset == -1).dropna(how='any')

    # Drop correlated columns and columns that are not needed for prediction.
    dataset = dataset.drop(columns=[
        'make', 'model', 'accyr', 'Longitude', 'Latitude', 'Age_of_Casualty',
        'Pedestrian_Location', 'Casualty_Severity', 'Police_Force', 'Casualty_Reference',
        'Junction_Control', '2nd_Road_Class', 'Local_Authority_(Highway)',
        'LSOA_of_Accident_Location', '1st_Road_Class', '1st_Road_Number', '2nd_Road_Number',
    ])

    # NOTE(review): pd.to_datetime reads ambiguous dates month-first; if the
    # CSV stores DD/MM/YYYY, dayfirst=True is needed - confirm against the data.
    parsed_dates = pd.to_datetime(dataset['Date'])
    dataset['Month'] = parsed_dates.dt.month
    dataset['Hour_of_the_day'] = pd.to_datetime(dataset['Time']).dt.hour

    # Date and Time are no longer useful once Month and Hour are extracted.
    dataset = dataset.drop(columns=['Time', 'Date'])

    # Serialize the clean data; make sure the target directory exists first.
    os.makedirs(data_path, exist_ok=True)
    with open(os.path.join(data_path, 'clean_data'), 'wb') as f:
        pickle.dump(dataset, f)

    print('Done!')

In [None]:
# Run the preprocessing step directly in the notebook kernel.
preprocess(data_path=output_dir)

In [None]:
# create training and prediction function

def train_predict(data_path):
    """Train an XGBoost classifier on the cleaned data and save its test predictions.

    Loads the pickled DataFrame from ``{data_path}/clean_data``, uses
    ``Accident_Severity`` as the target and every other column as a feature,
    makes a stratified 70/30 train/test split, min-max scales the features
    (scaler fitted on the training part only), fits ``xgb.XGBClassifier`` and
    prints accuracy and micro-averaged F1 on the test part. Predictions and
    true labels are written to ``{data_path}/results.txt``.

    Imports are kept inside the function because it is also wrapped as a
    standalone component with ``comp.func_to_container_op``.

    Args:
        data_path: Directory containing ``clean_data``; ``results.txt`` is
            written to the same directory.
    """
    import pickle
    import subprocess
    import sys

    # Best-effort installs: a failure is reported but is not fatal, so
    # packages that are already present can still be used.
    for requirement in ('scikit-learn==0.23.1', 'pandas==0.23.4', 'xgboost'):
        pip_outcome = subprocess.run([sys.executable, '-m', 'pip', 'install', requirement])
        if pip_outcome.returncode != 0:
            print(f'Warning: could not install {requirement}; using what is installed.',
                  file=sys.stderr)

    import xgboost as xgb
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # pickle.load can execute arbitrary code: data_path must only contain
    # files written by a trusted preprocessing step.
    with open(f'{data_path}/clean_data', 'rb') as f:
        dataset = pickle.load(f)

    # Features and target.
    features = dataset.drop(columns=['Accident_Severity'])
    target = dataset.Accident_Severity

    # Stratified split; seeded so repeated runs evaluate on the same rows.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, stratify=target, random_state=1)

    # Scale to [0, 1] using statistics from the training part only.
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # SMOTE oversampling of the training set was disabled because it raised
    # an AttributeError, so the model is fitted on the imbalanced data.

    # NOTE(review): the objective is 'binary:logistic'; confirm this matches
    # the number of distinct Accident_Severity classes in the data.
    xg_class = xgb.XGBClassifier(objective='binary:logistic', random_state=1)
    xg_class.fit(x_train, y_train)

    predictions = xg_class.predict(x_test)

    # Accuracy as a percentage with two decimals.
    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    print('Accuracy: {}'.format(round(accuracy * 100, 2)))

    f1_xgb = f1_score(y_test, predictions, average='micro')
    print('f1 score: {:.2f}'.format(f1_xgb))

    # Write predictions next to the true labels. Both are written via their
    # default string form, which numpy/pandas may abbreviate for long inputs.
    with open(f'{data_path}/results.txt', 'w') as result:
        result.write(f'Prediciton: {predictions} | Actual {y_test}')

    print('Done!')

In [None]:
# Run training and prediction directly in the notebook kernel.
train_predict(data_path=output_dir)

In [None]:
# Wrap both step functions as lightweight pipeline components.
preprocess_op, train_predict_op = (
    comp.func_to_container_op(preprocess),
    comp.func_to_container_op(train_predict),
)

In [None]:
# Create the client used to communicate with the Pipelines API server.
# No arguments are passed, so kfp.Client() falls back to its own defaults
# for locating the server.
client = kfp.Client()

In [None]:
# define pipeline
@dsl.pipeline(
    name="Road Safety ML Pipeline 2",
    description="Performs Preprocessing, training and prediction",
)
# data_path is the single pipeline parameter: the path at which the shared
# volume is mounted in every step.
def road_safety_pipeline(data_path: str):

    # Volume used to hand data from one step to the next.
    volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Step 1: preprocessing, with the fresh volume mounted at data_path.
    preprocess_task = preprocess_op(data_path).add_pvolumes(
        {data_path: volume_op.volume}
    )

    # Step 2: training and prediction, on the volume left by step 1.
    train_predict_task = train_predict_op(data_path).add_pvolumes(
        {data_path: preprocess_task.pvolume}
    )

    # Step 3: print the results file written by step 2.
    dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: train_predict_task.pvolume},
        arguments=['cat', f'{data_path}/results.txt'],
    )

In [None]:
# Path at which the shared volume is mounted inside each pipeline step.
# The steps write their files below it (e.g. "<DATA_PATH>/clean_data").
DATA_PATH = "/home/jovyan/data/clean_data"

pipeline_func = road_safety_pipeline
experiment_name = 'road_safety2_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# Values for the pipeline parameters.
arguments = {"data_path": DATA_PATH}

# Compile the pipeline into a compressed YAML definition.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)