In [2]:
import pandas as pd
# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit

# models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neural_network import MLPRegressor

#pipelines
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer


In [681]:
# Read the cleaned data set from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load a file you produced or trust.
arriva_cleaned = pd.read_pickle("arriva_cleaned")

# Feature columns grouped by type; no numerical feature is used at the moment.
categorical_features = ['Day_num','Source Station','Destination Station']
numerical_features = []

# Feature frame (exactly the categorical columns) and single-column target frame.
X = arriva_cleaned[categorical_features]
y = arriva_cleaned[['Totaal']]

# Keep a reproducible, shuffled 5% sample (X_s, y_s); the remaining 95% is thrown away here.
X_s, _, y_s, _ = train_test_split(
    X,
    y,
    train_size=0.05,
    shuffle=True,
    random_state=1,
)

In [684]:
#define regressor pipeline function
def regressor_pipeline(estimator, categorical_columns=None):
    """Wrap ``estimator`` in a pipeline that one-hot encodes the categorical columns first.

    Parameters
    ----------
    estimator : estimator object
        Final step of the pipeline, registered under the step name ``'est'``.
    categorical_columns : list of str, optional
        Columns to one-hot encode. When omitted, the notebook-level
        ``categorical_features`` list is used, so one-argument calls keep working.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'prep'`` (column transformer) and ``'est'``.
    """
    if categorical_columns is None:
        # Fall back to the notebook-level column list (looked up at call time).
        categorical_columns = categorical_features

    # Dense one-hot encoding; categories not seen during fit are encoded as all zeros.
    # The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4, so try the new spelling first and fall back for older installations.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    # Kept as a one-step pipeline so nested parameter names stay the same as before.
    categoric_transformer = make_pipeline(encoder)

    # Apply the encoder to the categorical columns only. No numeric transformer is
    # configured; columns that are not listed are dropped (default remainder='drop').
    preprocessor = make_column_transformer((categoric_transformer, categorical_columns))

    # Complete pipeline: preprocessing followed by the estimator.
    return Pipeline(steps=[('prep', preprocessor), ('est', estimator)])


In [686]:
# Regression models to evaluate: each estimator is wrapped in the preprocessing
# pipeline and fitted on the 5% sample. The SVR variant ('svm_regressor') is disabled.
models = {
    model_name: regressor_pipeline(estimator).fit(X_s, y_s)
    for model_name, estimator in (
        ('linear_regressor', LinearRegression(n_jobs=-1)),
        ('neural_net_regressor', MLPRegressor(random_state=1)),
        # ('svm_regressor', SVR()),
        ('rf_regressor', RandomForestRegressor(n_estimators=500, random_state=1)),
    )
}

# One empty score list per model, in the same order as `models`.
results = {model_name: [] for model_name in models}



In [None]:
# Cross-validate every pipeline on the full data set (X, y) and tabulate the mean
# test score per metric. cross_validate clones and refits the pipeline on each split.
metric_names = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

for (model_name, model_pipe) in models.items():

    # A single multi-metric call scores all metrics on the same three fitted splits,
    # instead of refitting the model once per metric. The splits are reproducible
    # because ShuffleSplit uses a fixed random_state.
    cv_scores = cross_validate(model_pipe, X, y, scoring=metric_names,
                               cv=ShuffleSplit(n_splits=3, random_state=1),
                               return_train_score=False)

    # Assign (rather than append) so re-running this cell does not grow the lists.
    # The ndarray's own .mean() is used because numpy is not imported in this notebook.
    results[model_name] = [cv_scores['test_' + metric].mean() for metric in metric_names]

res = pd.DataFrame(data=results)
# NOTE(review): the 'mae' and 'mse' rows hold the *negated* errors returned by the
# 'neg_*' scorers (higher is better), so those values are <= 0.
res['metrics'] = pd.Series(['mae','mse','r2'])

print(res)
