In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV


from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd
# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit

# models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neural_network import MLPRegressor


#pipelines
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer


import itertools
from joblib import dump,load

In [2]:

def get_occupancy_df(lines,directions,line_stop_dict,cleaned_df = None):
    """Build a per-stop, per-date occupancy table from origin/destination trip counts.

    Parameters
    ----------
    lines : iterable
        Keys of ``line_stop_dict`` to process.
    directions : iterable of str
        ``'original'`` keeps trips whose source stop number is lower than the
        destination stop number, ``'reverse'`` keeps the opposite. Any other
        value produces no rows.
    line_stop_dict : dict
        ``{line: {stop_name: stop_number}}``.
    cleaned_df : pandas.DataFrame, optional
        Trip table; the columns read here are ``'Source'``, ``'Destination'``,
        ``'Date'`` and ``'Count'``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``cleaned_df`` is ``None``. Otherwise one row per
        (line, direction, date, stop) with columns ``Line``, ``Direction``,
        ``Date``, ``Station``, ``Occupancy``, ``Station Num`` and ``Day``.
        Only stops that occur as a source or destination of a selected trip
        get a row.

    Notes
    -----
    Occupancy at a stop is the ``Count`` of trips that boarded at or before the
    stop minus the ``Count`` of trips that ended strictly before it, so trips
    ending at the stop itself are still counted there.
    """
    if cleaned_df is None:
        return None

    # Rows are collected in a plain list and turned into a frame once at the
    # end; DataFrame.append no longer exists in pandas >= 2.0 and growing a
    # frame row by row is quadratic.
    records = []

    for line in lines:
        stop_numbers = line_stop_dict[line]

        # Stops that are not on this line map to NaN. Every comparison against
        # NaN is False, so such trips fall out of both direction masks below.
        source_nums = cleaned_df['Source'].map(stop_numbers)
        dest_nums = cleaned_df['Destination'].map(stop_numbers)

        for direction in directions:
            if direction == 'original':
                in_direction = source_nums < dest_nums
            elif direction == 'reverse':
                in_direction = source_nums > dest_nums
            else:
                # Unknown direction: nothing can be selected for it.
                continue

            # One vectorised filter instead of one frame scan per stop pair.
            trips = cleaned_df[in_direction].assign(**{
                'Source Num': source_nums[in_direction].to_numpy(),
                'Destination Num': dest_nums[in_direction].to_numpy(),
            })

            # The set of stops does not depend on the date, so compute it once.
            stops_seen = pd.Index(trips['Source']).union(pd.Index(trips['Destination'])).unique()

            for date in trips['Date'].unique():
                day_trips = trips[trips['Date'] == date]

                for stop in stops_seen:
                    stop_num = stop_numbers[stop]

                    if direction == 'original':
                        boarded = day_trips['Source Num'] <= stop_num
                        left_before = day_trips['Destination Num'] < stop_num
                    else:
                        boarded = day_trips['Source Num'] >= stop_num
                        left_before = day_trips['Destination Num'] > stop_num

                    occupancy = (day_trips.loc[boarded, 'Count'].sum()
                                 - day_trips.loc[left_before, 'Count'].sum())

                    records.append({'Line':line,'Direction':direction,'Date':date,'Station':stop,'Station Num':int(stop_num),'Occupancy':occupancy})

    occupancy_df = pd.DataFrame(records, columns=['Line','Direction','Date','Station','Occupancy','Station Num'])
    occupancy_df['Day']=pd.Series(pd.DatetimeIndex(occupancy_df['Date'])).dt.day_name()
    return occupancy_df


In [3]:
def get_merged_df(occupancy_9292,occupancy_arriva):
    """Inner-join the 9292 and Arriva occupancy tables on their shared key columns.

    Rows are matched on line, direction, date, weekday, stop number and stop
    name. Columns present in both inputs but not used as keys receive the
    pandas default ``_x`` (9292) and ``_y`` (Arriva) suffixes.
    """
    join_keys = ['Line','Direction','Date','Day','Station Num','Station']
    merged = pd.merge(
        left=occupancy_9292,
        right=occupancy_arriva,
        on=join_keys,
        how='inner',
    )
    return merged

In [4]:
def get_candidate_models(params=None, features=None, targets=None):
    """Fit the candidate regressors and return them keyed by display name.

    Parameters
    ----------
    params : dict, optional
        Parameter grid handed to ``GridSearchCV`` for the tuned MLP. When
        ``None`` nothing is fitted and ``None`` is returned.
    features : array-like, optional
        One-dimensional predictor values. Defaults to the notebook-level ``x``
        so existing one-argument calls keep working.
    targets : array-like, optional
        Target values. Defaults to the notebook-level ``y``.

    Returns
    -------
    dict or None
        ``{'Default MLP': ..., 'Tuned MLP': ..., 'Linear Regressor': ...}``
        with every estimator already fitted, or ``None`` if ``params`` is
        ``None``.
    """
    if params is None:
        return None

    # Fall back to the notebook globals only when the caller gave no data.
    if features is None:
        features = x
    if targets is None:
        targets = y

    # scikit-learn wants a 2-D design matrix; asarray also accepts lists/Series.
    design = np.asarray(features).reshape(-1, 1)

    regressors = {
        'Default MLP': MLPRegressor().fit(design, targets),
        'Tuned MLP': GridSearchCV(MLPRegressor(), params).fit(design, targets),
        'Linear Regressor': LinearRegression().fit(design, targets),
    }
    return regressors

In [5]:
def regression_scores(x,y,regressors):
    """Return a one-row table of mean 2-fold cross-validated R^2 per regressor.

    ``regressors`` maps a display name to an estimator. Each estimator is
    scored with ``cross_validate`` on ``x`` reshaped to a single feature
    column, and the mean test score becomes that model's column. A final
    ``'metrics'`` column holds the metric name.
    """
    metric = 'r2'
    mean_scores = {
        name: np.mean(
            cross_validate(
                estimator,
                x.reshape(-1,1),
                y,
                scoring=metric,
                cv=2,
                return_train_score=False,
            )['test_score']
        )
        for name, estimator in regressors.items()
    }

    score_table = pd.DataFrame(data=mean_scores,index=pd.Series([0]))
    score_table['metrics'] = pd.Series([metric])
    return score_table

In [6]:
def regression_plot(x,y,xlabel='X',ylabel='Y',logy=False,regressors=None):
    """Scatter the data points and overlay the prediction curve of each regressor.

    Parameters
    ----------
    x, y : array-like
        Data to scatter.
    xlabel, ylabel : str
        Axis labels.
    logy : bool
        Use a logarithmic y axis when true.
    regressors : dict, optional
        ``{label: fitted estimator}``; each estimator is evaluated on the
        integer grid ``0 .. int(max(x))`` and drawn as a line.
    """
    fig, ax = plt.subplots()
    ax.plot(x,y, linewidth=0, marker='v', label='Data points')

    if regressors is not None:
        # +1 so the grid includes int(max(x)) itself; a bare range(int(max_x))
        # ends one step early and the curves never reach the last data point.
        grid = np.arange(int(np.max(x)) + 1)
        for label, fitted in regressors.items():
            ax.plot(grid, fitted.predict(grid.reshape(-1,1)), label=label)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if logy:
        ax.set_yscale('log')
    ax.legend(facecolor='white')

In [7]:
def get_occupancy_prediction(line,day,hour,source,destination,merged_df,cleaned_9292,cleaned_arriva,model,stop_lookup=None):
    """Estimate bus occupancy between two stops for a weekday and hour.

    The model's mean prediction over the stops from ``source`` (inclusive) to
    ``destination`` (exclusive) on ``day`` is scaled by the ratio between the
    9292 request count in ``hour`` and the mean hourly request count on that
    weekday.

    Parameters
    ----------
    line : str
        Line identifier; only ``'4'`` is supported.
    day : str
        Weekday name as stored in the ``'Day'`` columns.
    hour : int
        Hour as stored in ``cleaned_9292['Hour']``.
    source, destination : str
        Stop names, spelled exactly as the keys of the stop lookup.
    merged_df : pandas.DataFrame
        Needs ``'Station Num'``, ``'Day'`` and ``'Occupancy_x'``.
    cleaned_9292 : pandas.DataFrame
        Needs ``'Line'``, ``'Day'``, ``'Modaliteitsnummer'``, ``'Hour'`` and
        ``'Count'``.
    cleaned_arriva : pandas.DataFrame
        Not used by this function; kept so existing calls stay valid.
    model : object
        Fitted estimator exposing ``predict`` on an ``(n, 1)`` array.
    stop_lookup : dict, optional
        ``{line: {stop_name: stop_number}}``. Defaults to the notebook-level
        ``line_stop_dict``.

    Returns
    -------
    float or int
        The scaled prediction, or ``-1`` when no prediction is possible:
        identical stops, a line other than ``'4'``, a destination that does
        not come after the source, or no matching rows in ``merged_df``. The
        result is NaN when the weekday or hour has no 9292 requests.

    Raises
    ------
    KeyError
        If ``source`` or ``destination`` is not a stop of ``line``.
    """
    if source == destination:
        return -1
    # NOTE(review): presumably restricted to line 4 because the cached
    # occupancy tables were generated for that line only -- confirm.
    if line != '4':
        return -1

    if stop_lookup is None:
        stop_lookup = line_stop_dict

    source_num = stop_lookup[line][source]
    dest_num = stop_lookup[line][destination]

    # Only travel in the 'original' direction (ascending stop numbers) is handled.
    if source_num >= dest_num:
        return -1

    segment = merged_df[
        (merged_df['Station Num'] >= source_num)
        & (merged_df['Station Num'] < dest_num)
        & (merged_df['Day'] == day)
    ]
    if segment.empty:
        # Nothing to predict from; estimators reject an empty input array.
        return -1

    predicted_mean = np.mean(
        model.predict(segment['Occupancy_x'].to_numpy().reshape(-1,1))
    )

    day_requests = cleaned_9292[
        (cleaned_9292['Line'] == int(line))
        & (cleaned_9292['Day'] == day)
        & (cleaned_9292['Modaliteitsnummer'] == 2)
    ]
    # Select 'Count' before summing: summing every column first is wasted work
    # and raises on datetime columns in pandas >= 2.
    hourly_counts = day_requests.groupby('Hour')['Count'].sum()
    requested_hour_count = hourly_counts[hourly_counts.index == hour].mean()

    return predicted_mean * requested_hour_count / hourly_counts.mean()


In [None]:
# Line identifiers, in the same order as the stop lists in `line_stops` below.
line_names = ['4','2','3','5','7','8','9',"1 Tolberg West-Station-Kortendijk","2 Langdonk-Station-Tolberg Oost","3 Kortendijk-Station-Kroeven","4 Station-WVS-Station","833"]

# Ordered stop sequence of each line; the position in a list becomes the stop number.
line_stops = [
    # '4'
    [
        'Breda, Dreef',
        'Breda, Nieuwe Heilaarstraat',
        'Breda, Woonboulevard',
        'Breda, Liesboslaan',
        'Breda, Ambachtenlaan',
        'Breda, Doelen',
        'Breda, Hovenierstraat',
        'Breda, Burgemeester Sutoriusstraat',
        'Breda, Flierstraat',
        'Breda, Mgr.Nolensplein',
        'Breda, Heuvelbrink',
        'Breda, Dr. Struyckenplein',
        'Breda, Bontekoestraat',
        'Breda, Amphia Zkh. Langendijk',
        'Breda, Langendijk',
        'Breda, Graaf Hendrik III Laan',
        'Breda, Grote Spie',
        'Breda, Irenestraat',
        'Breda, Markendaalseweg',
        'Breda, Centrum',
        'Breda, Vlaszak',
        'Breda, Centraal Station',
        'Breda, Belcrumweg',
        'Breda, Konijnenberg',
        'Breda, Spinveld',
        'Breda, Donk',
        'Breda, Heienlangdonk',
        'Breda, Somerweide',
        'Breda, Noortberghmoeren',
        'Breda, Cannaertserf',
        'Breda, Komoord',
        'Breda, Dwarsdijk',
        'Breda, Emerparklaan',
        'Breda, Heksenwiellaan',
    ],
    # '2'
    [
        "Goirle, Baronielaan",
        "Breda, Groenedijkplein",
        "Breda, Cornelis Florisstraat",
        "Breda, Tilman Suysstraat",
        "Breda, Waterdonken",
        "Breda, Hendrik Berlagestraat",
        "Breda, Lieven de Keystraat",
        "Breda, Lelystraat",
        "Breda, Verviersstraat",
        "Breda, Doornboslaan",
        "Breda, Centraal Station",
        "Breda, Vlaszak",
        "Breda, Centrum",
        "Breda, Fellenoordstraat",
        "Breda, Nieuwe Haagdijk",
        "Breda, Schorsmolenstraat",
        "Breda, Dijkplein",
        "Breda, Lunetstraat",
        "Breda, Stadion NAC",
        "Breda, Peerdsbroek",
        "Breda, Bremtuin",
        "Breda, Wandelakker",
        "Breda, Elzenbroek",
        "Breda, Wilgenbroek",
        "Breda, Westerhagelaan NS",
        "Breda, Laaghamlaan",
        "Breda, Alexberg",
        "Breda, Muizenberglaan",
        "Breda, Paradijslaan",
        "Breda, Regenwulp",
        "Breda, Overkroetenlaan",
        "Breda, Lange Slagen",
        "Breda, Zeggeveen",
        "Breda, Heksenwiellaan",
    ],
    # '3'
    [
        "Breda, Centraal Station",
        "Breda, Nieuwe Prinsenkade",
        "Breda, Tolbrug",
        "Breda, Centrum",
        "Breda, Vlaszak",
        "Breda, Centraal Station",
        "Breda, Sophiastraat",
        "Breda, Nassausingel",
        "Breda, St.Ignatiusstraat",
        "Breda, Brabantplein",
        "Breda, Mgr. Leijtenstraat",
        "Breda, Lage Kant",
        "Breda, Langveld",
        "Breda, Wilderen",
        "Breda, Draaiboom",
        "Breda, Sterrebos",
        "Breda, Heerbaan",
    ],
    # '5'
    # NOTE(review): the first name starts with a typographic quote (‘), not an
    # ASCII apostrophe; lookups must use the same character -- confirm against the data.
    [
        "‘s-Hertogenbosch, Centraal Station",
        "Breda, Centraal Station",
        "Breda, Vlaszak",
        "Breda, Centrum",
        "Breda, Markendaalseweg",
        "Breda, Irenestraat",
        "Breda, Graaf Hendrik III Laan",
        "Breda, Grote Spie",
        "Breda, Verdilaan",
        "Breda, Willem van Oranjelaan",
        "Breda, Duivelsbruglaan",
        "Breda, Bouvigne",
        "Breda, Galderseweg",
        "Breda, De Blauwe Kamer",
        "Breda, De Klokkenberg",
        "Galder, Galderse Meren",
        "Galder, Smidshof",
        "Galder, Ballemanseweg",
        "Galder, Markweg",
        "Meersel Dreef, Nieuw Dreef",
    ],
    # '7'
    [
        "Breda, Nieuw Wolfslaarlaan",
        "Breda, Muiderslotstraat",
        "Breda, Walenburgstraat",
        "Breda, Zwijnsbergenstraat",
        "Breda, Mariaplein",
        "Breda, Marialaan",
        "Breda, Valkeniersplein",
        "Breda, Van Duijvenvoordestraat",
        "Breda, Prins Hendrikstraat",
        "Breda, Ginnekenweg",
        "Breda, Wilhelminastraat",
        "Breda, Nwe Ginnekenstraat",
        "Breda, Centrum",
        "Breda, Vlaszak",
        "Breda, Centraal Station",
        "Breda, Sophiastraat",
        "Breda, Nassausingel",
        "Breda, St.Ignatiusstraat",
        # The comma after the next entry matters: without it Python joins the two
        # adjacent string literals into one bogus stop name.
        "Breda, Brabantplein",
        "Breda, Mgr. de Vetstraat",
        "Breda, Epelenberg",
        "Breda, Giraffestraat",
        "Breda, Tilburgseweg",
        "Breda, Bergschot",
        "Breda, Heerbaan",
    ],
    # '8'
    [
        "Breda, Centraal Station",
        "Breda, Sophiastraat",
        "Breda, Nassausingel",
        "Breda, St.Ignatiusstraat",
    ],
    # '9'
    [
        "Breda, Centraal Station",
        "Breda, Vlaszak",
        "Breda, Claudius Prinsenlaan",
        "Breda, Lovensdijkstraat",
        "Breda, Hogeschoollaan",
        "Breda, Amphia Zkh. Molengracht",
        "Breda, Bijster",
        "Breda, Loevesteinstraat",
    ],
    # '1 Tolberg West-Station-Kortendijk'
    [
        "Roosendaal, Damastberg",
        "Roosendaal, Fazantberg",
        "Roosendaal, Jasmijnberg",
        "Roosendaal, Thorbeckelaan",
        "Roosendaal, Lelieberg",
        "Roosendaal, Heulberg",
        "Roosendaal, Tolberg Centrum",
        "Roosendaal, Willem Dreesweg",
        "Roosendaal, Heerma van Vossstraat",
        "Roosendaal, Morelberg",
        "Roosendaal, Bravis Ziekenhuis",
        "Roosendaal, Boerhaavelaan",
        "Roosendaal, Burg. Freijterslaan",
        "Roosendaal, Roselaar",
        "Roosendaal, Station",
        "Roosendaal, Brugstraat",
        "Roosendaal, Van Beethovenlaan",
        "Roosendaal, Prof. Aalbersestraat",
        "Roosendaal, Strausslaan",
        "Roosendaal, Takspui",
        "Roosendaal, Basaltdijk",
        "Roosendaal, Bergkristaldijk",
        "Roosendaal, Diamantdijk",
        "Roosendaal, Dolomietdijk",
        "Roosendaal, Koraaldijk",
        "Roosendaal, Lavadijk",
        "Roosendaal, Flintdijk",
        "Roosendaal, Onyxdijk",
        "Roosendaal, Sterrebos",
    ],
    # '2 Langdonk-Station-Tolberg Oost'
    [
        "Roosendaal, Odiliadonk",
        "Roosendaal, Jurriaandonk",
        "Roosendaal, Martinusdonk",
        "Roosendaal, Stephanusdonk",
        "Roosendaal, Bovendonk",
        "Roosendaal, A-Dijk",
        "Roosendaal, Basaltdijk",
        "Roosendaal, Takspui",
        "Roosendaal, Strausslaan",
        "Roosendaal, Prof. Aalbersestraat",
        "Roosendaal, Van Beethovenlaan",
        "Roosendaal, Brugstraat",
        "Roosendaal, Station",
        "Roosendaal, Roselaar",
        "Roosendaal, Bergrand",
        "Roosendaal, Champetterberg",
        "Roosendaal, Chaletberg",
        "Roosendaal, Bandeliersberg",
        "Roosendaal, Ambrozijnberg",
        "Roosendaal, Amberberg",
        "Roosendaal, Damastberg",
    ],
    # '3 Kortendijk-Station-Kroeven'
    [
        "Roosendaal, Sterrebos",
        "Roosendaal, ‘t Zand",
        "Roosendaal, Elisadonk",
        "Roosendaal, Lindenburg",
        "Roosendaal, Burg. Schneiderlaan",
        "Roosendaal, Boulevard",
        "Roosendaal, Brugstraat",
        "Roosendaal, Station",
        "Roosendaal, Roselaar",
        "Roosendaal, Kroevenlaan",
        "Roosendaal, Staringlaan",
        "Roosendaal, Beetslaan",
        "Roosendaal, President Kennedylaan",
        "Roosendaal, Benedendonk",
        "Roosendaal, Odiliadonk",
    ],
    # '4 Station-WVS-Station'
    [
        "Roosendaal, Station",
        "Roosendaal, Waterstraat",
        "Roosendaal, Spoorstraat",
        "Roosendaal, WVS Bosstraat",
        "Roosendaal, WVS Vaartveld",
        "Roosendaal, Spoorstraat",
        "Roosendaal, Waterstraat",
        "Roosendaal, Station",
    ],
    # '833'
    [
        "Roosendaal, Station",
        "Roosendaal, Roselaar",
        "Roosendaal, Kroevenlaan",
        "Roosendaal, Staringlaan",
        "Roosendaal, Beetslaan",
        "Roosendaal, President Kennedylaan",
        "Roosendaal, Benedendonk",
    ],
]

# {line name: {lower-cased stop name: position of the stop on the line}}.
# A stop that appears more than once in a list (e.g. on a loop route) keeps
# the position of its last occurrence.
line_stop_dict = {
    line_name: {stop.lower(): num for (num, stop) in enumerate(stops)}
    for (line_name, stops) in zip(line_names, line_stops)
}
    
    
# Cleaned trip tables written by an earlier preprocessing step.
# NOTE(review): unpickling runs code stored in the file -- only load pickles you created yourself.
cleaned_9292, cleaned_arriva = (
    pd.read_pickle(path) for path in ('final_9292', 'final_arriva')
)

# The occupancy tables are loaded from cached pickles. They were produced once with:
#   occupancy_df_arriva = get_occupancy_df(['4'],['original'],line_stop_dict,cleaned_arriva[cleaned_arriva['Line']==4])
#   occupancy_df_9292 = get_occupancy_df(['4'],['original'],line_stop_dict,cleaned_9292[(cleaned_9292['Line']==4)&(cleaned_9292['Modaliteitsnummer']==2)])
#   occupancy_df_arriva.to_pickle('occupancy_arriva')
#   occupancy_df_9292.to_pickle('occupancy_9292')
occupancy_df_arriva, occupancy_df_9292 = (
    pd.read_pickle(path) for path in ('occupancy_arriva', 'occupancy_9292')
)

# After the merge, 'Occupancy_x' comes from the 9292 table and 'Occupancy_y' from Arriva.
merged_df = get_merged_df(occupancy_df_9292,occupancy_df_arriva)
x, y = (merged_df[column].values for column in ('Occupancy_x', 'Occupancy_y'))


# Parameter grid for the tuned MLP: 2 * 4 * 3 * 2 * 1 = 48 combinations.
# Solver, tolerance, momentum, early stopping and the Adam constants are not
# part of the grid and stay at the estimator defaults.
mlp_params = dict(
    hidden_layer_sizes=[(100,),(1000,)],
    activation=['identity', 'logistic', 'relu', 'tanh'],
    alpha=[0.1,0.01,0.001],
    learning_rate_init=[0.1,0.01],
    max_iter=[2000],
)


# Fit every candidate regressor; this includes the grid search over mlp_params
# inside get_candidate_models, so the cell can take a long time.
regressors = get_candidate_models(mlp_params)
# One-off export of the chosen model, kept for reference:
#dump(regressors['Default MLP'],'best_model')
# The model used for predictions is read from disk, not taken from `regressors`.
# NOTE(review): joblib.load unpickles the file -- only load model files you trust.
best_model = load('best_model')



In [None]:
# Scatter of 9292 requests against Arriva occupancy with every fitted model overlaid (log y axis).
regression_plot(
    x,
    y,
    xlabel='9292 Request Count',
    ylabel='Arriva Bus Occupancy',
    logy=True,
    regressors=regressors,
)

In [None]:
# Cross-validated R^2 of each candidate model, shown as the cell output.
regression_scores(x=x, y=y, regressors=regressors)

In [None]:
# Example prediction: line 4 on a Monday at hour 0, from Dreef to Centrum.
get_occupancy_prediction(
    line='4',
    day='Monday',
    hour=0,
    source='breda, dreef',
    destination='breda, centrum',
    merged_df=merged_df,
    cleaned_9292=cleaned_9292,
    cleaned_arriva=cleaned_arriva,
    model=best_model,
)