In [1]:
import numpy as np
import pandas as pd

def load_csv(file, group_columns = [], categorial_columns = [], meta_columns = []):
    data = pd.read_csv(file, sep=';')

    # Initial data-slicing
    data = data[(data.LinkTravelTime > 0) & (data.LineDirectionCode == 1)]

    # Data convertion
    data['DateTime'] = pd.to_datetime(data['DateTime'])
    time = pd.DatetimeIndex(data['DateTime']) 
    data['TimeOfDayClass'] = 'NO_PEEK' 
    data['Hour'] = time.hour
    data.ix[((7 < time.hour) & (time.hour < 9) & (data['DayType'] == 1)), 'TimeOfDayClass'] = 'PEEK' 
    data.ix[((15 < time.hour) & (time.hour < 17) & (data['DayType'] == 1)), 'TimeOfDayClass'] = 'PEEK' 
       
    numerical_columns = []
    
    output_column = 'LinkTravelTime'

    # Calculate m lag headway and travel time for same link, earlier journeys
    m = 3
    grouping = data.groupby(['LinkRef'])
    for i in range(1, m + 1):
        data['HeadwayTime_L' + str(i)] = (data['DateTime'] - grouping['DateTime'].shift(i)) / np.timedelta64(1, 's')
        data['LinkTravelTime_L' + str(i)] = grouping['LinkTravelTime'].shift(i)
        numerical_columns += ['HeadwayTime_L' + str(i), 'LinkTravelTime_L' + str(i)]

    # Slice out missing values
    for i in range(1, m + 1):
        data = data[(data['HeadwayTime_L' + str(i)] > 0) & (data['LinkTravelTime_L' + str(i)] > 0)]

    # Calculate j lag headway and travel time for journey, upstream links
    j = 3
    grouping = data.groupby(['JourneyRef'])
    for i in range(1, j + 1):
        data['LinkTravelTime_J' + str(i)] = grouping['LinkTravelTime'].shift(i)
        numerical_columns += ['LinkTravelTime_J' + str(i)]
    
    # Slice out missing values
    for i in range(1, j + 1):
        data = data[(data['LinkTravelTime_J' + str(i)] > 0)]

    data = data[(26 <= data.LineDirectionLinkOrder) & (data.LineDirectionLinkOrder <= 32)]

    print('Preprosessed data set size:', len(data))

    input_columns = categorial_columns + numerical_columns

    if len(group_columns) > 0:
        grouping = data.groupby(group_columns)
    else:
        grouping = [('all', data)]

    for key, group in grouping:
        with_dummies = pd.get_dummies(group[input_columns], columns = categorial_columns)
        
        # Create dummy variables
        X = with_dummies.as_matrix()
        Y = group.as_matrix(columns = [output_column])
        
        yield (key, X, Y, group[(meta_columns + input_columns + [output_column])])


In [2]:
from sklearn import linear_model
import sklearn.preprocessing as pp

# Configuration
group_columns = []
categorial_columns = ['LinkRef', 'DayType', 'TimeOfDayClass']
meta_columns = ['JourneyRef', 'DateTime', 'LineDirectionLinkOrder', 'LinkName']

results = pd.DataFrame()

# Load and pre-process data
data = load_csv('../data/4A_201701_Consistent.csv', group_columns = group_columns, categorial_columns = categorial_columns, meta_columns = meta_columns)

for group, X, Y, meta in data:
    
    # Split data into train and test    
    X_train, X_test = np.split(X, [int(.8*len(X))])
    Y_train, Y_test = np.split(Y, [int(.8*len(Y))])
    meta_train, meta_test = np.split(meta, [int(.8*len(meta))])
    print('Train data set (size, features):',  X_train.shape)

    clf = linear_model.LinearRegression()
    clf.fit(X_train, Y_train[:,0]) 

    Y_train_pred = clf.predict(X_train).reshape(-1, 1)
    
    # Test
    print('Group:', group, '\n\tTest data set (size, features):',  X_test.shape)

    Y_test_pred = clf.predict(X_test).reshape(-1, 1)

    meta_test['LinkTravelTime_Predicted'] = Y_test_pred
    results = results.append(meta_test, ignore_index = True)
    
    # Write predictions to CSV
    results.to_csv('../data/results_lr_single.csv', index = False, encoding = 'utf-8')
    
results.head()

('Preprosessed data set size:', 35767)
('Train data set (size, features):', (28613, 21))
('Group:', 'all', '\n\tTest data set (size, features):', (7154, 21))


Unnamed: 0,JourneyRef,DateTime,LineDirectionLinkOrder,LinkName,LinkRef,DayType,TimeOfDayClass,HeadwayTime_L1,LinkTravelTime_L1,HeadwayTime_L2,LinkTravelTime_L2,HeadwayTime_L3,LinkTravelTime_L3,LinkTravelTime_J1,LinkTravelTime_J2,LinkTravelTime_J3,LinkTravelTime,LinkTravelTime_Predicted
0,20170125L0004J0155,2017-01-25 17:57:02,28,Vestre Kirkegård Nord (2673) - Sankt Annæ Gymn...,200:2673->200:2675,1,NO_PEEK,352.0,61.0,336.0,64.0,853.0,60.0,74.0,44.0,77.0,69.0,64.288431
1,20170125L0004J0155,2017-01-25 17:58:20,29,Sankt Annæ Gymnasium (2675) - Sjælør St. (1188),200:2675->200:1188,1,NO_PEEK,357.0,49.0,341.0,49.0,841.0,63.0,69.0,74.0,44.0,53.0,50.916161
2,20170125L0004J0155,2017-01-25 17:59:56,30,Sjælør St. (1188) - Mozarts Plads (1190),200:1188->200:1190,1,NO_PEEK,330.0,90.0,320.0,93.0,816.0,98.0,53.0,69.0,74.0,77.0,90.905681
3,20170125L0004J0155,2017-01-25 18:01:44,31,Mozarts Plads (1190) - Bådehavnsgade (1192),200:1190->200:1192,1,NO_PEEK,335.0,103.0,332.0,76.0,777.0,147.0,77.0,53.0,69.0,108.0,133.355164
4,20170125L0004J0155,2017-01-25 18:02:10,32,Bådehavnsgade (1192) - Sluseholmen (1193),200:1192->200:1193,1,NO_PEEK,315.0,30.0,299.0,32.0,752.0,33.0,108.0,77.0,53.0,26.0,39.865362


In [3]:
# Configuration
group_columns = ['LinkRef']
categorial_columns = ['DayType', 'TimeOfDayClass']
meta_columns = ['JourneyRef', 'DateTime', 'LineDirectionLinkOrder', 'LinkName']

results = pd.DataFrame()

# Load and pre-process data
data = load_csv('../data/4A_201701_Consistent.csv', group_columns = group_columns, categorial_columns = categorial_columns, meta_columns = meta_columns)

for group, X, Y, meta in data:

    # Split data into train and test    
    X_train, X_test = np.split(X, [int(.8*len(X))])
    Y_train, Y_test = np.split(Y, [int(.8*len(Y))])
    meta_train, meta_test = np.split(meta, [int(.8*len(meta))])
    print('Train data set (size, features):',  X_train.shape)

    clf = linear_model.LinearRegression(copy_X = False, n_jobs = -1)
    clf.fit(X_train, Y_train[:,0]) 

    Y_train_pred = clf.predict(X_train).reshape(-1, 1)
    
    metric_train_mape = (np.abs(Y_train_pred - Y_train)/Y_train).mean()
    print('Train MAPE:', metric_train_mape)

    # Test
    print('Group:', group, '\n\tTest data set (size, features):',  X_test.shape)

    Y_test_pred = clf.predict(X_test).reshape(-1, 1)

    meta_test['LinkTravelTime_Predicted'] = Y_test_pred
    results = results.append(meta_test, ignore_index = True)
    
    # Write predictions to CSV
    results.to_csv('../data/results_lr_multiple.csv', index = False, encoding = 'utf-8')

results.head()

('Preprosessed data set size:', 35767)
('Train data set (size, features):', (4127, 14))
('Train MAPE:', 0.4424166816656761)
('Group:', '200:10427->200:1183', '\n\tTest data set (size, features):', (1032, 14))
('Train data set (size, features):', (4182, 14))
('Train MAPE:', 0.18507854530667586)
('Group:', '200:1183->200:2673', '\n\tTest data set (size, features):', (1046, 14))
('Train data set (size, features):', (3944, 14))
('Train MAPE:', 0.26344343406723975)
('Group:', '200:1188->200:1190', '\n\tTest data set (size, features):', (987, 14))
('Train data set (size, features):', (3989, 14))
('Train MAPE:', 0.65435879992459356)
('Group:', '200:1190->200:1192', '\n\tTest data set (size, features):', (998, 14))
('Train data set (size, features):', (4063, 14))
('Train MAPE:', 0.9587128705106619)
('Group:', '200:1192->200:1193', '\n\tTest data set (size, features):', (1016, 14))
('Train data set (size, features):', (4164, 14))
('Train MAPE:', 0.17204531885442667)
('Group:', '200:2673->200:26

Unnamed: 0,JourneyRef,DateTime,LineDirectionLinkOrder,LinkName,DayType,TimeOfDayClass,HeadwayTime_L1,LinkTravelTime_L1,HeadwayTime_L2,LinkTravelTime_L2,HeadwayTime_L3,LinkTravelTime_L3,LinkTravelTime_J1,LinkTravelTime_J2,LinkTravelTime_J3,LinkTravelTime,LinkTravelTime_Predicted
0,20170125L0004J0156,2017-01-25 18:00:32,26,Valby St. (10427) - Toftegårds Plads (1183),1,NO_PEEK,390.0,44.0,727.0,57.0,727.0,62.0,128.0,38.0,45.0,63.0,80.150743
1,20170125L0004J0157,2017-01-25 18:07:56,26,Valby St. (10427) - Toftegårds Plads (1183),1,NO_PEEK,444.0,63.0,834.0,44.0,1171.0,57.0,79.0,67.0,56.0,53.0,73.410321
2,20170125L0004J0158,2017-01-25 18:15:36,26,Valby St. (10427) - Toftegårds Plads (1183),1,NO_PEEK,460.0,53.0,904.0,63.0,1294.0,44.0,182.0,82.0,34.0,155.0,88.622119
3,20170125L0004J0160,2017-01-25 18:17:23,26,Valby St. (10427) - Toftegårds Plads (1183),1,NO_PEEK,129.0,46.0,107.0,155.0,567.0,53.0,79.0,53.0,44.0,80.0,80.085439
4,20170125L0004J0161,2017-01-25 18:28:22,26,Valby St. (10427) - Toftegårds Plads (1183),1,NO_PEEK,659.0,80.0,788.0,46.0,766.0,155.0,76.0,44.0,50.0,62.0,84.156519
