In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import json
import os
import glob
import datetime
import warnings
warnings.filterwarnings('ignore')

np.random.seed(7)
from colorama import Fore
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot

In [2]:
weather_columns = ['date', 'avg_wind_speed', 'peak_gust_time', 'precipitation', 'snow', 'snow_depth', 'temp_avg', 'temp_max', 'temp_min', 'tot_sunshine',
                  'dir_fwind_2min', 'dir_fwind_5min', 'speed_fwind_2min', 'speed_fwind_5min', 'fog', 'heavy_fog', 'thunder', 'ice_pellets', 'hail', 'glaze', 'smoke']
traffic_columns = ['PUZone', 'Count', 'PUTime']

In [3]:
from datetime import datetime, timedelta

def datetime_range(end, delta, count):
        
        current = datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
        for i in range(count):
            yield current
            current -= delta

In [4]:
from sklearn.model_selection import cross_validate
def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        "Cross validation Mean Absolute Error:",mae
    )


In [5]:
high_zone = [0] * 73
for h in [1, 15, 24, 25, 26, 30, 31, 37, 41, 43, 44, 46, 62, 69]:
    high_zone[h] = 1

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
class Model():
    
    def __init__(self):
        self.random_forest = RandomForestRegressor(n_estimators=100)
        self.decision_tree = DecisionTreeRegressor()
        self.mlp = MLPRegressor(hidden_layer_sizes=(300,150), max_iter=200,activation ='relu',solver='adam',random_state=1)
        self.xgb = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
        self.lgbm = LGBMRegressor(learning_rate = 0.01, num_iterations = 1000)
        self.svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
#         self.ensemble = VotingRegressor([('lgbm', self.lgbm), ('xgb', self.xgb), ('svr', self.svr)])
        self.ensemble = VotingRegressor([('lgbm', self.lgbm), ('xgb', self.xgb), ('rf', self.random_forest), ('mlp', self.mlp)])
        self.count = 0
        self.to_keep = []
        self.weather_to_keep = []
        self.train_drop_list = ['Count(0)', 'date']
        self.model = self.ensemble
        return 
    
    def train_model(self, traffic, weather):
        df = self.preprocessing(traffic, weather)
        df.to_csv('new_merged.csv', index=False)
        
        train = df.loc[df.date < '2017-05-01']
        valid = df.loc[df.date >= '2017-05-01']
        
        ts_cv = TimeSeriesSplit(
            n_splits=5,
            gap=48,
            max_train_size=10000,
            test_size=1000,
        )
        
        train_y = train['Count(0)']
        
        train_x = train.drop(self.train_drop_list, axis=1)
       
        self.to_keep = train_x.columns
        
        valid_y = valid['Count(0)']
        
        valid_x = valid.drop(self.train_drop_list, axis=1)
       
        self.model.fit(train_x,train_y)
        pred_y = self.model.predict(valid_x)
        valid_y_array = valid_y.values.ravel()
        mae = mean_absolute_error(valid_y_array,pred_y)
        print('Validation MAE: ', mae)
        evaluate(self.model, df.drop(self.train_drop_list, axis=1), df['Count(0)'], cv=ts_cv)
    
    def preprocessing(self, traffic, weather):
        
        
        # extract date and hour
        
        traffic['PUTime'] = pd.to_datetime(traffic['PUTime'])
        traffic['date'] = traffic['PUTime'].dt.date
        traffic['hour'] = traffic['PUTime'].dt.hour
        traffic['weekday'] = (traffic['PUTime'].dt.dayofweek < 5).astype(int)
        
        traffic['peak_hour'] = (traffic['hour'] >= 16) * (traffic['hour'] <= 20) + (traffic['weekday'] == 1) * (traffic['hour'] >= 6) * (traffic['hour'] <= 10) 
        traffic['peak_hour'] = traffic['peak_hour'].astype(int)
        
        # group by zones
        grouped = traffic.groupby(traffic.PUZone)
        dfs = []
        for i in range(0,73):
            dfs.append(grouped.get_group(i))
        
        for zone in range(0,73) :
            dfs[zone].drop(['PUZone'], axis=1,inplace=True)
            
        
        
        # Preprocess weather
        weather.columns = weather_columns

        weather['date'] = pd.DatetimeIndex(weather['date']).date
        
        # handle null values
        weather = weather.dropna(how='all')
        weather = weather.drop(['peak_gust_time', 'temp_avg', 'tot_sunshine', 'thunder', 'ice_pellets', 'hail', 'glaze'], 1)
        
        ### replace with 0
        for col in weather.columns:
            weather[col] = weather[col].fillna(0)
        
                                
        weather = weather[['date']+self.weather_to_keep]
        
        X2 = []
        # add same hour for previous 30 days
        for zone in range(0,73) :
            dfs[zone]["PUTime"] = pd.to_datetime(dfs[zone]["PUTime"]) # Convert column type to be datetime
            indexed_df = dfs[zone].set_index(["PUTime"])           # Create a datetime index
            indexed_df.drop(['date'],axis=1,inplace=True)
            indexed_df.drop(['hour'],axis=1,inplace=True)
            indexed_df.drop(['weekday'], axis=1, inplace=True)
            indexed_df.drop(['peak_hour'], axis=1, inplace=True)
            
            
            n_steps_in = 30
            X_new = pd.DataFrame()
            for k in range(len(indexed_df.columns)) :
                for i in range(0, n_steps_in):
                    for j in range(-1, 1, 1):
                        X_new[indexed_df.columns[k] + '(' + str(-24*i+j) + ')'] = indexed_df.iloc[:,k].shift(24*i-j)
                    if i<=24:
                        X_new[indexed_df.columns[k] + '(' + str(-i) + ')'] = indexed_df.iloc[:,k].shift(i)
                X_new[indexed_df.columns[k] + '(' + str(-24*30) + ')'] = indexed_df.iloc[:,k].shift(24*30)
            X_new = X_new.iloc[:, ::-1]
            X_new['date'] = dfs[zone]['date'].values
            X_new['hour'] = dfs[zone]['hour'].values
            X_new['weekday'] = dfs[zone]['weekday'].values
            X_new['peak_hour'] = dfs[zone]['peak_hour'].values
            
            X2.append(X_new)
            
            
        for zone in range(0,73):
            X2[zone] = X2[zone].dropna()
            
        dfW2 = []
        for zone in range(0,73):
            X2[zone]['date'] = X2[zone]['date'].astype(str)
            weather['date'] = weather['date'].astype(str)
            dfW2.append(X2[zone].merge(weather, on='date'))
        
        
        for zone in range(0,73):
            dfW2[zone]['PUZone'] = zone
            dfW2[zone]['highZone'] = high_zone[zone]
            
        
        dfAll2 = pd.concat(dfW2, axis=0)
        return dfAll2

    def preprocess_test(self, demand, weather, dt, neighbour):
        
        df = pd.DataFrame(columns=self.to_keep)
        df['PUZone'] = [i for i in range(0, 73)]
        df['highZone'] = high_zone
        
        pred_date_time_obj = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
        pred_date, pred_time = dt.split(" ")
        df['weekday'] = (pred_date_time_obj.weekday() < 5)
        df['weekday'] = df['weekday'].astype(int)
        df['hour'] = pred_date_time_obj.hour
        
        df['peak_hour'] = (df['hour'] >= 16) * (df['hour'] <= 20) + (df['weekday'] == 1) * (df['hour'] >= 6) * (df['hour'] <= 10)
        df['peak_hour'] = df['peak_hour'].astype(int)
        
        
        for i in range(1, 25):
            df['Count('+str(-i)+')'] = demand[-i]
        for i in range(1, 30):
            for j in range(-1, 1, 1):
                df['Count('+str(-i*24+j)+')'] = demand[-i*24+j]
        df['Count('+str(-30*24)+')'] = demand[-30*24]
        
        w = weather[-1]
        for i in range(len(weather_columns)):
            if weather_columns[i] in self.to_keep:
                df[weather_columns[i]] = [w[i]] * 73
                df[weather_columns[i]] = df[weather_columns[i]].interpolate()
        
        
        # fill nulls
        df = df.fillna(0)
        return df
    

    def predict(self, demand, weather, dt, neighbors):
        '''
        Parameters
        ----------
        demand: (24*30, 73) numpy array containing last 30 days' hourly demand data, e.g. demand[-1, 3] contains last hour's demand of zone 3
        weather: List of lists containing today's and last 30 days' weather data, e.g., weather[-1] is a list containing today's weather data with [DATE, AWND,...,WT08] as in weather.csv
        dt: date and time of the prediction e.g., "2017-06-01 00:00:00"
        neighbors - Dictionary containing the mapping between each zone and their list of neighbors in zone_neighbors.json

        Return
        ------
        predictions: List of 73 non-negative integers - your trip forecast for each zone in the next hour
        '''
        df = self.preprocess_test(demand, weather, dt, neighbors)
        pred_y = self.model.predict(df)
        
        self.count += 1
        if self.count % 100 == 0:
            print(self.count, ' times done.')
        
        return pred_y

In [7]:
traffic = pd.concat(map(pd.read_csv, ['data/2017-01_1H_zone.csv', 'data/2017-02_1H_zone.csv', 
                                     'data/2017-03_1H_zone.csv', 'data/2017-04_1H_zone.csv', 'data/2017-05_1H_zone.csv']))
traffic.drop(['Unnamed: 0'], axis=1,inplace=True)
        
weather = pd.read_csv('data/weather.csv')

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
import test_pred
model = Model()
model.train_model(traffic, weather)

Validation MAE:  15.373802442663964
Cross validation Mean Absolute Error: [ 7.5738978   7.69735854 23.89183351 23.38049649 24.16663535]


In [10]:
mae, zone_mae = test_pred.eval(model)

100  times done.
200  times done.
300  times done.
400  times done.
500  times done.
600  times done.
700  times done.


In [11]:
print(mae)

14.634484584474082


In [12]:
print(zone_mae)

[22.7422724  23.90387987 16.99620626 16.92737547  4.00765012  0.65267874
  2.53241719  0.38824155  9.90684339  0.99662583 12.45302094  0.35914115
 15.32529766 16.97249328 21.25349087 33.04460538 12.83793238 14.63902905
  3.37287951 17.28139928 11.28314766  3.63200867 16.39136607  0.69178855
 42.74944985 34.44477168 28.65441255 23.08839764 10.2099262   1.34297258
 29.15768238 42.46980723  0.62468194  4.80721663 11.75106775  6.04583583
 18.15146915 33.34876327 19.37343275  4.46716964 17.15883881 28.7552377
  0.51528405 35.55857672 37.52654735 22.96310394 26.45701874  6.17130193
  1.39501973  3.76388391 21.07964747  1.39789367 22.57007629  9.1934046
 14.8564962  17.97650862  2.81654571  5.00682996  8.33122694  4.97045756
 25.47658177  4.78830579 32.85048165 14.35344286  1.83239714 23.88230384
  0.64598489  9.29398455 11.25488203 27.99719555 10.758097    7.52716387
 19.91380548]
