In [114]:
import tabpy_client

# Create a tabpy_client.Client pointed at http://localhost:9004/.
# NOTE(review): `client` is not used by any of the cells visible in this notebook.
client = tabpy_client.Client('http://localhost:9004/')

In [115]:
from itertools import product
from math import sqrt
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error


In [116]:
def prepare_2014_df(data_dir=r'C:\Users\Selva\Downloads\uber-pickups-in-new-york-city',
                    months=('apr', 'may', 'jun', 'jul', 'aug')):
    """Load the monthly 2014 Uber pickup CSVs and stack them into one DataFrame.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing the ``uber-raw-data-<month>14.csv`` files.
        Defaults to the location the notebook was originally written against.
    months : iterable of str, optional
        Month abbreviations to load, in the order they should be stacked.
        Defaults to April through August.

    Returns
    -------
    pandas.DataFrame
        Rows of every requested file, in ``months`` order, with a fresh
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If one of the expected CSV files is missing.
    ValueError
        If ``months`` is empty (``pd.concat`` has nothing to concatenate).
    """
    data_dir = Path(data_dir)

    #Loading datasets
    monthly_frames = [
        pd.read_csv(data_dir / 'uber-raw-data-{}14.csv'.format(month), header=0)
        for month in months
    ]

    #Merging
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack frames and gives the same result with ignore_index=True.
    return pd.concat(monthly_frames, ignore_index=True)

# Uber 2014 dataset: build the merged pickup table with prepare_2014_df()
# and preview its first rows as the cell output.
uber_2014_master = prepare_2014_df()
uber_2014_master.head()


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [117]:
def create_day_series(df):
    """Turn a raw pickup table into a daily trip-count series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Date/Time'`` column whose values
        ``pd.DatetimeIndex`` can parse; each row is counted as one trip.

    Returns
    -------
    pandas.Series
        Number of rows per calendar day, indexed by a daily
        ``DatetimeIndex``. Days inside the covered range that have no rows
        are reported as 0.
    """
    # Grouping by Date/Time to calculate number of trips
    trips_per_timestamp = df.groupby('Date/Time').size()
    # setting Date/Time as index (parsed to real timestamps so it can be resampled)
    trips_per_timestamp.index = pd.DatetimeIndex(trips_per_timestamp.index)
    # Resampling to daily trips; the built-in sum replaces .apply(np.sum),
    # which pandas 2.x flags with a FutureWarning.
    return trips_per_timestamp.resample('1D').sum()

# Daily trip counts for the merged 2014 table; preview the first days as the cell output.
day_df_2014 = create_day_series(uber_2014_master)
day_df_2014.head()

Date/Time
2014-04-01    14546
2014-04-02    17474
2014-04-03    20701
2014-04-04    26714
2014-04-05    19521
Freq: D, dtype: int64

In [118]:
#Defining RMSE
def rmse(x,y):
    """Return the root of ``mean_squared_error(x, y)`` as a float."""
    squared_error_mean = mean_squared_error(x,y)
    return sqrt(squared_error_mean)

#fitting ARIMA model on dataset
def SARIMAX_call(time_series,p_list,d_list,q_list,P_list,D_list,Q_list,s_list,test_period):
    """Grid-search SARIMAX orders and score each one on a hold-out tail.

    The last ``test_period`` observations of ``time_series`` are held out.
    For every combination of the candidate orders a SARIMAX model is fitted
    on the remaining observations, ``test_period`` steps are forecast, and
    the forecast is compared with the held-out values.

    Parameters
    ----------
    time_series : pandas.Series
        Series to model.
    p_list, d_list, q_list : iterable of int
        Candidate non-seasonal (p, d, q) orders.
    P_list, D_list, Q_list : iterable of int
        Candidate seasonal (P, D, Q) orders.
    s_list : iterable of int
        Candidate seasonal period lengths.
    test_period : int
        Number of trailing observations held out for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per combination, in nested-loop order (``p`` outermost,
        ``s`` innermost), with columns
        ``p, d, q, P, D, Q, s, AIC, BIC, RMSE``. Columns are numeric.
        The frame is empty if any candidate list is empty.

    Raises
    ------
    ValueError
        If ``test_period`` does not leave at least one observation on each
        side of the train/test split.
    """
    if not 0 < test_period < len(time_series):
        raise ValueError(
            'test_period must be between 1 and len(time_series) - 1, got %r' % (test_period,)
        )

    #Splitting into training and testing
    # The float cast is loop-invariant, so it is done once up front.
    training_ts = time_series[:-test_period].astype(float)
    testing_ts = time_series[-test_period:]

    columns = ['p','d','q','P','D','Q','s','AIC','BIC','RMSE']
    rows = []

    # product() walks the grid in the same order as seven nested for-loops and
    # sizes the result from the arguments actually passed in, not from
    # module-level variables.
    for p, d, q, P, D, Q, s in product(p_list, d_list, q_list, P_list, D_list, Q_list, s_list):
        #fitting the model
        SARIMAX_model_fit = SARIMAX(training_ts,
                                    order=(p,d,q),
                                    seasonal_order=(P,D,Q,s),
                                    enforce_invertibility=False).fit(disp=0)

        # NOTE(review): the former typ='levels' argument was dropped; it is an
        # option of the legacy ARIMA API and is not a documented SARIMAX
        # forecast parameter - confirm against the installed statsmodels.
        predictions = SARIMAX_model_fit.forecast(steps=test_period)

        #populating error table
        rows.append({
            'p': p, 'd': d, 'q': q,
            'P': P, 'D': D, 'Q': Q, 's': s,
            'AIC': np.round(SARIMAX_model_fit.aic,2),
            'BIC': np.round(SARIMAX_model_fit.bic,2),
            'RMSE': rmse(testing_ts.values,predictions.values),
        })

    # Building the frame once from records avoids chained-indexing writes
    # (error_table['p'][count] = ...), which pandas does not guarantee to
    # reach the underlying frame.
    return pd.DataFrame(rows, columns=columns)
# Candidate orders for the grid search. The non-seasonal lists are passed as
# p_list / d_list / q_list, the seasonal lists as P_list / D_list / Q_list,
# and s_list holds the seasonal period lengths to try.
ns_ar = [0,1,2]
ns_diff = [1]
ns_ma = [0,1,2]
s_ar = [0,1]
s_diff = [0,1] 
s_ma = [1,2]
s_list = [7]

# Score every combination on the last 30 observations of the daily series.
error_table = SARIMAX_call(day_df_2014,ns_ar,ns_diff,ns_ma,s_ar,s_diff,s_ma,s_list,30)




In [119]:
# Show the five grid-search rows with the lowest hold-out RMSE (ascending sort).
error_table.sort_values(by='RMSE').head(5)

Unnamed: 0,p,d,q,P,D,Q,s,AIC,BIC,RMSE
37,1,1,1,1,0,2,7,2348.9,2365.73,1751.0
36,1,1,1,1,0,1,7,2345.86,2359.88,2177.15
69,2,1,2,1,0,2,7,2344.2,2366.63,2437.93
68,2,1,2,1,0,1,7,2341.01,2360.64,2985.87
61,2,1,1,1,0,2,7,2349.48,2369.11,3043.92


In [120]:
#Predicting values using the fitted model
def predict(time_series,p,d,q,P,D,Q,s,n_days,conf):
    """Fit one SARIMAX model and forecast the held-out tail of the series.

    The last ``n_days`` observations of ``time_series`` are dropped, a
    SARIMAX model with the given orders is fitted on what remains, and
    ``n_days`` steps are forecast from the end of that training slice. The
    forecast therefore covers the same positions as the dropped tail.

    Parameters
    ----------
    time_series : pandas.Series
        Series to model.
    p, d, q : int
        Non-seasonal order.
    P, D, Q, s : int
        Seasonal order and seasonal period length.
    n_days : int
        Number of trailing observations to hold out and forecast.
    conf : float
        Kept so existing callers continue to work. The original code
        forwarded ``alpha=1-conf`` to ``forecast()``.
        NOTE(review): ``forecast()`` yields point forecasts, so this value
        is not expected to change the result - confirm; interval bounds
        would need ``get_forecast(...).conf_int(alpha=...)``.

    Returns
    -------
    list of float
        The ``n_days`` point forecasts, in chronological order.

    Raises
    ------
    ValueError
        If ``n_days`` does not leave at least one observation to train on
        and at least one step to forecast.
    """
    if not 0 < n_days < len(time_series):
        raise ValueError(
            'n_days must be between 1 and len(time_series) - 1, got %r' % (n_days,)
        )

    #Splitting off the training part (the tail is only held out, never read)
    training_ts = time_series[:-n_days]

    #fitting the model
    SARIMAX_model_fit = SARIMAX(training_ts.astype(float),
                                order=(p,d,q),
                                seasonal_order=(P,D,Q,s),
                                enforce_invertibility=False).fit(disp=0)

    #Predicting
    # The original wrapped the forecast in a DataFrame and re-indexed it with
    # `training_ts.index.max()+1` before discarding the index; Timestamp + int
    # raises on current pandas, and only the values were ever returned.
    forecast_values = SARIMAX_model_fit.forecast(steps=n_days)
    return np.asarray(forecast_values, dtype=float).tolist()



In [121]:
# Forecast 5 steps with order (0,1,0) and seasonal order (0,1,2,7).
# predict() holds out the last 5 observations of day_df_2014 and forecasts
# from the end of the remaining data.
# NOTE(review): these orders differ from the lowest-RMSE row shown by the
# error-table cell, (1,1,1) x (1,0,2,7) - confirm this choice is intentional.
prediction = predict(day_df_2014,0,1,0,0,1,2,7,5,0.80)
print(prediction)

[27194.372871561212, 29130.049721905645, 27968.055949968868, 23871.56036396292, 18899.43968309379]


In [122]:
# Read the raw August and September 2014 pickup files, then replace each raw
# table with its daily trip-count series. After this cell both names refer to
# Series, not DataFrames.
# NOTE(review): the paths are absolute and machine-specific; the September
# file is not among the months loaded by prepare_2014_df().
uber_2014_aug=pd.read_csv(r'C:\Users\Selva\Downloads\uber-pickups-in-new-york-city\uber-raw-data-aug14.csv',header=0)
uber_2014_sep=pd.read_csv(r'C:\Users\Selva\Downloads\uber-pickups-in-new-york-city\uber-raw-data-sep14.csv',header=0)
uber_2014_aug=create_day_series(uber_2014_aug)
uber_2014_sep=create_day_series(uber_2014_sep)

In [124]:
def actual_values():
    """Return the values of ``uber_2014_aug`` followed by the first five values of ``uber_2014_sep``.

    Both names are read from module scope at call time.
    """
    first_sep_days = uber_2014_sep[:5]
    return [*uber_2014_aug, *first_sep_days]

In [165]:
# Write a two-column CSV: a 1..36 counter next to the August daily counts
# followed by the 5 forecast values.
# NOTE(review): range(1,37) assumes the August series has 31 entries - confirm.
# NOTE(review): `prediction` comes from predict(), which forecasts the last 5
# held-out observations of day_df_2014; here those values are placed after the
# complete August series. Confirm the intended alignment.
pd.concat([pd.Series(range(1,37)),(pd.Series(list(uber_2014_aug)+prediction))],axis=1).to_csv(r'C:\Users\Selva\Downloads\arima_forecast.csv')

In [166]:
# Write a two-column CSV: a 1..36 counter next to the list returned by
# actual_values() (August daily counts plus the first 5 September values).
# NOTE(review): range(1,37) assumes actual_values() returns 36 entries - confirm.
pd.concat([pd.Series(range(1,37)),pd.Series(actual_values())],axis=1).to_csv(r'C:\Users\Selva\Downloads\actuals.csv')