In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from matplotlib.pylab import rcParams
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import load_model

You can download the dataset here: [dataset](https://drive.google.com/file/d/1emopjfEkTt59jJoBH9L9bSdmlDC4AR87/view?usp=sharing)

In [None]:
# Read the six monthly raw pickup files (April to September 2014), one frame per month.
april, may, june, july, aug, sept = (
    pd.read_csv('../input/uber-pickups-in-new-york-city/uber-raw-data-' + month_tag + '14.csv')
    for month_tag in ('apr', 'may', 'jun', 'jul', 'aug', 'sep')
)

In [None]:
april.shape

In [None]:
# Stack the six monthly frames into one table. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported way to do this. ignore_index=True gives
# the combined frame a unique RangeIndex instead of six repeated 0..n-1 runs.
data = pd.concat([april, may, june, july, aug, sept], ignore_index=True)

In [None]:
data

In [None]:

# Parse the raw 'Date/Time' strings (month/day/year hour:minute:second) into datetimes.
# NOTE(review): this assigns a plain attribute on the DataFrame object, not a
# 'Timestamp' column (pandas warns about creating columns this way). The later
# drop(...) calls in this notebook do not list a 'Timestamp' column, so switching
# to data['Timestamp'] = ... would change which columns survive those drops.
data.Timestamp = pd.to_datetime(data['Date/Time'],format='%m/%d/%Y %H:%M:%S')

In [None]:
# Derive calendar features from the parsed timestamps held in data.Timestamp.
# Calendar date without the time of day; used later as the daily grouping key.
data['date_only'] = data.Timestamp.dt.date
# Full timestamp, stored as a real column.
data['date'] = data.Timestamp
# Month number.
data['month'] = data.Timestamp.dt.month
# Day of week as a number (pandas convention: Monday=0) and as a name.
data['dow_num'] = data.Timestamp.dt.dayofweek
data['dow_name'] = data.Timestamp.dt.day_name()
# Day of the month and hour of the day.
data['month_day_num'] = data.Timestamp.dt.day
data['hours'] = data.Timestamp.dt.hour


In [None]:
data

In [None]:
data['Base'].value_counts()

In [None]:
data.groupby('hours')['hours'].count().sort_values(ascending=False)


In [None]:
## peak days

# Count trips per weekday. Indexing on (dow_num, dow_name) keeps the bars in
# numeric weekday order while still showing the readable day name.
uber_weekdays = data.pivot_table(index=['dow_num','dow_name'], values='Base', aggfunc='count')
uber_weekdays.plot(kind='bar', figsize=(15,8))
plt.ylabel('Total Trips')
plt.xlabel('Day')
plt.title('Trips by Week Day');


We can see that there are more trips on Thursdays and Fridays.

In [None]:
## peak hours

# Trips per hour of the day, drawn as a bar chart.
uber_hour = data.pivot_table(values='Base', index=['hours'], aggfunc='count')
uber_hour.plot.bar(figsize=(8, 6)).set(ylabel='Total Trips', title='Trips by Hour');

We can observe that more people take rides in the evening, around 5 pm.

In [None]:
# Trips per value of the 'Base' column, drawn as a bar chart.
# The variable keeps its existing name even though it holds per-Base counts.
uber_monthdays = data.pivot_table(index=['Base'], values='date',
                                  aggfunc='count')
uber_monthdays.plot(kind='bar', figsize=(8,6))
plt.ylabel('Total Trips')
plt.title('Trips by Base');

## Splitting data

In [None]:
# Coordinates are not used in the forecasting sections. errors='ignore' keeps
# the cell re-runnable: a second execution no longer raises KeyError.
data.drop(columns=['Lat', 'Lon'], inplace=True, errors='ignore')

In [None]:
data

In [None]:
x = data.groupby('date_only').count()

In [None]:
x_tsf = x.copy()

All the columns have the same value because we just counted the number of trips per day, so the quantities are identical in every column. <br/>
Let's delete all the columns except one.

In [None]:
# Every remaining column carries the same per-day count, so drop the redundant
# ones. Rebinding with errors='ignore' (instead of an in-place drop) lets the
# cell be executed more than once without raising KeyError.
x_tsf = x_tsf.drop(
    columns=['Date/Time', 'Base', 'month', 'dow_num', 'dow_name', 'month_day_num', 'hours'],
    errors='ignore',
)

In [None]:
x_tsf

In [None]:
round(0.9*len(x_tsf))

In [None]:
# Chronological 90/10 split of the daily counts. The boundary is derived from
# the data length, and both slices share it, so no day falls into a gap
# between the training and test sets.
split_idx = round(0.9 * len(x))
train_ts = x.iloc[:split_idx]
test_ts = x.iloc[split_idx:]

In [None]:
# Overlay the train and test portions of the daily trip counts on one axes.
train_ts['date'].plot.line(figsize=(15, 8), title='Daily Trip', fontsize=12)
test_ts['date'].plot.line(figsize=(15, 5), title='Daily Trip', fontsize=12)
plt.xlabel('Month')
plt.ylabel('Total Trips')
plt.show()

## Holt-Winters seasonal method

In [None]:
# Holt-Winters with additive trend and additive seasonality over a
# 7-observation season, fitted on the training counts and forecast over the
# length of the test set.
fit1 = ExponentialSmoothing(
    train_ts['date'].to_numpy(),
    trend='add',
    seasonal='add',
    seasonal_periods=7,
).fit()
hat_avg = test_ts.copy()
hat_avg['Holt_Winter'] = fit1.forecast(len(test_ts))

In [None]:
# Train, test and Holt-Winters forecast on a single chart.
plt.figure(figsize=(15, 5))
for part, size, tag in ((train_ts, (15, 8), 'train'), (test_ts, (15, 5), 'test')):
    part['date'].plot.line(figsize=size, fontsize=12, label=tag)
plt.plot(hat_avg['Holt_Winter'], label='Holt_Winter')
plt.legend(loc='best')
plt.xlabel('Months')
plt.ylabel('Total Trips')
plt.show()

In [None]:

plt.style.use('default')

# Decompose the training counts with a 30-observation cycle. `period` is the
# current name of the keyword formerly called `freq`. DecomposeResult.plot()
# opens its own figure, so the size is set on the figure it returns rather
# than on a separate (otherwise blank) plt.figure().
decomp_fig = sm.tsa.seasonal_decompose(train_ts['date'].values, period=30).plot()
decomp_fig.set_size_inches(16, 8)
plt.show()

# Augmented Dickey-Fuller test on the full daily series; report the outcome
# instead of computing it silently.
result = sm.tsa.stattools.adfuller(x_tsf['date'])
print('ADF statistic: {:.4f}, p-value: {:.4f}'.format(result[0], result[1]))

In [None]:
# Holt's linear-trend smoothing with fixed level (0.3) and slope (0.1) weights,
# forecast over the length of the test set.
# NOTE(review): newer statsmodels releases spell the second keyword
# `smoothing_trend`; confirm against the installed version.
fit1 = Holt(train_ts['date'].to_numpy()).fit(smoothing_level=0.3, smoothing_slope=0.1)
hat_avg_1 = test_ts.copy()
hat_avg_1['Holt_linear'] = fit1.forecast(len(test_ts))

In [None]:
# Train, test and Holt linear forecast on a single chart.
plt.figure(figsize=(16, 5))
for part, size, tag in ((train_ts, (15, 8), 'train'), (test_ts, (15, 5), 'test')):
    part['date'].plot.line(figsize=size, fontsize=12, label=tag)
plt.plot(hat_avg_1['Holt_linear'], label='Holt_linear')
plt.legend(loc='best')
plt.show()


##  ARIMA

In [None]:
rcParams['figure.figsize']=(20,10)

# 24-observation rolling mean and standard deviation of the daily counts.
rolmean = x_tsf['date'].rolling(24).mean()
rolstd = x_tsf['date'].rolling(24).std()

# Plot rolling statistics. The rolling std is drawn as well, so the chart
# matches its title and the other rolling-statistics cells in this notebook.
x_tsf['date'].plot(kind='line', color = "blue", label = "Actual")
rolmean.plot(kind='line', color = "brown", label = "Rolling Mean")
rolstd.plot(kind='line', color = "black", label = "Rolling Std")
plt.legend(loc = "best")
plt.title("Rolling Mean and Standard Deviation")
plt.show(block = False)

In [None]:

# Natural log of the daily counts for the training and validation parts.
Train_log, valid_log = (np.log(part['date']) for part in (train_ts, test_ts))

In [None]:
# 24-observation moving average of the log series, drawn over the series itself.
moving_avg = Train_log.rolling(window=24).mean()
Train_log.plot.line(figsize=(15, 8), fontsize=12, color='green', label='Training_log')
moving_avg.plot(figsize=(15, 5), fontsize=12, color='blue', label='Moving_avg')


In [None]:

# Log series minus its moving average; the first rows, where the rolling
# window is not yet full, are NaN and get removed.
train_log_moving_diff = (Train_log - moving_avg).dropna()

In [None]:

# Rolling mean / std (24 observations) of the detrended log series.
rolmean = train_log_moving_diff.rolling(24).mean()
rolstd = train_log_moving_diff.rolling(24).std()

# Draw the series and both rolling statistics on one chart.
for curve, colour, name in (
    (train_log_moving_diff, "blue", "Actual"),
    (rolmean, "brown", "Rolling Mean"),
    (rolstd, "black", "Rolling Std"),
):
    curve.plot(kind='line', color=colour, label=name)
plt.legend(loc="best")
plt.title("Rolling Mean and Standard Deviation")
plt.show(block=False)

In [None]:
# First difference of the log series (each day minus the previous day).
train_log_diff = Train_log - Train_log.shift(1)

# Rolling mean / std (24 observations) of the differenced series.
rolmean = train_log_diff.rolling(24).mean()
rolstd = train_log_diff.rolling(24).std()

# Draw the series and both rolling statistics on one chart.
for curve, colour, name in (
    (train_log_diff, "blue", "Actual"),
    (rolmean, "brown", "Rolling Mean"),
    (rolstd, "black", "Rolling Std"),
):
    curve.plot(kind='line', color=colour, label=name)
plt.legend(loc="best")
plt.title("Rolling Mean and Standard Deviation")
plt.show(block=False)

In [None]:
# Decompose the log training series into trend, seasonal and residual parts.
# `period` is the current name of the keyword formerly called `freq`.
# NOTE(review): the series has one value per day, so period=24 models a
# 24-day cycle; confirm this is intended rather than a weekly period of 7.
decomposition = seasonal_decompose(Train_log.values, period=24)
plt.style.use('default')
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

In [None]:
# The log-transformed training series on its own.
plt.figure(figsize=(10, 3))
Train_log.plot.line(label='Original')
plt.legend(loc='best')

In [None]:
# One panel per decomposition component. The grid has exactly three rows so
# no blank panel is left at the bottom of the figure.
plt.figure(figsize=(10, 9))
for position, (component, name) in enumerate(
    ((trend, 'Trend'), (seasonal, 'Seasonal'), (residual, 'Residuals')), start=1
):
    plt.subplot(3, 1, position)
    plt.plot(component, label=name)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Residual component as a one-column frame (column 0), indexed by the training
# dates under the index name 'date'; rows where the residual is NaN are dropped.
train_log_decompose = pd.DataFrame(
    residual, index=Train_log.index.rename('date')
).dropna()

In [None]:
# Rolling mean / std (24 observations) of the decomposition residual.
rolmean = train_log_decompose[0].rolling(24).mean()
rolstd = train_log_decompose[0].rolling(24).std()

# Draw the residual and both rolling statistics on one chart.
for curve, colour, name in (
    (train_log_decompose[0], "blue", "Actual"),
    (rolmean, "brown", "Rolling Mean"),
    (rolstd, "black", "Rolling Std"),
):
    curve.plot(kind='line', color=colour, label=name)
plt.legend(loc="best")
plt.title("Rolling Mean and Standard Deviation")
plt.show(block=False)


In [None]:
from statsmodels.tsa.stattools import acf, pacf

# Autocorrelation and partial autocorrelation (OLS estimate) of the
# differenced log series, up to lag 25.
lag_acf, lag_pacf = (
    acf(train_log_diff.dropna(), nlags=25),
    pacf(train_log_diff.dropna(), nlags=25, method="ols"),
)

In [None]:
# ACF
plt.figure(figsize=(15, 5))
plt.style.use("fivethirtyeight")
plt.plot(lag_acf)
# Dashed guides at zero and at +/-1.96/sqrt(n), n being the number of
# non-missing differenced observations.
conf_bound = 1.96 / np.sqrt(len(train_log_diff.dropna()))
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle="--", color="gray")
plt.title("Autocorrelation Function")
plt.show()

In [None]:
# PACF
plt.figure(figsize=(15, 5))
plt.plot(lag_pacf)
# Dashed guides at zero and at +/-1.96/sqrt(n), n being the number of
# non-missing differenced observations.
conf_bound = 1.96 / np.sqrt(len(train_log_diff.dropna()))
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle="--", color="gray")
plt.title("Partial Autocorrelation Function")
plt.show()


## AR Model

In [None]:

# Order (2, 1, 0): two AR terms on the once-differenced log series; q is 0,
# so there is no moving-average term.
model = ARIMA(Train_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)

# Fitted values against the differenced log series.
plt.figure(figsize=(15, 6))
train_log_diff.dropna().plot.line(label="Actual")
results_AR.fittedvalues.plot.line(color='red', label='Predictions')
plt.legend(loc='upper right')

In [None]:

# Order (2, 1, 1): same as the AR model above plus one moving-average term.
model = ARIMA(Train_log, order=(2, 1, 1))
results_ARIMA = model.fit(disp=-1)

# Fitted values against the differenced log series.
plt.figure(figsize=(16, 8))
train_log_diff.dropna().plot.line(label='Original')
results_ARIMA.fittedvalues.plot.line(color='red', label='Predicted')
plt.legend(loc='best')
plt.show()

### Converting predictions back to the original scale

In [None]:

def check_prediction_diff(predict_diff, given_set):
    """Plot a forecast rebuilt from predicted log-differences against the actual counts.

    The forecast starts from the first observed value of ``given_set['date']``
    and adds the running sum of the predicted log-differences, then maps the
    result back to the original scale with ``exp``.

    Parameters
    ----------
    predict_diff : array-like or pandas.Series
        Predicted changes of the log series, in time order. They are applied
        by position, so the index of ``predict_diff`` does not need to match
        ``given_set``. If fewer changes than needed are supplied, the last
        cumulative change is held for the remaining steps.
    given_set : pandas.DataFrame
        Frame whose 'date' column holds the observed counts.

    Raises
    ------
    ValueError
        If ``given_set`` has no rows.

    The figure title shows the root-mean-squared error between the rebuilt
    forecast and the observations.
    """
    actual = given_set['date']
    n_obs = len(actual)
    if n_obs == 0:
        raise ValueError("given_set must contain at least one observation")

    # Running sum of predicted log-changes. Step 0 is the base value itself,
    # so the k-th forecast uses the sum of the first k changes.
    cum_change = np.asarray(predict_diff, dtype=float).cumsum()
    offsets = np.zeros(n_obs)
    usable = min(len(cum_change), n_obs - 1)
    if usable > 0:
        offsets[1:usable + 1] = cum_change[:usable]
        # No more predicted changes: keep the last cumulative level.
        offsets[usable + 1:] = cum_change[usable - 1]

    # .iloc[0] is positional; the index holds dates, not integers.
    base_log = np.log(actual.iloc[0])
    predict = pd.Series(np.exp(base_log + offsets), index=given_set.index)

    # RMSE = sqrt(mean((prediction - actual)^2))
    rmse = np.sqrt(np.mean((predict.to_numpy() - np.asarray(actual, dtype=float)) ** 2))

    plt.plot(actual, label = "Given set")
    plt.plot(predict, color = 'red', label = "Predict")
    plt.legend(loc= 'best')
    plt.title('RMSE: %.4f'% rmse)
    plt.show()

In [None]:
def check_prediction_log(predict_log, given_set):
    """Plot log-scale predictions, mapped back with ``exp``, against the actual counts.

    Parameters
    ----------
    predict_log : array-like or pandas.Series
        Predictions on the natural-log scale, one per row of ``given_set``.
    given_set : pandas.DataFrame
        Frame whose 'date' column holds the observed counts.

    The figure title shows the root-mean-squared error between the
    back-transformed predictions and the observations, compared position by
    position.
    """
    actual = given_set['date']
    predict = np.exp(predict_log)

    # RMSE = sqrt(mean((prediction - actual)^2))
    errors = np.asarray(predict, dtype=float) - np.asarray(actual, dtype=float)
    rmse = np.sqrt(np.mean(errors ** 2))

    plt.plot(actual, label = "Given set")
    plt.plot(predict, color = 'red', label = "Predict")
    plt.legend(loc= 'best')
    plt.title('RMSE: %.4f'% rmse)
    plt.show()

In [None]:
# Ask the fitted ARIMA model for predictions starting at the first position
# after the training data, then compare them with the test set.
plt.figure(figsize=(16, 8))
ARIMA_predict_diff = results_ARIMA.predict(len(train_ts))
check_prediction_diff(ARIMA_predict_diff, test_ts)

The RMSE is high, so we should try another model.

## Exponential smoothing

In [None]:
# Simple exponential smoothing with a fixed smoothing level of 0.7 (no
# optimisation), forecast over the length of the test set.
fit2 = SimpleExpSmoothing(train_ts['date'].to_numpy()).fit(smoothing_level=0.7, optimized=False)
hat_avg = test_ts.copy()
hat_avg['SES'] = fit2.forecast(len(test_ts))

plt.figure(figsize=(15, 8))
train_ts['date'].plot.line(figsize=(15, 8), label='Train')
test_ts['date'].plot.line(label='Validation')
plt.plot(hat_avg['SES'], label='Simple Exponential Smoothing', color='green')
plt.legend(loc='best')

In [None]:
# RMSE of the SES forecast against the test counts.
hat = hat_avg['SES'].to_numpy().tolist()
mse = mean_squared_error(test_ts['date'], hat)
rmse = np.sqrt(mse)
rmse

This is better than the AR model.

## MA forecast with 10 observations

In [None]:

# Flat forecast: the mean of the last 10 training observations, repeated for
# every row of the test set.
last_window_mean = train_ts['date'].rolling(10).mean().iloc[-1]
hat_avg = test_ts.copy()
hat_avg['moving_average_forecast'] = last_window_mean

plt.figure(figsize=(15, 5))
train_ts['date'].plot.line(figsize=(15, 8), label='Train')
test_ts['date'].plot.line(label='Validation')
plt.plot(hat_avg['moving_average_forecast'], label='Moving Average Forecast with 10 Observations')
plt.legend(loc='best')
plt.show()

## MA forecast with 20 observations

In [None]:
# Flat forecast: the mean of the last 20 training observations, repeated for
# every row of the test set.
hat_avg = test_ts.copy()
hat_avg['moving_average_forecast'] = train_ts['date'].rolling(20).mean().iloc[-1]
plt.figure(figsize = (15,5))
train_ts['date'].plot(kind='line',figsize=(15,8), label = 'Train')
test_ts['date'].plot(kind='line', label = 'Validation')
# The legend label states the window actually used (20).
plt.plot(hat_avg['moving_average_forecast'], label = 'Moving Average Forecast with 20 Observations')
plt.legend(loc = 'best')
plt.show()

In [None]:

# RMSE of the most recent moving-average forecast against the test counts.
mse = mean_squared_error(test_ts['date'], hat_avg['moving_average_forecast'])
rmse = np.sqrt(mse)
rmse


In [None]:
# Simple exponential smoothing again, this time with a fixed level of 0.8.
fit2 = SimpleExpSmoothing(train_ts['date'].to_numpy()).fit(smoothing_level=0.8, optimized=False)
hat = test_ts.copy()
hat['SES'] = fit2.forecast(len(test_ts))

plt.figure(figsize=(15, 8))
train_ts['date'].plot.line(figsize=(15, 8), label='Train')
test_ts['date'].plot.line(label='Validation')
plt.plot(hat['SES'], label='Simple Exponential Smoothing')
plt.legend(loc='best')

The error is still high; let's look at another model.

## SARIMAX

In [None]:
def day_series_creator(dataframe):
    """Count rows (trips) per calendar day.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'date' column of pickup timestamps (any values that
        ``pd.DatetimeIndex`` accepts).

    Returns
    -------
    pandas.Series
        Number of rows per day, indexed by a daily DatetimeIndex. Days inside
        the covered range that have no rows are reported as 0.
    """
    # Number of rows sharing each distinct timestamp.
    per_timestamp = dataframe.groupby('date').size()
    # Resampling needs a DatetimeIndex.
    per_timestamp.index = pd.DatetimeIndex(per_timestamp.index)
    # Add the per-timestamp counts up into one value per day.
    return per_timestamp.resample('1D').sum()

In [None]:
day_df_2014 = day_series_creator(data)
day_df_2014.head()


In [None]:
def initial_plots(time_series, num_lag):
    """Show the series, its autocorrelation and its partial autocorrelation.

    Parameters
    ----------
    time_series : pandas.Series or array-like
        The series to inspect.
    num_lag : int
        Number of lags passed to ``plot_acf`` and ``plot_pacf``.

    Three figures are produced, one per plot. ``plot_acf`` and ``plot_pacf``
    open their own figures, so no figure is created for them here and the
    titles are passed through their ``title`` argument.
    """
    # Original time series
    plt.figure()
    plt.plot(time_series)
    plt.title('Original Uber data across time')

    plot_acf(time_series, lags=num_lag, title='Autocorrelation plot')
    plot_pacf(time_series, lags=num_lag, title='Partial autocorrelation plot')

    plt.show()

    
# Augmented Dickey-Fuller stationarity test on the daily series; element 1 of
# the returned tuple is the p-value.
adf_p_value = adfuller(day_df_2014)[1]
print(f'p-value: {adf_p_value}')

In [None]:

#plotting
initial_plots(day_df_2014, 45)

In [None]:
#plotting 30 observation
initial_plots(day_df_2014, 30)

In [None]:
# First difference of the daily series (diff() defaults to one period).
diff_series = day_df_2014.diff()

# Augmented Dickey-Fuller stationarity test on the differenced series;
# element 1 of the returned tuple is the p-value.
adf_p_value = adfuller(diff_series.dropna())[1]
print(f'p-value: {adf_p_value}')

In [None]:
round(adfuller(diff_series.dropna())[1],2)

In [None]:

initial_plots(diff_series.dropna(), 30)

# ANN 

In [None]:
# Count rows per distinct value of the 'date' timestamp column.
# NOTE(review): the Grouper has no freq, so the groups are individual
# timestamps, not hourly bins; confirm against the "Hourly Trips" chart titles.
uber_count=data.groupby(pd.Grouper(key='date')).count()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print().
uber_count.info()

In [None]:
# All count columns are identical, so drop the listed redundant ones.
# Rebinding with errors='ignore' (instead of an in-place drop) lets the cell
# be executed more than once without raising KeyError.
uber_count = uber_count.drop(
    columns=['Base', 'date_only', 'month', 'dow_num', 'dow_name', 'month_day_num', 'hours'],
    errors='ignore',
)

In [None]:
uber_count

In [None]:
# Chronological 90/10 split. The boundary is derived from the data length,
# and both slices share it, so no row falls into a gap between train and test.
split_at = round(0.9 * len(uber_count))
train = uber_count.iloc[:split_at]
test = uber_count.iloc[split_at:]

In [None]:
# Training counts as a filled area, test counts as a line, on the same axes.
train['Date/Time'].plot.area(figsize=(15, 8), title='Hourly Trips', fontsize=14)
test['Date/Time'].plot.line(figsize=(15, 5), title='Hourly Trips', fontsize=12)
plt.xlabel('Month')
plt.ylabel('Total Trips')
plt.show()

In [None]:

def test_stationary(timeseries):
    """Plot a series together with its 24-observation rolling mean and rolling std.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to inspect; it must support ``.rolling``.
    """
    rolmean = timeseries.rolling(24).mean()
    rolstd = timeseries.rolling(24).std()

    # Draw the three curves in the same order and colours on one chart.
    curves = (
        (timeseries, "blue", "Actual"),
        (rolmean, "brown", "Rolling Mean"),
        (rolstd, "black", "Rolling Std"),
    )
    for curve, colour, name in curves:
        plt.plot(curve, color=colour, label=name)
    plt.legend(loc="best")
    plt.title("Rolling Mean and Standard Deviation")
    plt.show(block=False)

In [None]:
rcParams['figure.figsize']=(20,10)
test_stationary(uber_count['Date/Time'])

In [None]:
# Fit the scaler on the training data only, then apply it to both parts.
sc = MinMaxScaler()
train_sc, test_sc = sc.fit_transform(train), sc.transform(test)

# One-step-ahead pairs: each scaled value is the input, the next one the target.
X_train, y_train = train_sc[:-1], train_sc[1:]
X_test, y_test = test_sc[:-1], test_sc[1:]

In [None]:

K.clear_session()

In [None]:
# One hidden ReLU layer with 9 units on a single input feature, and a linear
# output unit, trained on mean squared error with Adam.
model = Sequential([
    Dense(9, input_dim=1, activation='relu'),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the training loss has not improved for 5 consecutive epochs.
early_stop = EarlyStopping(monitor='loss', patience=5, verbose=1)
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=1,
    verbose=1,
    callbacks=[early_stop],
    shuffle=False,
)

In [None]:
# Predictions on both parts, then the RMSE on the scaled values for each.
y_pred_test_ann = model.predict(X_test)
y_train_pred_ann = model.predict(X_train)

for split_name, truth, guess in (
    ("Train", y_train, y_train_pred_ann),
    ("Test", y_test, y_pred_test_ann),
):
    rmse = np.sqrt(mean_squared_error(truth, guess))
    print("{} : {:0.3f}".format(split_name, rmse))


In [None]:
# Scaled test targets against the network's predictions.
y_pred_test_ANN = model.predict(X_test)
plt.plot(y_test, label='True')
plt.plot(y_pred_test_ANN, label='ANN')
plt.title("ANN's_Prediction")
plt.xlabel('Observation')
# The plotted values are min-max scaled trip counts.
plt.ylabel('Trips (scaled)')
plt.legend()
plt.show()

In [None]:
from sklearn import metrics

# r2_score is the coefficient of determination of a regression, not a
# classification accuracy, so it is reported under its own name and scale.
acc=metrics.r2_score(y_test,y_pred_test_ann)
print("R^2 score of model: ", round(acc, 4))

In [None]:

# Loss of the compiled model on the scaled test pairs, one sample per batch.
score_ann = model.evaluate(X_test, y_test, batch_size=1)
print('ANN: {:f}'.format(score_ann))