In [1]:
# libraries
import pandas as pd 
import math
import numpy as np 
import matplotlib.pyplot as plt
import datetime
from sklearn import metrics
from statsmodels.tsa.api import ExponentialSmoothing,SimpleExpSmoothing, Holt
from statsmodels.tsa.forecasting.theta import ThetaModel
#%pip install pmdarima
#%pip install sktime
#%pip install ipynb

import warnings
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima_model import ARIMA
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

In [2]:
# To use this notebook for univariate time series analysis:
# 1) The time_dependent_variable and time_column columns must not contain missing values or
#    categorical (string) data.
# 2) This cell requires the file_name (CSV only), time_dependent_variable, time_column, date-time format (frmt)
#    and resample grain (X). After filling in this information correctly, you can run all the cells (Cell ---> Run All).
# 3) Example:
#   file_name               = "JetRail Avg Hourly Traffic Data - 2012-2013.csv"
#   time_dependent_variable = "Count"    (column name in your dataset)
#   time_column             = "Datetime" (column name in your dataset)
#   frmt                    = "%Y-%m-%d"
#   X                       = "D" 


# Path of the CSV file read by the data-loading helpers.
file_name = "JetRail Avg Hourly Traffic Data - 2012-2013.csv"
# Name of the column that holds the values to be analysed/forecast.
time_dependent_variable = "Count"
# Name of the column that holds the timestamps.
time_column = "Datetime"
# Format string handed to pd.to_datetime when parsing time_column.
frmt =  "%Y-%m-%d"
# Resample rule handed to DataFrame.resample.
X = "D"
# Fraction of rows meant for the training set.
split = .9

In [3]:
def data_univariate():
    """Load the configured CSV and return the target series resampled to grain ``X``.

    Reads the module-level settings ``file_name``, ``time_column``,
    ``time_dependent_variable``, ``frmt`` and ``X``.

    Returns:
        pandas.DataFrame: two columns, ``time_column`` followed by
        ``time_dependent_variable``. Each row holds the mean of the target
        over one resample bucket; the index is a fresh RangeIndex.
    """
    raw = pd.read_csv(file_name)
    # Work on an explicit copy so the datetime conversion below does not
    # write into a slice of the raw frame.
    series = raw[[time_column, time_dependent_variable]].copy()
    series[time_column] = pd.to_datetime(series[time_column], format=frmt)
    # set_index moves the timestamps out of the columns. Keeping them both as
    # a column and as the index makes reset_index fail on pandas versions
    # where mean() no longer silently drops the datetime column.
    resampled = series.set_index(time_column).resample(X).mean()
    return resampled.reset_index()

# Load the configured series once; later cells read this module-level `df`.
df = data_univariate()

In [4]:
def data_multivariate():
    """Load every numeric column of the configured CSV, resampled to grain ``X``.

    Reads the module-level settings ``file_name``, ``time_column``, ``frmt``
    and ``X``.

    Returns:
        pandas.DataFrame: ``time_column`` followed by the numeric/boolean
        columns of the file, each averaged per resample bucket; the index is
        a fresh RangeIndex.
    """
    raw = pd.read_csv(file_name)
    raw[time_column] = pd.to_datetime(raw[time_column], format=frmt)
    # Move the timestamps into the index (instead of duplicating them as a
    # column) so reset_index can re-insert them without a name clash.
    indexed = raw.set_index(time_column)
    # Average only columns that have a meaningful mean; string columns would
    # raise on pandas versions that no longer drop them silently.
    numeric = indexed.select_dtypes(include=["number", "bool"])
    return numeric.resample(X).mean().reset_index()


In [5]:
def resumetable(df):
    """Build a per-column overview of ``df`` and print its shape.

    Args:
        df: pandas.DataFrame to summarise. Any index type is accepted.

    Returns:
        pandas.DataFrame with one row per column of ``df`` and the columns
        ``Name``, ``dtypes``, ``Missing``, ``Missing(% of Total values)``,
        ``Uniques``, ``First Value`` and ``Second Value``. The two value
        columns are NaN when ``df`` has no such row.
    """
    print(f"Dataset Shape: {df.shape}")

    def _row_values(position):
        # Positional access: label lookups such as df.loc[0] fail whenever the
        # index is not the default 0..n-1 range (e.g. a DatetimeIndex).
        if len(df) > position:
            return df.iloc[position].values
        return np.nan

    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Missing(% of Total values)'] = (100 * summary['Missing']) / df.shape[0]
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = _row_values(0)
    summary['Second Value'] = _row_values(1)
    return summary


In [6]:
def describe(df):
    """Return the summary statistics of ``df`` with one row per described column."""
    stats = df.describe()
    return stats.transpose()

In [7]:
from sklearn import metrics

def timeseries_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE, MAPE and R2 for a forecast.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        None. The metrics are written to standard output only.
    """
    def mean_absolute_percentage_error(y_true, y_pred):
        """MAPE in percent over the observations whose true value is non-zero."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        # A zero true value has no defined percentage error; dividing by it
        # would turn the whole metric into inf/nan, so those points are skipped.
        nonzero = y_true != 0
        if not nonzero.any():
            return np.nan
        return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Evaluation metric results:-')
    print(f'MSE is : {mse}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(mse)}')
    print(f'MAPE is : {mean_absolute_percentage_error(y_true,y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}',end='\n\n')

In [8]:
# Splits the data into train and test around the caller-supplied split_date
def train_test_split_date(df, split_date):
    """Split ``df`` into rows up to and including ``split_date`` and rows after it.

    Args:
        df: pandas.DataFrame whose index is compared against ``split_date``.
            NOTE(review): this presumably needs a DatetimeIndex; a frame that
            has been through ``reset_index`` must be re-indexed first.
        split_date: boundary value; rows with ``index <= split_date`` go to
            train, the rest to test.

    Returns:
        tuple: ``(train, test)``, both independent copies.
    """
    # The boundary comes from the argument; it was previously overwritten by a
    # hard-coded date, which made the parameter a no-op.
    in_train = df.index <= split_date
    train = df.loc[in_train].copy()
    test = df.loc[~in_train].copy()
    return train, test

In [9]:
# Splits the data into leading train rows and trailing test rows (default: 90% train)
def train_test_split_perc(df, split = .9):
    """Split ``df`` by position into a leading train part and a trailing test part.

    Args:
        df: pandas.DataFrame (or Series) ordered so that earlier rows come first.
        split: fraction of rows, between 0 and 1, that goes to the train part.
            The row count is rounded down.

    Returns:
        tuple: ``(train, test)`` where ``train`` is the first
        ``floor(split * len(df))`` rows and ``test`` is the remainder.

    Raises:
        ValueError: if ``split`` is not within [0, 1].
    """
    # An out-of-range fraction would otherwise produce overlapping or
    # truncated partitions without any error.
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")
    train_size = math.floor(split * len(df))
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]
    return train, test
# Use the `split` fraction from the configuration cell rather than a literal,
# so changing the setting actually changes the partition.
train, test = train_test_split_perc(df, split=split)

# Independent copy of the hold-out rows; the plotting helpers read forecast
# columns from it by method name.
y_hat = test.copy()


In [10]:
def plot(method):
    """Draw the train series, the test series and the ``method`` forecast on one figure.

    Reads the module-level ``train``, ``test`` and ``y_hat`` frames together
    with ``time_column`` and ``time_dependent_variable``; ``method`` is the
    name of the forecast column in ``y_hat``.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    ax.plot(train[time_column], train[time_dependent_variable], label='Train')
    ax.plot(test[time_column], test[time_dependent_variable], label='Test')
    ax.plot(y_hat[time_column], y_hat[method], label=method + ' forecast')
    ax.legend(loc='best')
    ax.set_title(method + ' forecast')
    plt.show()

In [11]:
def plot_with_metric(method):
    """Plot the ``method`` forecast, then print its evaluation metrics against the actual values in ``y_hat``."""
    plot(method)
    actual = y_hat[time_dependent_variable]
    forecast = y_hat[method]
    timeseries_evaluation_metrics_func(actual, forecast)

In [12]:
def date_features(df, time_col=None):
    """Add calendar feature columns derived from a datetime column.

    The columns ``month``, ``year``, ``week_day``, ``quarter``, ``dayofyear``,
    ``dayofmonth`` and ``weekofyear`` are written into ``df`` itself.

    Args:
        df: pandas.DataFrame containing a datetime-typed column.
        time_col: name of that column; defaults to the module-level
            ``time_column`` setting.

    Returns:
        The same ``df`` object, modified in place.
    """
    if time_col is None:
        time_col = time_column
    stamps = df[time_col].dt

    df['month'] = stamps.month
    df['year'] = stamps.year
    df['week_day'] = stamps.weekday
    df['quarter'] = stamps.quarter
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day
    if hasattr(stamps, 'isocalendar'):
        # .dt.weekofyear no longer exists in current pandas; isocalendar().week
        # is its replacement. Cast back to int64 to keep the plain integer
        # dtype (assumes no missing timestamps, as this notebook requires).
        df['weekofyear'] = stamps.isocalendar().week.astype('int64')
    else:
        # Older pandas without isocalendar still provides weekofyear.
        df['weekofyear'] = stamps.weekofyear

    return df

In [13]:
def column_reorder(df):
    """Return ``df`` with the ``time_dependent_variable`` column moved to the last position."""
    leading = []
    for name in df.columns:
        if name != time_dependent_variable:
            leading.append(name)
    ordered = leading + [time_dependent_variable]
    return df[ordered]
