# Call Centre Department - Predictive Analysis

This notebook applies several forecasting techniques to the number of Calls Offered.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the cleaned call-centre extract, relative to this notebook.
raw_csv_path = '../../Data/Call Centre Dept/Call_Centre_CLEANED_Final.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the frame that was just loaded from the CSV.
df

## Forecasting with Time Series Model for Calls Offered

### Dissect the data: Date and Calls Offered ONLY

In [None]:
# Keep only the two columns the forecasting workflow needs.
forecast_columns = ['Date', 'Calls_Offered']
call_off_df = df[forecast_columns]
call_off_df

In [None]:
# Count missing values per column (isna is the same check as isnull).
call_off_df.isna().sum()

In [None]:
# Show column dtypes and non-null counts for the selected columns.
call_off_df.info()

In [None]:
# Added while debugging SARIMA: turn +/-inf into NaN, then drop every row that has a NaN.
call_off_df = call_off_df.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Parse 'Date' into datetimes and make it the index.
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is the default),
# so it is no longer passed. `assign` builds a new frame instead of writing a column
# into a frame derived from `df`, which avoids chained-assignment warnings.
call_off_df = (
    call_off_df
    .assign(Date=pd.to_datetime(call_off_df['Date']))
    .set_index('Date')
)

In [None]:
# Restrict the data to calendar year 2021, then total Calls_Offered per week.
calls_2021 = call_off_df.loc['2021-01-01':'2021-12-31']
call_off_df = calls_2021.resample('W').sum()
call_off_df.head()

In [None]:
# Inspect the weekly totals between 2021-06-25 and 2021-07-15
# (this window spans the 2021-06-30 / 2021-07-01 train/test split used later).
call_off_df.loc['2021-06-25':'2021-07-15']

In [None]:
#Groupby Date to show Year/Month --> Summarize Calls_Offered
#call_off_df_mth = call_off_df.groupby(call_off_df.Date.dt.to_period('M')).agg('sum')
#call_off_df_mth

In [None]:
#Alternative to find Date and Time

#Normalize Day of Month
#call_off_df['Year_Month'] = call_off_df['Date'] + pd.offsets.MonthEnd(-1) + pd.offsets.Day(1)

#Groupby by Month -> Summarize Calls Offered
#call_off_df.groupby('Year_Month')['Calls_Offered'].sum()

### Check for Common Time Series Patterns

In [None]:
import warnings
import matplotlib.pyplot as plt

In [None]:
# Plot the weekly series together with its monthly mean.
y = call_off_df['Calls_Offered']
monthly_mean = y.resample('M').mean()

fig, ax = plt.subplots(figsize=(20,6))
ax.plot(y, marker='.', linestyle='-', linewidth=0.5, label='Weekly')
ax.plot(monthly_mean, marker='o', markersize=8, linestyle='-', label='Monthly Mean Resample')
ax.set_ylabel('Calls Offered')
ax.legend()

### Decompose Data

In [None]:
import statsmodels.api as sm

In [None]:
#graphs to show seasonal_decompose
#max period is round down(53/2)
def seasonal_decompose(y, period=7):
    """Plot an additive seasonal decomposition of ``y`` and return it.

    Parameters
    ----------
    y : pandas Series or single-column DataFrame
        Time series handed to ``sm.tsa.seasonal_decompose``.
    period : int, default 7
        Seasonal period passed to the decomposition. It was previously
        hard-coded to 7; the default keeps that behaviour.

    Returns
    -------
    The object returned by ``sm.tsa.seasonal_decompose``, so callers that
    assign the result (as the ARIMA section does) no longer receive ``None``.
    """
    decomposition = sm.tsa.seasonal_decompose(y, model='additive', extrapolate_trend='freq', period=period)
    fig = decomposition.plot()
    fig.set_size_inches(14,7)
    plt.show()
    return decomposition

# Trailing ';' keeps the notebook from echoing the returned object.
seasonal_decompose(y);

### Check for Stationarity

In [None]:
#plot for Rolling Statistic to test Stationarity
def test_stationarity(timeseries, title, window=12):
    """Plot a series with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    timeseries : series-like
        Values to plot; wrapped in ``pd.Series`` before the rolling statistics.
    title : str
        Legend label for the raw series.
    window : int, default 12
        Rolling window size (previously hard-coded to 12).
    """
    series = pd.Series(timeseries)

    #Determine rolling statistics
    rolmean = series.rolling(window=window).mean()
    rolstd = series.rolling(window=window).std()

    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(timeseries, label=title)
    ax.plot(rolmean, label='rolling mean')
    # The std is plotted unscaled, so the legend must not claim "(x10)".
    ax.plot(rolstd, label='rolling std')
    ax.legend()

In [None]:
# Show floats with 8 decimals from here on, then plot rolling statistics of the raw series.
pd.set_option('display.float_format', '{:.8f}'.format)
test_stationarity(y, 'raw data')

Based on the above, the series does not seem to be stationary. However, let's perform another stationarity test to evaluate it further.

### Augmented Dickey-Fuller Test

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
#Augmented Dickey-Fuller Test
def ADF_test(timeseries, dataDesc):
    """Run an Augmented Dickey-Fuller test and print a readable report.

    NaNs are dropped from ``timeseries`` before testing. ``dataDesc`` is only
    used in the printed header. For each critical value the report says
    "not stationary" when the critical value is below the test statistic.
    """
    print(' > Is the {} stationary ?'.format(dataDesc))
    outcome = adfuller(timeseries.dropna(), autolag='AIC')
    statistic, p_value, critical = outcome[0], outcome[1], outcome[4]

    print('Test statistic = {:.3f}'.format(statistic))
    print('P-value = {:.3f}'.format(p_value))
    print('Critical values :')
    for level, threshold in critical.items():
        verdict = 'not' if threshold < statistic else ''
        confidence = 100 - int(level[:-1])
        print('\t{}: {} - The data is {} stationary with {}% confidence'.format(level, threshold, verdict, confidence))

In [None]:
# Run the ADF report on the untransformed weekly series.
ADF_test(y, 'raw data')

Based on the 1% critical value, the data cannot be considered stationary at the 99% confidence level. Hence, the data needs to be made stationary.

### Make the Data Stationary

#### Detrending

In [None]:
# Detrend: subtract the 12-period rolling mean and scale by the 12-period rolling std.
rolling_12 = y.rolling(window=12)
y_detrend = (y - rolling_12.mean())/rolling_12.std()

test_stationarity(y_detrend,'de-trended data')
ADF_test(y_detrend,'de-trended data')

#### Differencing

In [None]:
# Differencing: each value minus the value 12 periods earlier.
y_12lag = y.diff(12)

test_stationarity(y_12lag,'12 lag differenced data')
ADF_test(y_12lag,'12 lag differenced data')

As the differencing provides the best results, this will be the transformation that will be used. 

#### Combining Detrending and Differencing

In [None]:
#detrend + differencing
#y_12lag_detrend = y_detrend - y_detrend.shift(12)
#
#test_stationarity(y_12lag_detrend,'12 lag differenced de-trended data')
#ADF_test(y_12lag_detrend,'12 lag differenced de-trended data')

### Create Training & Testing Datasets

In [None]:
# Split the weekly series: everything up to 2021-06-30 trains, the rest tests.
y_to_train = y.loc[:'2021-06-30']
y_to_test = y.loc['2021-07-01':]

# Number of points to forecast = observations not covered by the training slice.
predict_date = len(y) - len(y_to_train)

### Simple Exponential Smoothing (SES)

Suitable for time series data without trend or seasonal components

In [None]:
import numpy as np
from statsmodels.tsa.api import SimpleExpSmoothing

In [None]:
def ses(y, y_to_train,y_to_test,smoothing_level,predict_date):
    """Compare two Simple Exponential Smoothing fits and plot them against ``y``.

    One model uses the fixed ``smoothing_level`` (no optimisation), the other
    lets statsmodels choose it. Both are fitted on ``y_to_train`` and forecast
    ``predict_date`` steps. The RMSE against ``y_to_test`` is printed for each.
    """
    def _rmse(predicted):
        # Root of the mean squared forecast error, rounded for printing.
        return round(np.sqrt(((predicted - y_to_test) ** 2).mean()), 2)

    y.plot(marker='o', color='black', legend=True, figsize=(14, 7))

    # Fixed smoothing level (blue)
    fixed_fit = SimpleExpSmoothing(y_to_train).fit(smoothing_level=smoothing_level,optimized=False)
    fixed_label = r'$\alpha={}$'.format(smoothing_level)
    fixed_fcast = fixed_fit.forecast(predict_date).rename(fixed_label)
    fixed_fcast.plot(marker='o', color='blue', legend=True)
    fixed_fit.fittedvalues.plot(marker='o',  color='blue')
    print('The Root Mean Squared Error of our forecasts with smoothing level of {} is {}'.format(smoothing_level,_rmse(fixed_fcast)))

    # Automatically optimised smoothing level (green)
    auto_fit = SimpleExpSmoothing(y_to_train).fit()
    auto_label = r'$\alpha=%s$'%auto_fit.model.params['smoothing_level']
    auto_fcast = auto_fit.forecast(predict_date).rename(auto_label)
    auto_fcast.plot(marker='o', color='green', legend=True)
    auto_fit.fittedvalues.plot(marker='o', color='green')
    print('The Root Mean Squared Error of our forecasts with auto optimization is {}'.format(_rmse(auto_fcast)))

    plt.show()

In [None]:
# Run the SES comparison with a fixed smoothing level of 0.8.
ses(y, y_to_train,y_to_test,0.8,predict_date)

Based on the visualization results from SES, it is not ideal. 

### Holt's Linear Trend Method

Suitable for time series data with a trend component but without a seasonal component.

In [None]:
from statsmodels.tsa.api import Holt

In [None]:
#holt's linear trend
def holt(y,y_to_train,y_to_test,smoothing_level,smoothing_slope, predict_date):
    """Fit three Holt variants, print their RMSE and plot them against ``y``.

    The variants are linear trend, exponential trend and additive damped trend.
    Each is fitted on ``y_to_train`` and forecast ``predict_date`` steps. The
    RMSE is computed against ``y_to_test``.

    Parameters
    ----------
    y : pandas Series
        Full observed series (plotted in black).
    y_to_train, y_to_test : pandas Series
        Training and hold-out portions of ``y``.
    smoothing_level, smoothing_slope : float
        Level and trend smoothing parameters, passed positionally to ``fit``.
    predict_date : int
        Number of steps to forecast.
    """
    y.plot(marker='o', color='black', legend=True, figsize=(14, 7))

    fit1 = Holt(y_to_train).fit(smoothing_level, smoothing_slope, optimized=False)
    fcast1 = fit1.forecast(predict_date).rename("Holt's linear trend")
    mse1 = ((fcast1 - y_to_test) ** 2).mean()
    # Double-quoted strings: 'Holt''s' is adjacent-literal concatenation and prints "Holts".
    print("The Root Mean Squared Error of Holt's Linear trend {}".format(round(np.sqrt(mse1), 2)))

    fit2 = Holt(y_to_train, exponential=True).fit(smoothing_level, smoothing_slope, optimized=False)
    fcast2 = fit2.forecast(predict_date).rename("Exponential trend")
    mse2 = ((fcast2 - y_to_test) ** 2).mean()
    print("The Root Mean Squared Error of Holt's Exponential trend {}".format(round(np.sqrt(mse2), 2)))

    fit3 = Holt(y_to_train, damped_trend=True).fit(smoothing_level, smoothing_slope)
    fcast3 = fit3.forecast(predict_date).rename("Additive damped trend")
    mse3 = ((fcast3 - y_to_test) ** 2).mean()
    # Report mse3 here; the previous code printed the exponential-trend error (mse2) again.
    print("The Root Mean Squared Error of Holt's Additive damped trend {}".format(round(np.sqrt(mse3), 2)))

    fit1.fittedvalues.plot(marker="o", color='blue')
    fcast1.plot(color='blue', marker="o", legend=True)
    fit2.fittedvalues.plot(marker="o", color='red')
    fcast2.plot(color='red', marker="o", legend=True)
    fit3.fittedvalues.plot(marker="o", color='green')
    fcast3.plot(color='green', marker="o", legend=True)

    plt.show()

In [None]:
# Compare the Holt variants with smoothing level 0.6 and trend smoothing 0.2.
holt(y, y_to_train,y_to_test,0.6,0.2,predict_date)

### SARIMA

Suitable for time series data with trend and/or seasonal components.

In [None]:
import itertools

In [None]:
#sarima
def sarima_grid_search(y,seasonal_period):
    """Grid-search SARIMA orders on ``y`` and report the lowest-AIC combination.

    Every (p, d, q) in {0, 1}^3 is combined with every seasonal (P, D, Q) in
    {0, 1}^3 at the given ``seasonal_period``.

    Parameters
    ----------
    y : pandas Series
        Series to fit.
    seasonal_period : int
        Seasonal period used as the 4th element of ``seasonal_order``.

    Returns
    -------
    tuple or None
        ``(order, seasonal_order, aic)`` of the best model, or ``None`` when no
        candidate could be fitted. Previously nothing was returned and an
        all-failed search raised ``NameError`` on the final print.
    """
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = [(P, D, Q, seasonal_period) for P, D, Q in pdq]

    mini = float('+inf')
    param_mini = None
    param_seasonal_mini = None

    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                mod = sm.tsa.statespace.SARIMAX(y,
                                                order=param,
                                                seasonal_order=param_seasonal,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False)
                results = mod.fit()
            except Exception:
                # Best effort: skip combinations the optimiser cannot fit.
                # Unlike a bare `except`, this lets KeyboardInterrupt through.
                continue

            if results.aic < mini:
                mini = results.aic
                param_mini = param
                param_seasonal_mini = param_seasonal

    if param_mini is None:
        print('No SARIMA parameter combination could be fitted.')
        return None

    print('The set of parameters with the minimum AIC is: SARIMA{}x{} - AIC:{}'.format(param_mini, param_seasonal_mini, mini))
    return param_mini, param_seasonal_mini, mini

In [None]:
# Search SARIMA orders on the full weekly series with a seasonal period of 12.
# NOTE(review): the series is weekly, so 12 is not an obvious season length — confirm the choice.
sarima_grid_search(y,12)

In [None]:
# Call this function after picking the (p,d,q) orders for SARIMA based on AIC
def sarima_eva(y,order,seasonal_order,seasonal_period,pred_date,y_to_test):
    """Fit a SARIMAX model on ``y``, show diagnostics, and score two prediction modes.

    Parameters
    ----------
    y : series passed to ``SARIMAX`` as the data to fit, also plotted as 'observed'.
        NOTE(review): the visible caller passes the full series, so the
        evaluation window is part of the fitted data — confirm this is intended.
    order, seasonal_order : tuples forwarded unchanged to ``SARIMAX``.
    seasonal_period : only used in the printed RMSE messages.
    pred_date : date-like, parsed with ``pd.to_datetime`` and used as the
        ``start`` of both predictions.
    y_to_test : series the predictions are compared against for the RMSE.

    Returns
    -------
    The fitted results object from ``mod.fit()``.
    """
    # fit the model
    mod = sm.tsa.statespace.SARIMAX(y,
                                order=order,
                                seasonal_order=seasonal_order,
                                enforce_stationarity=False,
                                enforce_invertibility=False)

    results = mod.fit()
    # Print only the coefficient table (second table of the summary).
    print(results.summary().tables[1])
    
    results.plot_diagnostics(figsize=(16, 8))
    plt.show()
    
    # The dynamic=False argument ensures that we produce one-step ahead forecasts, 
    # meaning that forecasts at each point are generated using the full history up to that point.
    pred = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=False)
    pred_ci = pred.conf_int()
    y_forecasted = pred.predicted_mean
    # Subtraction of two pandas objects aligns on the index before squaring.
    mse = ((y_forecasted - y_to_test) ** 2).mean()
    print('The Root Mean Squared Error of SARIMA with season_length={} and dynamic = False {}'.format(seasonal_period,round(np.sqrt(mse), 2)))

    # Plot observed data, the one-step-ahead prediction and its confidence band.
    ax = y.plot(label='observed')
    y_forecasted.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
    ax.fill_between(pred_ci.index,
                    pred_ci.iloc[:, 0],
                    pred_ci.iloc[:, 1], color='k', alpha=.2)

    ax.set_xlabel('Date')
    ax.set_ylabel('Calls Offered')
    plt.legend()
    plt.show()

    # A better representation of our true predictive power can be obtained using dynamic forecasts. 
    # In this case, we only use information from the time series up to a certain point, 
    # and after that, forecasts are generated using values from previous forecasted time points.
    # NOTE(review): `full_results=True` is forwarded as an extra keyword — confirm the
    # installed statsmodels version still accepts it.
    pred_dynamic = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=True, full_results=True)
    pred_dynamic_ci = pred_dynamic.conf_int()
    y_forecasted_dynamic = pred_dynamic.predicted_mean
    mse_dynamic = ((y_forecasted_dynamic - y_to_test) ** 2).mean()
    print('The Root Mean Squared Error of SARIMA with season_length={} and dynamic = True {}'.format(seasonal_period,round(np.sqrt(mse_dynamic), 2)))

    # Same plot as above, for the dynamic prediction.
    ax = y.plot(label='observed')
    y_forecasted_dynamic.plot(label='Dynamic Forecast', ax=ax,figsize=(14, 7))
    ax.fill_between(pred_dynamic_ci.index,
                    pred_dynamic_ci.iloc[:, 0],
                    pred_dynamic_ci.iloc[:, 1], color='k', alpha=.2)

    ax.set_xlabel('Date')
    ax.set_ylabel('Calls Offered')

    plt.legend()
    plt.show()
    
    return (results)

The set of parameters with the minimum AIC is: SARIMA(0, 0, 0)x(0, 1, 0, 52) - AIC:2.0

test w/ seasonal period 12: The set of parameters with the minimum AIC is: SARIMA(1, 1, 0)x(1, 1, 0, 12) - AIC:512.9419660646736

In [None]:
# Fit and evaluate SARIMA(1,1,0)x(1,1,0,12), predicting from 2021-07-04 onwards.
# The fitted results object is kept in `model` for the forecasting cell below.
model = sarima_eva(y,(1,1,0),(1,1,0,12),12,'2021-07-04',y_to_test)

#### Making Predictions

In [None]:
def forecast(model,predict_steps,y):
    """Forecast ``predict_steps`` ahead, plot the result and return it as a table.

    ``model`` must expose ``get_forecast(steps=...)``. ``y`` is the observed
    series drawn next to the forecast; its ``name`` labels the y-axis. The
    returned frame has columns 'Date', 'Predicted_Mean', 'Lower Bound' and
    'Upper Bound'.
    """
    future = model.get_forecast(steps=predict_steps)
    mean_path = future.predicted_mean
    # conf_int() is called with its default alpha.
    bounds = future.conf_int()

    # Observed data, forecast mean and shaded confidence band on one axis.
    ax = y.plot(label='observed', figsize=(14, 7))
    mean_path.plot(ax=ax, label='Forecast')
    ax.fill_between(bounds.index, bounds.iloc[:, 0], bounds.iloc[:, 1], color='k', alpha=.25)
    ax.set_xlabel('Date')
    ax.set_ylabel(y.name)

    plt.legend()
    plt.show()

    # Build the forecast table by joining mean and bounds on the date.
    mean_table = mean_path.reset_index()
    mean_table.columns = ['Date','Predicted_Mean']
    bounds_table = bounds.reset_index()
    bounds_table.columns = ['Date','Lower Bound','Upper Bound']
    final_table = mean_table.join(bounds_table.set_index('Date'), on='Date')

    return final_table

In [None]:
# Forecast 52 steps beyond the end of the fitted series and preview the table.
final_table = forecast(model,52,y)
final_table.head()

#### Evaluating SARIMA with MAPE

For each predicted data point, the absolute difference from the corresponding test point was calculated, and then divided by the test point. The average percentage gives the MAPE.

In [None]:
# Refit SARIMA(1,1,0)x(1,1,0,12) so its one-step-ahead predictions can be scored below.
# The seasonal period is the 4th element of `seasonal_order`. SARIMAX has no
# `seasonal_period` argument, so that stray keyword has been removed.
mod = sm.tsa.statespace.SARIMAX(y,
                                order=(1,1,0),
                                seasonal_order=(1,1,0,12))
results = mod.fit()
print(results.summary().tables[1])

results.plot_diagnostics(figsize=(16, 8))
#plt.show()

# The dynamic=False argument ensures that we produce one-step ahead forecasts,
# meaning that forecasts at each point are generated using the full history up to that point.
pred = results.get_prediction(start=pd.to_datetime('2021-07-04'), dynamic=False)
pred_ci = pred.conf_int()
y_forecasted = pred.predicted_mean
y_forecasted


In [None]:
from statsmodels.tsa.stattools import acf

In [None]:
def forecast_accuracy(forecast, actual):
    """Compute a set of accuracy metrics for ``forecast`` against ``actual``.

    Both inputs are converted to 1-D float NumPy arrays first, so pandas Series
    and plain arrays are handled the same way and compared position by position.

    Returns
    -------
    dict with keys 'mape', 'me', 'mae', 'mpe', 'rmse', 'acf1', 'corr', 'minmax'.
    'mape' and 'mpe' are fractions, not percentages.
    """
    # Arrays avoid `series[:, None]`, which current pandas no longer supports.
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    errors = forecast - actual

    mape = np.mean(np.abs(errors)/np.abs(actual))  # mean absolute percentage error
    me = np.mean(errors)                           # ME
    mae = np.mean(np.abs(errors))                  # mean absolute error
    mpe = np.mean(errors/actual)                   # MPE
    rmse = np.mean(errors**2)**.5                  # root mean square
    corr = np.corrcoef(forecast, actual)[0,1]      # corr
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins/maxs)                # minmax
    # Lag-1 autocorrelation of the errors, from the `actual` argument.
    # The previous code silently read the global `y_to_test` here.
    acf1 = acf(errors)[1]                          # ACF1
    return({'mape':mape, 'me':me, 'mae': mae,
            'mpe': mpe, 'rmse':rmse, 'acf1':acf1,
            'corr':corr, 'minmax':minmax})

forecast_accuracy(y_forecasted, y_to_test.values)

Around 1.565% MAPE implies the model is about 97.75% accurate in predicting the next 28 observations.

### ARIMA

DOES NOT WORK

In [None]:
from statsmodels.tsa.arima_model import ARIMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# Plot the weekly Calls Offered series.
# The former leading `call_off_df.head()` was dropped: it was not the last
# expression of the cell, so its result was never displayed.
plt.xlabel('Date')
plt.ylabel('Calls Offered')
plt.plot(call_off_df)

In [None]:
# Rolling mean and standard deviation over a 12-observation window.
rolling_window = call_off_df.rolling(window = 12)
rolling_mean = rolling_window.mean()
rolling_std = rolling_window.std()

plt.plot(call_off_df, color = 'blue', label = 'Original')
plt.plot(rolling_mean, color = 'red', label = 'Rolling Mean')
plt.plot(rolling_std, color = 'black', label = 'Rolling Std')
plt.legend(loc = 'best')
plt.title('Rolling Mean & Rolling Standard Deviation')
plt.show()

In [None]:
# ADF test on the weekly series: print the statistic, the p-value and the critical values.
result = adfuller(call_off_df['Calls_Offered'])
adf_statistic, adf_p_value, adf_critical = result[0], result[1], result[4]

print('ADF Statistic: {}'.format(adf_statistic))
print('p-value: {}'.format(adf_p_value))
print('Critical Values:')
for level in adf_critical:
    print('\t{}: {}'.format(level, adf_critical[level]))

In [None]:
#As p-value is not <0.05 --> take the log to dampen growth as the rolling mean increases
# NOTE(review): a weekly total of 0 would become -inf under np.log — confirm no such weeks exist.
call_off_df_log = np.log(call_off_df)
plt.plot(call_off_df_log)

In [None]:
#Check for stationarity
def get_stationarity(timeseries, column='Calls_Offered', window=12):
    """Plot rolling statistics of ``timeseries`` and print an ADF test report.

    Parameters
    ----------
    timeseries : pandas DataFrame
        Frame holding the series to test.
    column : str, default 'Calls_Offered'
        Column the Dickey-Fuller test runs on (previously hard-coded).
    window : int, default 12
        Rolling window size (previously hard-coded).
    """
    # rolling statistics
    rolling_mean = timeseries.rolling(window=window).mean()
    rolling_std = timeseries.rolling(window=window).std()

    # rolling statistics plot (the unused handles returned by plt.plot are no longer kept)
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey–Fuller test:
    result = adfuller(timeseries[column])
    print('ADF Statistic: {}'.format(result[0]))
    print('p-value: {}'.format(result[1]))
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t{}: {}'.format(key, value))

In [None]:
# Test with subtracting the 12-period rolling mean from the log series.
rolling_mean = call_off_df_log.rolling(window=12).mean()
df_log_minus_mean = (call_off_df_log - rolling_mean).dropna()
get_stationarity(df_log_minus_mean)

As the p-value is below the 0.05 threshold and the ADF statistic is close to the critical values, the time series is stationary.

In [None]:
# Test with subtracting an exponentially weighted mean (half-life 12) from the log series.
rolling_mean_exp_decay = call_off_df_log.ewm(halflife=12, min_periods=0, adjust=True).mean()
df_log_exp_decay = (call_off_df_log - rolling_mean_exp_decay).dropna()
get_stationarity(df_log_exp_decay)

It performed worse than subtracting the rolling mean.

In [None]:
# Test with time shifting: first difference of the log series.
call_off_df_log_shift = call_off_df_log.diff().dropna()
get_stationarity(call_off_df_log_shift)

It performed worse than subtracting the rolling mean. However, it is still more stationary than the original.

In [None]:
#fit to ARIMA - sample test with order = 1,1,1
# NOTE(review): ARIMA is imported from the legacy `statsmodels.tsa.arima_model` module and
# fitted with `disp=-1`. Confirm the installed statsmodels still ships that class. The next
# cell cumulatively sums `results.fittedvalues`, so it presumably expects them on the
# differenced scale — verify before switching to another ARIMA implementation.
decomposition = seasonal_decompose(call_off_df_log) 
model = ARIMA(call_off_df_log, order=(1,1,1))
results = model.fit(disp=-1)
# Overlay the fitted values (red) on the first-differenced log series.
plt.plot(call_off_df_log_shift)
plt.plot(results.fittedvalues, color='red')

In [None]:
#compare with original time series
# Rebuild level predictions: cumulatively sum the fitted differences, add them to the
# first log value, then exponentiate to undo the log transform.
predictions_ARIMA_diff = pd.Series(results.fittedvalues, copy=True)
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_log = pd.Series(call_off_df_log['Calls_Offered'].iloc[0], index=call_off_df_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_log)
# Plot the weekly series the model was built on. `df` is the raw CSV frame with every
# column and an unrelated index, so it is not comparable to the predictions.
plt.plot(call_off_df)
plt.plot(predictions_ARIMA)

In [None]:
#test with auto arima
import pmdarima as pm
from pmdarima.arima import auto_arima, ADFTest

In [None]:
# pmdarima's ADF test with alpha=0.05: ask whether the weekly series should be differenced.
aft_test = ADFTest(alpha=0.05)
aft_test.should_diff(call_off_df)

In [None]:
# Reuse the earlier split under shorter names and plot both parts.
train, test = y_to_train, y_to_test
plt.plot(train)
plt.plot(test)

In [None]:
# Turn +/-inf into NaN and drop missing values from the training series.
train = train.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Stepwise seasonal auto-ARIMA search on the training series: d=1 and D=1 are fixed,
# the seasonal period is m=12, the remaining orders start at 0 and are capped at 5,
# and trace=True prints each candidate that is tried.
arima_model = auto_arima(train.dropna(), start_p=0, d=1, start_q=0,
                        max_p=5, max_d=5, max_q=5, start_P=0,
                        D=1, start_Q=0, max_P=5, max_D=5,
                        max_Q=5, m=12, seasonal=True,
                        error_action='warn', trace=True,
                        suppress_warnings=True, stepwise=True,
                        random_state=20, n_fits=50)

In [None]:
# Show the summary of the model selected by auto_arima.
arima_model.summary()

In [None]:
#test with auto_arima
model_test = pm.auto_arima(train, seasonal=True, m=12)
forecasts = model_test.predict(test.shape[0])

# Take the split position from the data instead of hard-coding 26, so the plot
# still lines up if the train/test boundary or the cleaned training length changes.
n_train = len(train)
x = np.arange(n_train + len(test))
plt.plot(x[:n_train], train, c='blue')
plt.plot(x[n_train:], test, c='green')
plt.plot(x[n_train:], forecasts, c='red')
plt.show()

In [None]:
# Forecast exactly as many periods as the test set holds. The previous fixed 20
# did not match the length of `test.index`. np.asarray drops any index on the
# forecast so the values are placed on the test dates position by position.
n_test = len(test)
prediction = pd.DataFrame(
    {'predicted_calls_offered': np.asarray(arima_model.predict(n_periods=n_test))},
    index=test.index,
)
prediction