In [1]:
## Load packages
import numpy as np
import pandas as pd
import pickle
import os

import statsmodels.api as sm
import matplotlib.pyplot as plt
plt.style.use('seaborn')

!pip install statsmodels pmdarima &> /dev/null
from pmdarima.arima import auto_arima
# from statsmodels.tsa.arima.model import ARIMA

In [2]:
os.chdir('/content/drive/MyDrive/personal/2022-ssc-case-study-2/data/')
print(os.getcwd())

## Load data
imputed_df = pd.read_csv("imputed_df.csv")
imputed_df.head(5)

/content/drive/MyDrive/personal/2022-ssc-case-study-2/data


Unnamed: 0,PRUID,CDUID,conn_type,SACTYPE,is_rural,date,avg_d_mbps,avg_u_mbps,imputed
0,10,1001,fixed,1,False,2019-01-01,110.345156,72.589665,0
1,10,1001,fixed,1,False,2019-04-01,119.111294,81.951347,0
2,10,1001,fixed,1,False,2019-07-01,119.28984,75.333758,0
3,10,1001,fixed,1,False,2019-10-01,133.98214,75.909098,0
4,10,1001,fixed,1,False,2020-01-01,138.116998,74.064663,0


In [3]:
## We are going to fit an ARIMA model for each Aggregated region.
## First, create DF with validation values (speeds in 2021-4Q):
arima_df = imputed_df[imputed_df['date']=='2021-10-01'].rename(columns={"avg_d_mbps":"d_val", "avg_u_mbps":"u_val"}).drop(columns=["imputed", "date"]).reset_index(drop=True)
arima_df['arima_d_order'] = str((0, 0, 0))
arima_df['arima_u_order'] = str((0, 0, 0))
arima_df['d_pred'] = 0
arima_df['u_pred'] = 0
arima_df['d_is_median'] = 0
arima_df['u_is_median'] = 0
arima_df.head(5)

Unnamed: 0,PRUID,CDUID,conn_type,SACTYPE,is_rural,d_val,u_val,arima_d_order,arima_u_order,d_pred,u_pred,d_is_median,u_is_median
0,10,1001,fixed,1,False,206.369578,102.525161,"(0, 0, 0)","(0, 0, 0)",0,0,0,0
1,10,1001,fixed,1,True,214.096638,110.849005,"(0, 0, 0)","(0, 0, 0)",0,0,0,0
2,10,1001,mobile,1,False,140.187612,24.954678,"(0, 0, 0)","(0, 0, 0)",0,0,0,0
3,10,1001,mobile,1,True,146.146273,17.026045,"(0, 0, 0)","(0, 0, 0)",0,0,0,0
4,10,1001,fixed,3,False,231.132254,113.201756,"(0, 0, 0)","(0, 0, 0)",0,0,0,0


In [4]:
arima_train_df = imputed_df[imputed_df['date'] != '2021-10-01'].drop(columns=["imputed"]).reset_index(drop=True)
arima_train_df.head(5)

Unnamed: 0,PRUID,CDUID,conn_type,SACTYPE,is_rural,date,avg_d_mbps,avg_u_mbps
0,10,1001,fixed,1,False,2019-01-01,110.345156,72.589665
1,10,1001,fixed,1,False,2019-04-01,119.111294,81.951347
2,10,1001,fixed,1,False,2019-07-01,119.28984,75.333758
3,10,1001,fixed,1,False,2019-10-01,133.98214,75.909098
4,10,1001,fixed,1,False,2020-01-01,138.116998,74.064663


In [25]:
pred.values[0]

188.1488456651437

In [30]:
# Fit ARIMA models using auto_arima for each Aggregated region.
# auto_arima tries to find the best arima models

from tqdm.auto import tqdm 

cols = ['PRUID', 'CDUID', 'conn_type', 'SACTYPE', 'is_rural']
speeds = ['avg_d_mbps', 'avg_u_mbps']

for i in tqdm(range(0, arima_df.shape[0])):
    pr, cd, ct, st, ir, *_ = arima_df.iloc[i].tolist()
    agg_row = [pr, cd, ct, st, ir]
    subset_idx = arima_train_df[cols].isin(agg_row).all(axis=1)
    
    for j in range(len(speeds)):    
        is_median = 0
        subset = arima_train_df.loc[subset_idx, speeds[j]].reset_index(drop=True)  # an ARIMA is trained on this set.
        
        ar_model = auto_arima(subset, start_p=1, start_q=1, max_p=3, max_q=3, start_P=0, seasonal=False,  
                                d=None, D=2, # d=None and D=None are the defaults.
                                information_criterion = 'bic', trace=False, error_action='ignore',
                                suppress_warnings=True) ## the order is stored as .order
        ar_model.fit(subset)
        
        subset_median = np.median(subset)
        
        try:
            # some predictions may return 0. In these cases, use the median instead.
            pred = np.max([ar_model.predict(n_periods = 1).values[0], subset_median]) 
            is_median = 1 if pred == subset_median else 0
        except ValueError:
            # ValueError occurs when there is only one observation so that ARIMA cannot be fit. Use median as pred here as well.
            pred = subset_median
            is_median = 1 

        if j == 0: # if download speed
            arima_df.loc[i, 'arima_d_order'] = str(ar_model.order) if is_median == 0 else "median"
            arima_df.loc[i, 'd_pred'] = pred
            arima_df.loc[i, 'd_is_median'] = is_median

        else: # if upload speed
            arima_df.loc[i, 'arima_u_order'] = str(ar_model.order) if is_median == 0 else "median"
            arima_df.loc[i, 'u_pred'] = pred
            arima_df.loc[i, 'u_is_median'] = is_median

arima_df.to_csv('arima_df.csv', index=False)

  0%|          | 0/2771 [00:00<?, ?it/s]

In [None]:
arima_df.to_csv('arima_df.csv', index=False)

print(np.sum(arima_df['d_is_median']))
print(np.sum(arima_df['u_is_median']))
print(arima_df.head(5))
print(arima_df.groupby(by=['arima_d_order']).size().to_frame(name='counts').reset_index())

# compute the performance using RMSE
from sklearn.metrics import mean_squared_error,r2_score
print(np.sqrt(mean_squared_error(arima_df['d_val'], arima_df['d_pred'])))
print(np.sqrt(mean_squared_error(arima_df['u_val'], arima_df['u_pred'])))

In [32]:
## Since most of the Aggregated regions fit best using (1,1,0), we choose this order to be our ARIMA model for all the Aggregated regions.
## At the same time, we want to make predictions for 2026-4Q and 2030-4Q.
## To do so, we make predictions for a total of 36 time points for each Aggregated region.

# Generate timepoint labels:
years = list(range(2022, 2031))
quarters = ['1Q', '2Q', '3Q', '4Q']
year_quarters = []

for y in years:
    for q in quarters:
        year_quarters.append(str(y) + "-" + q)

In [33]:
arima_df2 = pd.read_csv('arima_df.csv')
arima_d_df = arima_df2.loc[:, ['PRUID', 'CDUID', 'conn_type', 'SACTYPE', 'is_rural']]

for yq in year_quarters:
    arima_d_df[yq] = 0.0

arima_u_df = arima_d_df.copy()

In [34]:
# Fit ARIMA(1,1,0) and make forecasts accordingly. This time, we use the full set for fitting the ARIMAs.
arima_full_df = imputed_df.drop(columns=["imputed"]).reset_index(drop=True)

from statsmodels.tsa.arima.model import ARIMA 
from tqdm.auto import tqdm 

cols = ['PRUID', 'CDUID', 'conn_type', 'SACTYPE', 'is_rural']
speeds = ['avg_d_mbps', 'avg_u_mbps']

for i in tqdm(range(0, arima_df2.shape[0])):
    pr, cd, ct, st, ir, *_ = arima_df2.iloc[i].tolist()
    agg_row = [pr, cd, ct, st, ir]
    subset_idx = arima_full_df[cols].isin(agg_row).all(axis=1)
    
    for j in range(len(speeds)):    
        subset = arima_full_df.loc[subset_idx, speeds[j]].reset_index(drop=True)  # an ARIMA is trained on this set.
        
        fin_arima = sm.tsa.arima.ARIMA(subset, order=(1, 1, 0)).fit()

        if j == 0: 
            pred = fin_arima.forecast(36)
            arima_d_df.iloc[i,5:] = pred
            
        else:
            pred = fin_arima.forecast(36)
            arima_u_df.iloc[i,5:] = pred
            
arima_d_df.to_csv('arima_d_forecasted.csv', index=False)
arima_u_df.to_csv('arima_u_forecasted.csv', index=False)
print(arima_d_df.head())
print(arima_u_df.head())

  0%|          | 0/2771 [00:00<?, ?it/s]

  warn('Non-stationary starting autoregressive parameters'


   PRUID  CDUID conn_type  SACTYPE  is_rural     2022-1Q     2022-2Q  \
0     10   1001     fixed        1     False  214.080685  214.080697   
1     10   1001     fixed        1      True  217.128290  217.565748   
2     10   1001    mobile        1     False  142.995207  143.981154   
3     10   1001    mobile        1      True  142.026969  143.711916   
4     10   1001     fixed        3     False  223.198858  225.069466   

      2022-3Q     2022-4Q     2023-1Q  ...     2028-3Q     2028-4Q  \
0  214.080697  214.080697  214.080697  ...  214.080697  214.080697   
1  217.628872  217.637981  217.639295  ...  217.639517  217.639517   
2  143.672658  143.769184  143.738982  ...  143.746180  143.746180   
3  143.022711  143.304621  143.189309  ...  143.222784  143.222784   
4  224.628397  224.732396  224.707874  ...  224.712553  224.712553   

      2029-1Q     2029-2Q     2029-3Q     2029-4Q     2030-1Q     2030-2Q  \
0  214.080697  214.080697  214.080697  214.080697  214.080697  214.08