# TOWARDS A CLEAR UNDERSTANDING OF RURAL INTERNET: WHAT STATISTICAL MEASURES CAN BE USED TO ASSESS, COMPARE AND FORECAST INTERNET SPEEDS FOR RURAL CANADIAN COMMUNITIES?

The Government of Canada has committed to helping 95% of Canadian households and businesses access high-speed internet at minimum speeds of 50 Mbps download and 10 Mbps upload (hereinafter referred to as the “Commitment”) by 2026, and 100% by 2030. According to the CRTC, currently 45.6% of rural community households have access to the Commitment based on what’s available to them via an Internet Service Provider (e.g. Shaw, Telus, etc.) in their region, rather than what a rural household actually realizes at home in terms of internet speeds. 

https://ssc.ca/en/case-study/towards-a-clear-understanding-rural-internet-what-statistical-measures-can-be-used-assess

## Import libraried and read data

In [1]:
import sys
from google.colab import drive
drive.mount('/content/gdrive')
colab_dir = '/content/gdrive/My Drive/Colab_files/2022_CCS_case_study'
sys.path.append(colab_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install pmdarima
from pmdarima.arima import auto_arima
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.arima.model import ARIMA 

In [3]:
%matplotlib inline
import tqdm
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
plt.style.use('seaborn')
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import mean_squared_error,r2_score
from statsmodels.tsa.arima.model import ARIMA 
import statsmodels.api as sm

In [4]:
#specify minimum or maximum possible values in a forecast
def f(x, a, b):
  x = np.array(x)
  return np.log((x-a)/(b-x))

def finv(x, a, b):
  x = np.array(x)
  return (b * np.exp(x) + a) / (np.exp(x) + 1)

x = [12, 14, 15]
MAX_D = 2000 # 2,000 Mb
MAX_U = 1000
finv(f(x, min(x)-(10e-3), MAX_D), min(x)-(10e-3), MAX_D)

array([12., 14., 15.])

In [10]:
df = pd.read_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step3.csv')
train = pd.read_csv(colab_dir+'/canada_test_imp.csv')
display(train.head(2))
valid = pd.read_csv(colab_dir+'/canada_valid_imp.csv')
display(valid.head(2))
print(train.shape, valid.shape)

Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP,imputed
0,2019-01-01,10,1001,1,False,fixed,110.345156,72.589665,10.505591,6618,550285.0,False
1,2019-01-01,10,1001,1,False,mobile,92.795777,28.8362,65.107692,130,43085.0,False


Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP,imputed
0,2021-10-01,10,1001,1,False,fixed,206.369578,102.525161,11.942926,9584,640785.0,False
1,2021-10-01,10,1001,1,False,mobile,140.187612,24.954678,38.949468,376,111080.0,False


(30481, 12) (2771, 12)


# Make a prediction for all Census devision

In [6]:
n_models = train.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
print(n_models.shape)
print(n_models['count'].value_counts())
n_models.head(2)

(2771, 6)
11    2771
Name: count, dtype: int64


Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count
0,10,1001,1,False,fixed,11
1,10,1001,1,False,mobile,11


## AR Model (using AUTO ARIMA) without covariates

In [None]:
pred = n_models.copy()
pred['avg_d_2026-q1']=0
pred['avg_d_2026-q4']=0
pred['avg_d_2030-q1']=0
pred['avg_d_2030-q4']=0
pred['avg_u_2026-q1']=0
pred['avg_u_2026-q4']=0
pred['avg_u_2030-q1']=0
pred['avg_u_2030-q4']=0
print(pred.shape)

for index, m in tqdm.tqdm(pred.iterrows()):
  colnames = ['CDUID', 'SACTYPE', 'is_rural', 'conn_type']
  m_df = df[df[colnames].eq(m[colnames]).all(axis=1)][['avg_d_mbps_wt', 'avg_u_mbps_wt']]
  #display(m_df)
  y_d =  m_df.iloc[:,0]
  y_u =  m_df.iloc[:,1]

  model_ar = auto_arima(y_d,trace=True, error_action='ignore', start_p=1,start_q=1,max_p=3,max_q=3,
                   suppress_warnings=True,stepwise=False,seasonal=False, verbose = False)
  model_ar.fit(y_d)
  d = model_ar.predict(n_periods=36)
  #print(d)
  model_ar.fit(y_u)
  u = model_ar.predict(n_periods=36)
  pred.iloc[index, 6:] = [d[16], d[19], d[32], d[35], u[16], u[19], u[32], u[35]]
  #break
pred.to_csv(colab_dir+'/pred_ARIMA_2026_2030.csv', index=False)
pred.head(2)

In [None]:
pred.to_csv(colab_dir+'/pred_ARIMA_2026_2030.csv', index=False)
pred.head(2)

Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count,avg_d_2026-q1,avg_d_2026-q4,avg_d_2030-q1,avg_d_2030-q4,avg_u_2026-q1,avg_u_2026-q4,avg_u_2030-q1,avg_u_2030-q4
0,10,1001,1,False,fixed,11,355.116783,381.341489,494.981918,521.206632,146.980864,154.845523,188.925711,196.79037
1,10,1001,1,False,mobile,11,116.822226,113.163025,98.585683,95.497702,23.210006,22.905334,21.630633,21.346693


In [None]:
pred = pd.read_csv(colab_dir+'/pred_ARIMA_2026_2030.csv')

## fixed ARIMA Model without covariates

In [7]:
l = [str(i)+'Q'+str(j) for i in range(2019, 2031) for j in range(1,5)]
print(l)

['2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4', '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2', '2022Q3', '2022Q4', '2023Q1', '2023Q2', '2023Q3', '2023Q4', '2024Q1', '2024Q2', '2024Q3', '2024Q4', '2025Q1', '2025Q2', '2025Q3', '2025Q4', '2026Q1', '2026Q2', '2026Q3', '2026Q4', '2027Q1', '2027Q2', '2027Q3', '2027Q4', '2028Q1', '2028Q2', '2028Q3', '2028Q4', '2029Q1', '2029Q2', '2029Q3', '2029Q4', '2030Q1', '2030Q2', '2030Q3', '2030Q4']


In [17]:
# arima with constraints

pred_d = n_models.copy()
pred_u = n_models.copy()
for col in l:
    pred_d[col] = 0
    pred_u[col] = 0

for index, m in tqdm.tqdm(pred_d.iterrows()):
  colnames = ['CDUID', 'SACTYPE', 'is_rural', 'conn_type']
  m_df = df[df[colnames].eq(m[colnames]).all(axis=1)][['avg_d_mbps_wt', 'avg_u_mbps_wt']]
  y_d =  m_df.iloc[:,0]
  y_u =  m_df.iloc[:,1]

  arima = sm.tsa.arima.ARIMA(f(y_d,min(y_d)-(10e-2), MAX_D), order=(1, 1, 0)).fit()
  d = finv(arima.forecast(36),min(y_d)-(10e-2), MAX_D)

  arima = sm.tsa.arima.ARIMA(f(y_u,min(y_u)-(10e-2), MAX_U), order=(1, 1, 0)).fit()
  u = finv(arima.forecast(36),min(y_u)-(10e-2), MAX_U)

  pred_d.iloc[index, 6:] = np.concatenate((y_d, d), axis=None)
  pred_u.iloc[index, 6:] = np.concatenate((y_u, u), axis=None)
  #break
#pred.to_csv(colab_dir+'/pred_ARIMA_fixed_2026_2030.csv', index=False)
display(pred_d.head(2))
display(pred_u.head(2))

2771it [03:21, 13.76it/s]


Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count,2019Q1,2019Q2,2019Q3,2019Q4,...,2028Q3,2028Q4,2029Q1,2029Q2,2029Q3,2029Q4,2030Q1,2030Q2,2030Q3,2030Q4
0,10,1001,1,False,fixed,11,110.345156,119.111294,119.28984,133.98214,...,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878
1,10,1001,1,False,mobile,11,92.795777,87.178801,92.80575,107.989439,...,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647


Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count,2019Q1,2019Q2,2019Q3,2019Q4,...,2028Q3,2028Q4,2029Q1,2029Q2,2029Q3,2029Q4,2030Q1,2030Q2,2030Q3,2030Q4
0,10,1001,1,False,fixed,11,110.345156,119.111294,119.28984,133.98214,...,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878,207.525878
1,10,1001,1,False,mobile,11,92.795777,87.178801,92.80575,107.989439,...,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647,142.639647


In [25]:
pred_d = pred_d.drop(columns=['count'])
pred_d.columns

Index(['PRUID', 'CDUID', 'SACTYPE', 'is_rural', 'conn_type', '2019Q1',
       '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
       '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2', '2022Q3',
       '2022Q4', '2023Q1', '2023Q2', '2023Q3', '2023Q4', '2024Q1', '2024Q2',
       '2024Q3', '2024Q4', '2025Q1', '2025Q2', '2025Q3', '2025Q4', '2026Q1',
       '2026Q2', '2026Q3', '2026Q4', '2027Q1', '2027Q2', '2027Q3', '2027Q4',
       '2028Q1', '2028Q2', '2028Q3', '2028Q4', '2029Q1', '2029Q2', '2029Q3',
       '2029Q4', '2030Q1', '2030Q2', '2030Q3', '2030Q4'],
      dtype='object')

In [27]:
pred_u = pred_u.drop(columns=['count'])
pred_u.columns

Index(['PRUID', 'CDUID', 'SACTYPE', 'is_rural', 'conn_type', '2019Q1',
       '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
       '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2', '2022Q3',
       '2022Q4', '2023Q1', '2023Q2', '2023Q3', '2023Q4', '2024Q1', '2024Q2',
       '2024Q3', '2024Q4', '2025Q1', '2025Q2', '2025Q3', '2025Q4', '2026Q1',
       '2026Q2', '2026Q3', '2026Q4', '2027Q1', '2027Q2', '2027Q3', '2027Q4',
       '2028Q1', '2028Q2', '2028Q3', '2028Q4', '2029Q1', '2029Q2', '2029Q3',
       '2029Q4', '2030Q1', '2030Q2', '2030Q3', '2030Q4'],
      dtype='object')

In [18]:
# save quarterly download and upload speed for each model: CDUID, SACTYPE, is_rural, conn_type 
pred_d.to_csv(colab_dir+'/pred_d_ARIMA_2019_2030.csv', index=False)
pred_u.to_csv(colab_dir+'/pred_u_ARIMA_2019_2030.csv', index=False)

In [None]:
# example of plotting
# pred_d columns: 
#       'PRUID', 'CDUID', 'SACTYPE', 'is_rural', 'conn_type', 
#       '2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4',
#       '2021Q1', '2021Q2', '2021Q3', '2021Q4', '2022Q1', '2022Q2', '2022Q3', '2022Q4',
#       '2023Q1', '2023Q2', '2023Q3', '2023Q4', '2024Q1', '2024Q2', '2024Q3', '2024Q4',
#       '2025Q1', '2025Q2', '2025Q3', '2025Q4', '2026Q1', '2026Q2', '2026Q3', '2026Q4',
#       '2027Q1', '2027Q2', '2027Q3', '2027Q4', '2028Q1', '2028Q2', '2028Q3', '2028Q4',
#       '2028Q1', '2028Q2', '2028Q3', '2028Q4', '2029Q1', '2029Q2', '2029Q3', '2029Q4',
#       '2030Q1', '2030Q2', '2030Q3', '2030Q4']
n_iter = 48
for it in range(n_iter):
  ax.cla()
  plot_canada_map(pred_d, it = it, ax = ax)
  plt.pause(0.01)