In [3]:
!pip install geopandas --quiet
import os
import pandas as pd
import pickle
import geopandas as gpd

os.system('cls')

os.chdir('/content/drive/MyDrive/shared/ssc22-case-comp/dataset/')
print(os.getcwd())

/content/drive/MyDrive/shared/ssc22-case-comp/dataset


## 2022-03-22

Here, we want to prepare data for fitting mixed effects models. For example,

$$ Y_{d} = \alpha + \beta_1 X_1 + \beta_2 X_2 $$

where $Y_d$: download speed, $X_1$: time (fixed effect), $X_2$: dissemination area (categorical, random effect), $\beta_2 \sim N(\mu, \sigma)$.

We want to first test it on the Manitoba data.

In [11]:
# Load Manitoba data
file_path = './Manitoba/Manitoba-speed-tiles.shp'
mb_data = gpd.read_file(file_path)
mb_data.columns

Index(['quadkey', 'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'devices',
       'year', 'quarter', 'conn_type', 'PRUID', 'PRNAME', 'CDUID', 'CDNAME',
       'DAUID', 'SACTYPE', 'DA_POP', 'PCUID', 'PCNAME', 'PCTYPE', 'PCCLASS',
       'geometry'],
      dtype='object')

Create a subset of the data. We are going to use the categorical variables as our covariates, as well as some counts data. 

Target variables and descriptions:

* quadkey: tile identifier
* avg_d_kbps: download speed in kbps
* avg_u_kbps: upload speed in kbps
* avg_lat_ms: latency in ms
* devices: number of devices in each tile (counts)
* year: test year
* quarter: test quarter
* conn_type: connection type (binary: fixed, mobile)
* PRUID: province ID in two digits
* CDUID: census division ID
* DAUID: dissemination area ID
* SACTYPE: statistical classification code (categorical)
* DA_POP: dissemination area population (integer, NA)
* PCTYPE: population centre type (rural, small, medium, large)
* PCCLASS: population centre size class (small, medium, large)

i.e. we drop tests, PRNAME, CDNAME, PCUID, PCNAME and geometry from our data.

After the subsetting, We make the target columns factors.

In [25]:
# Combine 'year' and 'quarter' into 'time', and then make it a factor
mb_data['time'] = mb_data['year'] + mb_data['quarter']
mb_data['time_cat'] = pd.factorize(mb_data['time'], sort=True)[0]+1

# Drop unnecessary cols
drop_cols = ['year','quarter', 'time', 'tests', 'PRNAME', 'CDNAME', 'PCUID', 'PCNAME', 'geometry']
mb_sub1 = mb_data.drop(drop_cols, axis=1)

# Now we make the target cols as factors starting with 1
cat_cols = ['PRUID', 'CDUID', 'DAUID', 'PCTYPE', 'PCCLASS']

for c in cat_cols:
    new_cname = c[0:2] + '_cat'
    mb_sub1[new_cname] = pd.factorize(mb_sub1[c], sort=True)[0] + 1
    
mb_sub2 = mb_sub1.drop(cat_cols, axis=1)
mb_sub2.columns

# For now we only use conn_type='fixed'
mb_sub3 = mb_sub2[mb_sub2['conn_type']=='fixed'].drop('conn_type', axis=1)
mb_sub3.columns


Index(['quadkey', 'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'devices',
       'SACTYPE', 'DA_POP', 'time_cat', 'PR_cat', 'CD_cat', 'DA_cat',
       'PC_cat'],
      dtype='object')

In [26]:
# save data in .csv
out_path = './Manitoba/MB_cat.csv'
mb_sub2.to_csv(out_path, index=False, header=True, na_rep='NA')

In [27]:
mb_sub2.head()

Unnamed: 0,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,devices,conn_type,SACTYPE,DA_POP,time_cat,PR_cat,CD_cat,DA_cat,PC_cat
0,211322332323210,645,897,29,1,fixed,4,625.0,1,1,22,1784,0
1,211323223323002,29975,30659,11,3,fixed,7,,1,1,22,1759,0
2,211330100002331,6839,804,26,1,fixed,6,525.0,1,1,23,1814,0
3,211330100002333,6832,1029,28,1,fixed,6,525.0,1,1,23,1814,0
4,211330100003220,6820,1168,28,1,fixed,6,525.0,1,1,23,1814,0
