# Load libraries and define funtions

In [22]:
import sys
from google.colab import drive
drive.mount('/content/gdrive')
colab_dir = '/content/gdrive/My Drive/Colab_files/2022_CCS_case_study'
sys.path.append(colab_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [23]:
%matplotlib inline
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
plt.style.use('seaborn')
import pandas as pd
import numpy as np

In [97]:
def get_stat_lon(in_df, in_group_list, kb_to_mb = True):
  # get longitudinal summary data grouping by in_group_col 
  if kb_to_mb:
    in_df['avg_d_mbps'] = in_df['avg_d_kbps'] / 1000
    in_df['avg_u_mbps'] = in_df['avg_u_kbps'] / 1000

  county_stats = (
    in_df.groupby(in_group_list)
    .apply(
        lambda x: pd.Series(
            {"avg_d_mbps_wt": np.average(x["avg_d_mbps"], weights=x["tests"]),
             "avg_u_mbps_wt": np.average(x["avg_u_mbps"], weights=x["tests"]),
             "avg_lat_ms": np.average(x["avg_lat_ms"], weights=x["tests"])}
        )
    )
    .reset_index()
    .merge(
        in_df.groupby(in_group_list)
        .agg(tests=("tests", "sum"), DA_POP=("DA_POP", "sum"))
        .reset_index(),
        on=in_group_list,
    )
  )
  return county_stats

def get_speed_info(source, ftr_date, colnames, toFind):
  colnames = list(colnames)
  toFind = list(toFind)
  out = source[source[colnames].eq(toFind).all(axis=1)]
  out = out[out['date']== ftr_date]
  return out

# Read data for the case study challeng


*   define rural areas: rows with NaN for either PCUID, PCNAME, PCTYPE, or PCCLASS (Population centre ID, Population centre name, Population centre type, Population centre size class).
*   Get summary stats: count # test per area (Provinces, Census divisions, dissemination areas) to choose resolution for a model
*   Remark: tile < Dissemination area < Census division < Provinces

In [25]:
df = pd.read_csv(colab_dir+'/ookla-canada-speed-tiles.csv')
print(df.columns)
print(df.shape)

Index(['quadkey', 'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'devices',
       'year', 'quarter', 'conn_type', 'PRUID', 'PRNAME', 'CDUID', 'CDNAME',
       'DAUID', 'SACTYPE', 'DA_POP', 'PCUID', 'PCNAME', 'PCTYPE', 'PCCLASS',
       'geometry'],
      dtype='object')
(2751464, 21)


In [25]:
print(pd.unique(df.PRNAME))
print("# provinces: ", pd.unique(df.PRUID).size)
print("# census divisions: ", pd.unique(df.CDUID).size)
print("# dissimilation areas: ", pd.unique(df.DAUID).size)
print("# quadkeys: ", pd.unique(df.quadkey).size)
print(pd.unique(df.quarter))
print("unique SACTYPE: ", pd.unique(df.SACTYPE))
print("unique PCCLASS: ", pd.unique(df.PCCLASS))
print(pd.unique(df.year))
print(pd.unique(df.conn_type))

['Northwest Territories / Territoires du Nord-Ouest' 'Yukon'
 'British Columbia / Colombie-Britannique' 'Alberta' 'Saskatchewan'
 'Manitoba' 'Ontario'
 'Newfoundland and Labrador / Terre-Neuve-et-Labrador' 'Quebec / Québec'
 'New Brunswick / Nouveau-Brunswick' 'Nova Scotia / Nouvelle-Écosse'
 'Prince Edward Island / Île-du-Prince-Édouard' 'Nunavut']
# provinces:  13
# census divisions:  293
# dissimilation areas:  44062
# quadkeys:  574971
['Q1' 'Q2' 'Q3' 'Q4']
unique SACTYPE:  [8 3 6 7 5 4 2 1]
unique PCCLASS:  [nan  2.  3.  4.]
[2019 2020 2021]
['fixed' 'mobile']


In [26]:
#define rural areas
df['is_rural'] = np.where(df.PCTYPE.isna(), True, False)
df['date'] = pd.to_datetime(df.year.map(str) + "-" + df.quarter.map(str))

In [6]:
df.head(4)

Unnamed: 0,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,year,quarter,conn_type,PRUID,...,DAUID,SACTYPE,DA_POP,PCUID,PCNAME,PCTYPE,PCCLASS,geometry,is_rural,date
0,23331133131332,11910,1408,27,1,1,2019,Q1,fixed,61,...,61010033,8,590.0,,,,,"POLYGON ((4593360.869162522 4089469.903958718,...",True,2019-01-01
1,23331133133011,14969,1554,25,1,1,2019,Q1,fixed,61,...,61010033,8,590.0,,,,,"POLYGON ((4592705.709263269 4089714.237789153,...",True,2019-01-01
2,32202103303220,5038,1317,54,1,1,2019,Q1,fixed,61,...,61010054,8,330.0,,,,,"POLYGON ((4736491.486114961 4146142.702105635,...",True,2019-01-01
3,32220031120102,13419,6169,50,4,1,2019,Q1,fixed,61,...,61010045,8,275.0,,,,,"POLYGON ((4642710.684221942 4070147.11608694, ...",True,2019-01-01


# Aggregate by Census division, SACTYPE, rural/urban, fixed/mobile

In [135]:
# too many missing values. Use census divisions for aggregation
tests_stat = get_stat_lon(df, ['date','PRUID', 'CDUID', 'SACTYPE', 'is_rural','conn_type'])
# save dataset without imputation
tests_stat.to_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type.csv',index=False)
print(tests_stat.shape)
tests_stat.head(10) #27 sec

(31259, 11)


Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP
0,2019-01-01,10,1001,1,False,fixed,110.345156,72.589665,10.505591,6618,550285.0
1,2019-01-01,10,1001,1,False,mobile,92.795777,28.8362,65.107692,130,43085.0
2,2019-01-01,10,1001,1,True,fixed,133.45302,93.332288,7.532551,937,180645.0
3,2019-01-01,10,1001,1,True,mobile,105.861625,44.6315,67.6875,16,6075.0
4,2019-01-01,10,1001,3,False,fixed,107.054376,44.847764,11.644,250,17930.0
5,2019-01-01,10,1001,3,False,mobile,116.633,12.219,83.5,2,1540.0
6,2019-01-01,10,1001,3,True,fixed,117.389714,61.877143,9.238095,21,6915.0
7,2019-01-01,10,1001,3,True,mobile,86.814,5.083,58.0,2,615.0
8,2019-01-01,10,1001,4,False,fixed,6.694,0.931333,11.333333,3,960.0
9,2019-01-01,10,1001,4,True,fixed,78.227453,19.741149,48.430939,181,36570.0


In [137]:
tests_stat = pd.read_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type.csv')
tests_stat.head(1)

Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP
0,2019-01-01,10,1001,1,False,fixed,110.345156,72.589665,10.505591,6618,550285.0


In [138]:
n_models = tests_stat.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
print(n_models.shape)
n_models.head()

(2771, 6)


Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count
0,10,1001,1,False,fixed,12
1,10,1001,1,False,mobile,12
2,10,1001,1,True,fixed,12
3,10,1001,1,True,mobile,12
4,10,1001,3,False,fixed,12


In [139]:
# all models with less than 12 rows are incomplete. Models with all observations: 2319 out of 2771
n_models['count'].value_counts()

12    2319
11     107
10      71
9       57
8       35
7       34
4       33
1       31
6       23
5       23
3       21
2       17
Name: count, dtype: int64

In [140]:
dates = sorted(pd.unique(tests_stat.date), reverse = False)

# Impute data.

*   Use previous observation for imputation if up to 2 time points missed for a model
*   Use agregated data from the same CD and rural/urban status to impute 



In [141]:
# Use previous observation for imputation if 
# - there are up to 2 missed obersvations
# - previous or next is observed (not the first two or last two)

# add rows to dataset for incomplete models
import tqdm
imp_step1 = tests_stat.copy()
imp_step1['imputed'] = False
print(imp_step1.shape)

# subset of incomplete models
inc_models = n_models[(n_models['count']<12) & (n_models['count']>9)] # 178 models
print('# of incomplete model: ',inc_models.shape)

for index, row in tqdm.tqdm(inc_models.iterrows()):
  m_measured = imp_step1[(imp_step1.CDUID == row.CDUID) &
                   (imp_step1.SACTYPE == row.SACTYPE) & 
                    (imp_step1.is_rural == row.is_rural) & 
                    (imp_step1.conn_type == row.conn_type)].copy()
                   
  for i, val in enumerate(dates):
    r = m_measured[m_measured.date == val]

    if val == dates[-1]:
      # for the last datapoint look at the next point
      r_measured = m_measured[m_measured.date == dates[i-1]]
    else:
      r_measured = m_measured[m_measured.date == dates[i+1]]
    
    if (r.shape[0]==0) & (r_measured.shape[0]==1):
      new_raw =  r_measured
      new_raw['date'] = val
      new_raw['imputed'] = True
      imp_step1 = imp_step1.append(new_raw)

print(imp_step1.shape)

(31259, 12)
# of incomplete model:  (178, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
178it [00:04, 38.02it/s]

(31491, 12)





In [143]:
# check missing rate after the first step
n_models_imp = imp_step1.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
n_models_imp['count'].value_counts()

12    2480
9       57
8       35
7       34
4       33
1       31
6       23
5       23
3       21
11      17
2       17
Name: count, dtype: int64

In [184]:
# save imputed dataset after step1
imp_step1.to_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step1.csv', index=False)

In [185]:
imp_step1 = pd.read_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step1.csv')
imp_step1.head(1)

Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP,imputed
0,2019-01-01,10,1001,1,False,fixed,110.345156,72.589665,10.505591,6618,550285.0,False


In [30]:
# 2. Use information about similar areas for imputation

# source1: use average across SACTYPE, but same province, CD, conn_type, rural/urban classification
imp_souce1 = get_stat_lon(df, ['date','PRUID', 'CDUID', 'is_rural','conn_type'])
n_models_imp1 = imp_souce1.groupby(['PRUID','CDUID','is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
print(n_models_imp1['count'].value_counts())

# source2: use average across conn_type, but same province, CD, SACTYPE, rural/urban classification
imp_souce2 = get_stat_lon(df, ['date','PRUID', 'CDUID', 'SACTYPE', 'is_rural'])
n_models_imp2 = imp_souce2.groupby(['PRUID','CDUID','SACTYPE', 'is_rural']).size().reset_index().rename(columns={0:'count'})
print(n_models_imp2['count'].value_counts())

# source3: use average across CD, but same province, SACTYPE, conn_type, rural/urban classification
imp_souce3 = get_stat_lon(df, ['date','PRUID', 'SACTYPE', 'is_rural','conn_type'])
n_models_imp3 = imp_souce3.groupby(['PRUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
print(n_models_imp3['count'].value_counts()) # 36 s

12    1099
11      11
9        7
10       7
8        4
2        3
1        2
4        2
5        1
6        1
Name: count, dtype: int64
12    1330
11      17
10      12
4        7
6        7
9        6
8        6
2        6
7        5
3        4
5        4
1        2
Name: count, dtype: int64
12    245
11      3
6       3
9       2
4       2
7       2
10      1
5       1
2       1
1       1
8       1
Name: count, dtype: int64


In [146]:
# test function get_speed_info for imputation
ftr_date = dates[1]
print(ftr_date)
inc_models = n_models_imp[n_models_imp['count']<12] # 285 models
m = inc_models.iloc[113]
print(m)
s1 = get_speed_info(imp_souce1, ftr_date, imp_souce1.columns[1:5], m[['PRUID', 'CDUID', 'is_rural', 'conn_type']])
s2 = get_speed_info(imp_souce2, ftr_date, imp_souce2.columns[1:5], m[['PRUID', 'CDUID', 'SACTYPE', 'is_rural']])
s3 = get_speed_info(imp_souce3, ftr_date, imp_souce3.columns[1:5], m[['PRUID', 'SACTYPE', 'is_rural', 'conn_type']])
s1 = s1.append(s2)
s1 = s1.append(s3)
s1 = s1.rename(columns={"avg_d_mbps_wt": "avg_d_mbps", "avg_u_mbps_wt": "avg_u_mbps"})
display(s1)
avg_imp = get_stat_lon(s1, ['date'], kb_to_mb = False)
display(avg_imp)

2019-04-01
PRUID           24
CDUID         2421
SACTYPE          7
is_rural      True
conn_type    fixed
count           11
Name: 603, dtype: object


Unnamed: 0,date,PRUID,CDUID,is_rural,conn_type,avg_d_mbps,avg_u_mbps,avg_lat_ms,tests,DA_POP,SACTYPE
1370,2019-04-01,24,2421.0,True,fixed,44.357474,13.616832,38.858557,707,92580.0,
1658,2019-04-01,24,2421.0,True,,15.904214,6.794714,78.964286,28,0.0,7.0
359,2019-04-01,24,,True,fixed,38.647969,23.675149,196.822474,1059,61460.0,7.0


Unnamed: 0,date,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP
0,2019-04-01,40.543061,19.44779,132.730769,1794,154040.0


In [154]:
# Use average from three higher level aggregation to impute 
imp_step2 = imp_step1.copy()
print(imp_step2.shape)

inc_models = n_models_imp[n_models_imp['count']<12] # 285 models
print('# of incomplete model: ',inc_models.shape)

for index, row in tqdm.tqdm(inc_models.iterrows()):
  m_measured = imp_step2[(imp_step2.CDUID == row.CDUID) &
                   (imp_step2.SACTYPE == row.SACTYPE) & 
                    (imp_step2.is_rural == row.is_rural) & 
                    (imp_step2.conn_type == row.conn_type)].copy()
                   
  for i, val in enumerate(dates):
    r = m_measured[m_measured.date == val]
    if (r.shape[0]==0):
      s = get_speed_info(imp_souce1, val, imp_souce1.columns[1:5], row[['PRUID', 'CDUID', 'is_rural', 'conn_type']])
      s = s.append(get_speed_info(imp_souce2, val, imp_souce2.columns[1:5], row[['PRUID', 'CDUID', 'SACTYPE', 'is_rural']]))
      s = s.append(get_speed_info(imp_souce3, val, imp_souce3.columns[1:5], row[['PRUID', 'SACTYPE', 'is_rural', 'conn_type']]))
      s = s.rename(columns={"avg_d_mbps_wt": "avg_d_mbps", "avg_u_mbps_wt": "avg_u_mbps"})

      if s.shape[0]==0:
        print('no info for: ')
        print(row)
        continue
      avg_imp = get_stat_lon(s, ['date'], kb_to_mb = False).iloc[0]
      imp_step2 = imp_step2.append({'date' : val, 'PRUID': row.PRUID, 'CDUID': row.CDUID,'SACTYPE': row.SACTYPE,
                                    'is_rural': row.is_rural,'conn_type': row.conn_type,
                                    'avg_d_mbps_wt': avg_imp.avg_d_mbps_wt, 'avg_u_mbps_wt':	avg_imp.avg_u_mbps_wt,
                                    'avg_lat_ms': avg_imp.avg_lat_ms, 'tests':	avg_imp.tests,'DA_POP': avg_imp.DA_POP,
                                    'imputed' : True} , 
                                    ignore_index=True)
  #break

print(imp_step2.shape)

(31491, 12)
# of incomplete model:  (291, 6)


284it [01:26,  4.21it/s]

no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural     False
conn_type    fixed
count            5
Name: 2763, dtype: object
no info for: 
PRUID            62
CDUID          6204
SACTYPE           8
is

286it [01:27,  4.02it/s]

no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural      True
conn_type    fixed
count            6
Name: 2765, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural      True
conn_type    fixed
count            6
Name: 2765, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural      True
conn_type    fixed
count            6
Name: 2765, dtype: object
no info for: 
PRUID           62
CDUID         6204
SACTYPE          8
is_rural      True
conn_type    fixed
count            6
Name: 2765, dtype: object
no info for: 
PRUID            62
CDUID          6204
SACTYPE           8
is_rural       True
conn_type    mobile
count             4
Name: 2766, dtype: object
no info for: 
PRUID            62
CDUID          6204
SACTYPE           8
is_rural       True
conn_type    mobile
count             4
Name: 2766, dtype: object
no info for: 
PRUID            62
CDUID          6204
SACTYPE   

287it [01:27,  3.55it/s]

no info for: 
PRUID           62
CDUID         6205
SACTYPE          8
is_rural     False
conn_type    fixed
count            1
Name: 2767, dtype: object
no info for: 
PRUID           62
CDUID         6205
SACTYPE          8
is_rural     False
conn_type    fixed
count            1
Name: 2767, dtype: object
no info for: 
PRUID           62
CDUID         6205
SACTYPE          8
is_rural     False
conn_type    fixed
count            1
Name: 2767, dtype: object
no info for: 
PRUID           62
CDUID         6205
SACTYPE          8
is_rural     False
conn_type    fixed
count            1
Name: 2767, dtype: object


288it [01:27,  2.92it/s]

no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural      False
conn_type    mobile
count             2
Name: 2768, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural      False
conn_type    mobile
count             2
Name: 2768, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural      False
conn_type    mobile
count             2
Name: 2768, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural      False
conn_type    mobile
count             2
Name: 2768, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural      False
conn_type    mobile
count             2
Name: 2768, dtype: object


289it [01:28,  2.79it/s]

no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object
no info for: 
PRUID            62
CDUID          6205
SACTYPE           8
is_rural       True
conn_type    mobile
count             2
Name: 2769, dtype: object


290it [01:28,  2.79it/s]

no info for: 
PRUID           62
CDUID         6208
SACTYPE          8
is_rural      True
conn_type    fixed
count            2
Name: 2770, dtype: object
no info for: 
PRUID           62
CDUID         6208
SACTYPE          8
is_rural      True
conn_type    fixed
count            2
Name: 2770, dtype: object
no info for: 
PRUID           62
CDUID         6208
SACTYPE          8
is_rural      True
conn_type    fixed
count            2
Name: 2770, dtype: object
no info for: 
PRUID           62
CDUID         6208
SACTYPE          8
is_rural      True
conn_type    fixed
count            2
Name: 2770, dtype: object


291it [01:29,  3.27it/s]

(33214, 12)





In [155]:
imp_step2.tail(2)

Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP,imputed
33212,2021-07-01,62,6208,8,True,fixed,7.61125,1.27065,15.7,20,19740.0,True
33213,2021-10-01,62,6208,8,True,fixed,14.778941,2.919882,47.647059,17,19645.0,True


In [156]:
# check missing rate after the first step
n_models_imp = imp_step2.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
n_models_imp['count'].value_counts()

12    2763
8        4
6        2
7        2
Name: count, dtype: int64

In [157]:
n_models_imp[n_models_imp['count']<12]

Unnamed: 0,PRUID,CDUID,SACTYPE,is_rural,conn_type,count
2763,62,6204,8,False,fixed,6
2764,62,6204,8,False,mobile,7
2765,62,6204,8,True,fixed,8
2766,62,6204,8,True,mobile,8
2767,62,6205,8,False,fixed,8
2768,62,6205,8,False,mobile,7
2769,62,6205,8,True,mobile,6
2770,62,6208,8,True,fixed,8


In [166]:
df[(df.CDUID == 6204) & (df.SACTYPE == 8)].iloc[: , 6:].head(2)

Unnamed: 0,year,quarter,conn_type,PRUID,PRNAME,CDUID,CDNAME,DAUID,SACTYPE,DA_POP,PCUID,PCNAME,PCTYPE,PCCLASS,geometry,is_rural,date,avg_d_mbps,avg_u_mbps
427791,2019,Q3,fixed,62,Nunavut,6204,Baffin,62040060,8,2320.0,306.0,Iqaluit,4.0,2.0,"POLYGON ((7292821.313010476 3241284.004613232,...",False,2019-07-01,39.944,45.873
588264,2019,Q4,fixed,62,Nunavut,6204,Baffin,62040059,8,60.0,,,,,"POLYGON ((6634139.673335976 5215852.982657169,...",True,2019-10-01,117.554,5.729


In [169]:
# sort by PRUID	CDUID	SACTYPE	is_rural	conn_type
imp_step2 = imp_step2.sort_values(['date', 'PRUID','CDUID','SACTYPE','is_rural','conn_type'])

In [170]:
imp_step2.tail(12)

Unnamed: 0,date,PRUID,CDUID,SACTYPE,is_rural,conn_type,avg_d_mbps_wt,avg_u_mbps_wt,avg_lat_ms,tests,DA_POP,imputed
31252,2021-10-01,61,6106,3,True,fixed,98.533908,13.174759,8.471264,87,9535.0,False
31253,2021-10-01,61,6106,3,True,mobile,24.35,4.938286,55.428571,7,1085.0,False
31254,2021-10-01,61,6106,8,True,fixed,17.605893,20.531357,84.071429,28,0.0,False
33181,2021-10-01,61,6106,8,True,mobile,22.294667,12.865727,66.878788,66,11600.0,True
31255,2021-10-01,62,6204,8,False,fixed,13.033437,1.963729,9.604167,48,51385.0,False
31256,2021-10-01,62,6204,8,False,mobile,28.588914,10.210486,34.771429,35,29800.0,False
31257,2021-10-01,62,6204,8,True,fixed,14.778941,2.919882,47.647059,17,19645.0,False
31258,2021-10-01,62,6204,8,True,mobile,85.233333,19.181833,15.0,6,7740.0,False
33198,2021-10-01,62,6205,8,False,fixed,13.033437,1.963729,9.604167,48,51385.0,True
33203,2021-10-01,62,6205,8,False,mobile,28.588914,10.210486,34.771429,35,29800.0,True


In [171]:
# save imputed dataset
imp_step2.to_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step2.csv', index=False)

In [None]:
imp_step2 = pd.read_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step2.csv')
imp_step2.head(1)

In [182]:
# for the rest 8 models in Nunavut lets use previous observation even though there are more than 2 missing observations
imp_step3 = imp_step2.copy()
print(imp_step3.shape)

# subset of incomplete models
n_models_imp = imp_step2.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
inc_models = n_models_imp[n_models_imp['count']<12] # 8 models
print('# of incomplete model: ',inc_models.shape)

for i in range(6):
  for index, row in tqdm.tqdm(inc_models.iterrows()):
    for i, val in enumerate(dates):
      m_measured = imp_step3[(imp_step3.CDUID == row.CDUID) &
                   (imp_step3.SACTYPE == row.SACTYPE) & 
                    (imp_step3.is_rural == row.is_rural) & 
                    (imp_step3.conn_type == row.conn_type)].copy()
      r = m_measured[m_measured.date == val]

      if val == dates[-1]:
        r_measured = m_measured[m_measured.date == dates[i-1]]
      else:
        r_measured = m_measured[m_measured.date == dates[i+1]]
    
      if (r.shape[0]==0) & (r_measured.shape[0]==1):
        new_raw =  r_measured
        new_raw['date'] = val
        new_raw['imputed'] = True
        imp_step3 = imp_step3.append(new_raw)
print(imp_step3.shape)

(33214, 12)
# of incomplete model:  (8, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
8it [00:00, 11.91it/s]
8it [00:00, 14.11it/s]
8it [00:00, 14.73it/s]
8it [00:00, 16.02it/s]
8it [00:00, 16.49it/s]
8it [00:00, 17.05it/s]

(33252, 12)





In [186]:
n_models_imp = imp_step3.groupby(['PRUID','CDUID','SACTYPE', 'is_rural','conn_type']).size().reset_index().rename(columns={0:'count'})
n_models_imp['count'].value_counts()

12    2771
Name: count, dtype: int64

In [187]:
# save imputed dataset
imp_step3.to_csv(colab_dir+'/canada_aggregation_CD_SACTYPE_isrural_conn_type_imputed_step3.csv', index=False)

## split into test and valid datasets

In [190]:
test = imp_step3[imp_step3.date < '2021-10-01']
valid = imp_step3[imp_step3.date == '2021-10-01']
test.to_csv(colab_dir+'/canada_test_imp.csv', index=False)
valid.to_csv(colab_dir+'/canada_valid_imp.csv', index=False)