### Whiteboarding Follow Ups

- The price of a 10 Gbps circuit is going to be $2500 per month - which districts are already spending at least that much?

In [1]:
import numpy as np
import os
import psycopg2
import pandas as pd

In [2]:
# Connection settings for the DAR database, plus the GITHUB value, are read
# from environment variables. A variable that is not set yields None.
HOST_DAR = os.getenv("HOST_DAR")
USER_DAR = os.getenv("USER_DAR")
PASSWORD_DAR = os.getenv("PASSWORD_DAR")
DB_DAR = os.getenv("DB_DAR")
PORT_DAR = os.getenv("PORT_DAR")
GITHUB = os.getenv("GITHUB")

In [8]:
# Open a connection to the DB using the settings read from the environment.
myConnection = psycopg2.connect(host=HOST_DAR,
                                user=USER_DAR,
                                password=PASSWORD_DAR,
                                database=DB_DAR,
                                port=PORT_DAR)

# One row per district for funding year 2019: district attributes joined to
# the bandwidth/cost table and the fit-for-analysis table on
# (district_id, funding_year).
sql_query = """SELECT
  d.district_id,
  d.funding_year,
  d.in_universe,
  d.district_type,
  d.state_code,
  d.locale,
  d.size,
  d.num_students,
  dffa.fit_for_ia,
  dffa.fit_for_ia_cost,
  dbw.ia_bw_mbps_total,
  dbw.ia_monthly_cost_total,
  dbw.projected_bw_fy2018,
  dbw.meeting_2014_goal_no_oversub,
  dbw.meeting_2018_goal_oversub
FROM
  ps.districts d
  JOIN ps.districts_bw_cost dbw
  ON d.district_id = dbw.district_id
  AND d.funding_year = dbw.funding_year
  JOIN ps.districts_fit_for_analysis dffa
  ON d.district_id = dffa.district_id
  AND d.funding_year = dffa.funding_year
WHERE d.funding_year = 2019

"""

# Pull the district rows from the DB. The cursor is closed by the `with`
# block and the connection is closed in `finally`, so neither is leaked if
# the query fails or the cell is re-run.
try:
    with myConnection.cursor() as cur:
        cur.execute(sql_query)
        # cur.description holds one entry per result column; item 0 is its name
        names = [x[0] for x in cur.description]
        rows = cur.fetchall()
finally:
    myConnection.close()

df = pd.DataFrame(rows, columns=names)

In [9]:
# Preview the first five rows of the query result.
df.head(n=5)

Unnamed: 0,district_id,funding_year,in_universe,district_type,state_code,locale,size,num_students,fit_for_ia,fit_for_ia_cost,ia_bw_mbps_total,ia_monthly_cost_total,projected_bw_fy2018,meeting_2014_goal_no_oversub,meeting_2018_goal_oversub
0,881421,2019,True,Traditional,AL,Rural,Medium,5687,True,True,2000,5284.84,3980.9,True,False
1,881422,2019,True,Traditional,CO,Rural,Small,194,True,True,1000,1748.8406788102325,194.0,True,True
2,881423,2019,True,Traditional,AR,Rural,Small,1319,True,True,2000,4273.410994559036,1319.0,True,True
3,881424,2019,True,Traditional,AZ,Urban,Medium,5387,True,True,1000,4946.0,3770.9,True,False
4,881427,2019,True,Traditional,AL,Urban,Large,13938,True,True,10000,612.5,9756.6,True,True


### The price of a 10 Gbps circuit is going to be $2500 per month - who’s already spending at least that?

In [11]:
# Cast the count, bandwidth and cost columns to float, one column at a time.
numeric_cols = ['num_students', 'ia_bw_mbps_total', 'ia_monthly_cost_total','projected_bw_fy2018']
for col in numeric_cols:
    df[col] = df[col].astype(float)

In [25]:
# masks/filters (boolean Series aligned to df's index)
# usual filters
mask_traditional = df.district_type == 'Traditional'
mask_fit_ia = df.fit_for_ia == True
mask_fit_cost = df.fit_for_ia_cost == True
mask_in_universe = df.in_universe == True

# meeting goal filters (based on the meeting_2018_goal_oversub flag)
mask_not_meeting = df.meeting_2018_goal_oversub == False
mask_meeting = df.meeting_2018_goal_oversub == True

# mrc (ia_monthly_cost_total) at $2500 or greater
MRC_THRESHOLD = 2500
mask_2500_or_greater = df.ia_monthly_cost_total >= MRC_THRESHOLD

In [48]:
# Districts passing all of the usual filters form the sample; count unique
# district ids in the sample and in the subset at or above the $2500 mask.
mask_sample = mask_traditional & mask_fit_ia & mask_fit_cost & mask_in_universe
num_total_sample = df.loc[mask_sample, 'district_id'].nunique()
num_districts_2500_or_greater = df.loc[mask_sample & mask_2500_or_greater, 'district_id'].nunique()
pct_2500_or_greater = round((num_districts_2500_or_greater/num_total_sample)*100, 2)

print(f"Number of districts in sample: {num_total_sample}")
print(f"Number of districts in sample with MRC of $2500 or greater: {num_districts_2500_or_greater}")
print(f"Percent of districts at $2500 or greater: {pct_2500_or_greater}%")



Number of districts in sample: 12205
Number of districts in sample with MRC of $2500 or greater: 5032
Percent of districts at $2500 or greater: 41.23%


In [28]:
# Rows for sample districts (all usual filters) that also satisfy the
# $2500-or-greater monthly cost mask.
mask_sample_2500 = (mask_traditional
                    & mask_fit_ia
                    & mask_fit_cost
                    & mask_in_universe
                    & mask_2500_or_greater)
df_2500 = df.loc[mask_sample_2500]
df_2500.head(n=5)

Unnamed: 0,district_id,funding_year,in_universe,district_type,state_code,locale,size,num_students,fit_for_ia,fit_for_ia_cost,ia_bw_mbps_total,ia_monthly_cost_total,projected_bw_fy2018,meeting_2014_goal_no_oversub,meeting_2018_goal_oversub
0,881421,2019,True,Traditional,AL,Rural,Medium,5687.0,True,True,2000.0,5284.84,3980.9,True,False
2,881423,2019,True,Traditional,AR,Rural,Small,1319.0,True,True,2000.0,4273.410995,1319.0,True,True
3,881424,2019,True,Traditional,AZ,Urban,Medium,5387.0,True,True,1000.0,4946.0,3770.9,True,False
5,881428,2019,True,Traditional,AZ,Town,Small,2297.0,True,True,1000.0,2864.0,2297.0,True,False
6,881431,2019,True,Traditional,AL,Suburban,Medium,10440.0,True,True,7500.0,16497.22,7308.0,True,True


### Breakdown by Locale, Size

In [64]:
# Per-size breakdown of the $2500+ districts: district count, student total,
# and each size's share of districts and of students.
size_counts = df_2500['size'].value_counts()
size_students = df_2500.groupby('size').num_students.sum()

# value_counts() is ordered by count while groupby() is ordered by label, so
# the indexes being concatenated are not aligned. Passing sort=True makes the
# label ordering of the result explicit instead of relying on a default that
# differs between pandas versions (and triggers a FutureWarning on older ones).
df_size = pd.concat([size_counts,
                     size_students,
                     size_counts / df_2500['size'].shape[0],
                     size_students / df_2500.num_students.sum()],
                    axis=1,
                    sort=True)
# change column names
df_size.columns = ['district_count', 'num_students','district_pct', 'students_pct']
df_size

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,district_count,num_students,district_pct,students_pct
Large,636,11256420.0,0.126391,0.329962
Medium,1607,8303484.0,0.319356,0.243402
Mega,137,10933577.0,0.027226,0.320498
Small,2360,3509497.0,0.468998,0.102875
Tiny,292,111311.0,0.058029,0.003263


In [65]:
# Write the size breakdown to a CSV file in the working directory.
df_size.to_csv(path_or_buf="districts_mrc_2500_size.csv")

In [67]:
# Per-locale breakdown of the $2500+ districts: district count, student total,
# and each locale's share of districts and of students.
locale_counts = df_2500['locale'].value_counts()
locale_students = df_2500.groupby('locale').num_students.sum()

# value_counts() is ordered by count while groupby() is ordered by label, so
# the two indexes only line up when the counts happen to fall in label order.
# sort=True pins the label ordering of the result regardless of the data or
# the pandas version.
df_locale = pd.concat([locale_counts,
                       locale_students,
                       locale_counts / df_2500['locale'].shape[0],
                       locale_students / df_2500.num_students.sum()],
                      axis=1,
                      sort=True)
# change column names
df_locale.columns = ['district_count', 'num_students','district_pct', 'students_pct']
df_locale

Unnamed: 0_level_0,district_count,num_students,district_pct,students_pct
locale,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rural,1993,3963036.0,0.396065,0.116169
Suburban,1468,15453084.0,0.291733,0.45298
Town,1052,3152678.0,0.209062,0.092415
Urban,519,11545491.0,0.10314,0.338436


In [69]:
# Write the locale breakdown to a CSV file in the working directory.
df_locale.to_csv(path_or_buf="districts_mrc_2500_locale.csv")

In [70]:
# Split the $2500+ districts by the meeting_2018_goal_oversub flag and report
# each group's unique-district count and share of the $2500+ total.
is_meeting = df_2500.meeting_2018_goal_oversub == True
is_not_meeting = df_2500.meeting_2018_goal_oversub == False
num_meeting1mbps_2500 = df_2500.loc[is_meeting, 'district_id'].nunique()
num_notmeeting1mbps_2500 = df_2500.loc[is_not_meeting, 'district_id'].nunique()

pct_meeting = round((num_meeting1mbps_2500/num_districts_2500_or_greater)*100, 2)
pct_notmeeting = round((num_notmeeting1mbps_2500/num_districts_2500_or_greater)*100, 2)

print(f"Number of Districts Meeting 1 Mbps and MRC is $2500 or greater: {num_meeting1mbps_2500} ({pct_meeting}%)")
print(f"Number of Districts Not Meeting 1 Mbps and MRC is $2500 or greater: {num_notmeeting1mbps_2500} ({pct_notmeeting}%)")
# The two groups should add back up to the $2500+ district count.
print(f"Double check: {num_meeting1mbps_2500+num_notmeeting1mbps_2500}")

Number of Districts Meeting 1 Mbps and MRC is $2500 or greater: 1776 (35.29%)
Number of Districts Not Meeting 1 Mbps and MRC is $2500 or greater: 3256 (64.71%)
Double check: 5032
