In [1]:
# Dependencies and Setup
import pandas as pd
import os

In [2]:
# Location of the FCC area-table extract (December 2019), relative to this notebook
fcc_data = '../resources/fcc/fcc_area_table_december_2019_extract.csv'

# Read the extract into the base dataframe that the rest of the notebook transforms
fcc_df = pd.read_csv(filepath_or_buffer=fcc_data)
fcc_df.head()

Unnamed: 0,type,id,tech,urban_rural,tribal_non,speed,has_0,has_1,has_2,has_3more
0,county,6001,acfosw,R,N,0.2,0,3,0,13621
1,county,6001,acfosw,U,N,0.2,0,4,3,1657698
2,county,6003,acfosw,R,N,0.2,0,0,0,962
3,county,6003,acfosw,R,T,0.2,0,0,0,167
4,county,6005,acfosw,R,N,0.2,0,1,0,24969


In [3]:
# Inspect the dtypes pandas inferred on load; `id` is read as an integer here
# and is converted to a string in the following cell.
fcc_df.dtypes

type            object
id               int64
tech            object
urban_rural     object
tribal_non      object
speed          float64
has_0            int64
has_1            int64
has_2            int64
has_3more        int64
dtype: object

In [4]:
# Cast the county id to a string so it can be concatenated into a geo_id
fcc_df = fcc_df.astype({"id": "str"})
fcc_df.dtypes

type            object
id              object
tech            object
urban_rural     object
tribal_non      object
speed          float64
has_0            int64
has_1            int64
has_2            int64
has_3more        int64
dtype: object

In [5]:
# Add the full geo_id used to join with the census tables in SQL.
# The id was loaded as an integer, so any leading zero of the county code is
# lost (06001 is stored as 6001). Zero-pad the code to five digits rather than
# hard-coding a single "0", which would yield a wrong geo_id for any id that is
# not exactly four digits long. Existing four-digit ids produce the same value.
fcc_df["geo_id"] = "0500000US" + fcc_df["id"].astype(str).str.zfill(5)
fcc_df.head()

Unnamed: 0,type,id,tech,urban_rural,tribal_non,speed,has_0,has_1,has_2,has_3more,geo_id
0,county,6001,acfosw,R,N,0.2,0,3,0,13621,0500000US06001
1,county,6001,acfosw,U,N,0.2,0,4,3,1657698,0500000US06001
2,county,6003,acfosw,R,N,0.2,0,0,0,962,0500000US06003
3,county,6003,acfosw,R,T,0.2,0,0,0,167,0500000US06003
4,county,6005,acfosw,R,N,0.2,0,1,0,24969,0500000US06005


In [6]:
# The geo_id column supersedes `id`, and `type` is no longer needed: drop both
fcc_df = fcc_df.drop(columns=['id', 'type'])
fcc_df.head()

Unnamed: 0,tech,urban_rural,tribal_non,speed,has_0,has_1,has_2,has_3more,geo_id
0,acfosw,R,N,0.2,0,3,0,13621,0500000US06001
1,acfosw,U,N,0.2,0,4,3,1657698,0500000US06001
2,acfosw,R,N,0.2,0,0,0,962,0500000US06003
3,acfosw,R,T,0.2,0,0,0,167,0500000US06003
4,acfosw,R,N,0.2,0,1,0,24969,0500000US06005


In [7]:
# Reorganize columns so geo_id leads.
# An explicit copy is taken because later cells add columns to fcc_df; without
# it pandas treats fcc_df as a slice of the previous frame and emits
# SettingWithCopyWarning on those assignments.
fcc_df = fcc_df[
    ["geo_id", "tech", "urban_rural", "tribal_non", "speed", "has_0", "has_1", "has_2", "has_3more"]
].copy()
fcc_df.head()

Unnamed: 0,geo_id,tech,urban_rural,tribal_non,speed,has_0,has_1,has_2,has_3more
0,0500000US06001,acfosw,R,N,0.2,0,3,0,13621
1,0500000US06001,acfosw,U,N,0.2,0,4,3,1657698
2,0500000US06003,acfosw,R,N,0.2,0,0,0,962
3,0500000US06003,acfosw,R,T,0.2,0,0,0,167
4,0500000US06005,acfosw,R,N,0.2,0,1,0,24969


In [8]:
# has_access: population served by at least one provider (1, 2, or 3+ providers)
fcc_df["has_access"] = fcc_df["has_1"].add(fcc_df["has_2"]).add(fcc_df["has_3more"])
# total_pop: everyone in the row, i.e. the unserved population plus the served population
fcc_df["total_pop"] = fcc_df["has_0"].add(fcc_df["has_access"])
fcc_df.head()

Unnamed: 0,geo_id,tech,urban_rural,tribal_non,speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,acfosw,R,N,0.2,0,3,0,13621,13624,13624
1,0500000US06001,acfosw,U,N,0.2,0,4,3,1657698,1657705,1657705
2,0500000US06003,acfosw,R,N,0.2,0,0,0,962,962,962
3,0500000US06003,acfosw,R,T,0.2,0,0,0,167,167,167
4,0500000US06005,acfosw,R,N,0.2,0,1,0,24969,24970,24970


In [9]:
# Give the technology and speed columns more descriptive names
column_renames = {'tech': 'broadband_tech', 'speed': 'dl_speed'}
fcc_df = fcc_df.rename(columns=column_renames)
fcc_df.head()

Unnamed: 0,geo_id,broadband_tech,urban_rural,tribal_non,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,acfosw,R,N,0.2,0,3,0,13621,13624,13624
1,0500000US06001,acfosw,U,N,0.2,0,4,3,1657698,1657705,1657705
2,0500000US06003,acfosw,R,N,0.2,0,0,0,962,962,962
3,0500000US06003,acfosw,R,T,0.2,0,0,0,167,167,167
4,0500000US06005,acfosw,R,N,0.2,0,1,0,24969,24970,24970


In [10]:
# Translate the technology codes in `broadband_tech` into readable labels
broadband_dict = {
    "acfosw": "All Broadband Connections",
    "a": "ADSL",
    "c": "Cable",
    "f": "Fiber",
    "o": "Other",
    "s": "Satellite",
    "w": "Wireless"
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and, under
# pandas Copy-on-Write, leaves fcc_df unchanged. Codes missing from the dict
# are kept as they are.
fcc_df['broadband_tech'] = fcc_df['broadband_tech'].replace(broadband_dict)
fcc_df.head()

Unnamed: 0,geo_id,broadband_tech,urban_rural,tribal_non,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,All Broadband Connections,R,N,0.2,0,3,0,13621,13624,13624
1,0500000US06001,All Broadband Connections,U,N,0.2,0,4,3,1657698,1657705,1657705
2,0500000US06003,All Broadband Connections,R,N,0.2,0,0,0,962,962,962
3,0500000US06003,All Broadband Connections,R,T,0.2,0,0,0,167,167,167
4,0500000US06005,All Broadband Connections,R,N,0.2,0,1,0,24969,24970,24970


In [11]:
# Sanity check: count the rows per technology label after the code-to-label replacement
fcc_df["broadband_tech"].value_counts()

Other                        1120
Satellite                    1120
Fiber                        1120
All Broadband Connections    1120
ADSL                         1120
Wireless                     1120
Cable                        1120
Name: broadband_tech, dtype: int64

In [12]:
# Condensed table: keep only the combined "All Broadband Connections" rows and
# discard the tribal / non-tribal flag (the urban / rural split is retained)
fcc_condensed = (
    fcc_df
    .loc[fcc_df["broadband_tech"].eq("All Broadband Connections")]
    .drop(columns=["tribal_non"])
)
fcc_condensed.head()

Unnamed: 0,geo_id,broadband_tech,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,All Broadband Connections,R,0.2,0,3,0,13621,13624,13624
1,0500000US06001,All Broadband Connections,U,0.2,0,4,3,1657698,1657705,1657705
2,0500000US06003,All Broadband Connections,R,0.2,0,0,0,962,962,962
3,0500000US06003,All Broadband Connections,R,0.2,0,0,0,167,167,167
4,0500000US06005,All Broadband Connections,R,0.2,0,1,0,24969,24970,24970


In [13]:
# Collapse duplicate urban/rural rows per county and speed using groupby
# (these duplicates are left over from dropping the tribal / non-tribal split).
# numeric_only=True restricts the sum to the population counts; without it,
# current pandas also "sums" the string column broadband_tech by concatenating
# its labels, which adds a meaningless column to the result.
fcc_condensed = fcc_condensed.groupby(["geo_id", "urban_rural", "dl_speed"]).sum(numeric_only=True)
fcc_condensed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,has_0,has_1,has_2,has_3more,has_access,total_pop
geo_id,urban_rural,dl_speed,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0500000US06001,R,0.2,0,3,0,13621,13624,13624
0500000US06001,R,4.0,0,3,648,12973,13624,13624
0500000US06001,R,10.0,0,3,648,12973,13624,13624
0500000US06001,R,25.0,0,3,937,12684,13624,13624
0500000US06001,R,100.0,4987,2611,3362,2664,8637,13624


In [14]:
# Move the group keys (geo_id, urban_rural, dl_speed) out of the index and back into columns
fcc_condensed = fcc_condensed.reset_index()
fcc_condensed.head()

Unnamed: 0,geo_id,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,R,0.2,0,3,0,13621,13624,13624
1,0500000US06001,R,4.0,0,3,648,12973,13624,13624
2,0500000US06001,R,10.0,0,3,648,12973,13624,13624
3,0500000US06001,R,25.0,0,3,937,12684,13624,13624
4,0500000US06001,R,100.0,4987,2611,3362,2664,8637,13624


In [15]:
# Share of the population with at least one provider (a 0-1 fraction, despite the column name)
fcc_condensed["total_access_pop_percent"] = fcc_condensed["has_access"].div(fcc_condensed["total_pop"])
fcc_condensed.head()

Unnamed: 0,geo_id,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
0,0500000US06001,R,0.2,0,3,0,13621,13624,13624,1.0
1,0500000US06001,R,4.0,0,3,648,12973,13624,13624,1.0
2,0500000US06001,R,10.0,0,3,648,12973,13624,13624,1.0
3,0500000US06001,R,25.0,0,3,937,12684,13624,13624,1.0
4,0500000US06001,R,100.0,4987,2611,3362,2664,8637,13624,0.633955


In [16]:
# For each county and urban/rural class, find the fastest download speed at
# which the whole population still has access:
#   1. keep only rows whose access fraction is exactly 1
#   2. take the maximum dl_speed per (geo_id, urban_rural)
#   3. turn the grouped result back into a flat dataframe
fcc_100ua_one = fcc_condensed[fcc_condensed["total_access_pop_percent"].eq(1)]
fcc_100ua_two = fcc_100ua_one.groupby(["geo_id", "urban_rural"])["dl_speed"].max()
fcc_100ua_three = fcc_100ua_two.reset_index()
fcc_100ua_three

Unnamed: 0,geo_id,urban_rural,dl_speed
0,0500000US06001,R,25.0
1,0500000US06001,U,25.0
2,0500000US06003,R,25.0
3,0500000US06005,R,25.0
4,0500000US06005,U,25.0
...,...,...,...
108,0500000US06111,U,25.0
109,0500000US06113,R,25.0
110,0500000US06113,U,25.0
111,0500000US06115,R,25.0


In [17]:
# Rows (county / urban-rural / speed) where less than 100% of the population has access
fcc_not100ua_one = fcc_condensed[fcc_condensed["total_access_pop_percent"].lt(1)]
fcc_not100ua_one

Unnamed: 0,geo_id,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
4,0500000US06001,R,100.0,4987,2611,3362,2664,8637,13624,0.633955
5,0500000US06001,R,250.0,7174,3845,2605,0,6450,13624,0.473429
6,0500000US06001,R,1000.0,11003,2621,0,0,2621,13624,0.192381
11,0500000US06001,U,100.0,11860,104419,411197,1130229,1645845,1657705,0.992846
12,0500000US06001,U,250.0,61933,688352,738690,168730,1595772,1657705,0.962639
...,...,...,...,...,...,...,...,...,...,...
782,0500000US06115,R,250.0,18832,4404,364,0,4768,23600,0.202034
783,0500000US06115,R,1000.0,22933,667,0,0,667,23600,0.028263
788,0500000US06115,U,100.0,336,27940,26220,572,54732,55068,0.993898
789,0500000US06115,U,250.0,541,52470,2057,0,54527,55068,0.990176


In [18]:
# Pull the full condensed row for each top 100%-access speed, then stack those
# rows with the below-100% rows. Slower speeds that also had 100% access are
# not carried over.
fcc_dropped_ur_one = fcc_100ua_three.merge(
    fcc_condensed,
    how="left",
    on=["geo_id", "urban_rural", "dl_speed"],
)
fcc_dropped_ur_two = [fcc_not100ua_one, fcc_dropped_ur_one]
fcc_dropped_ur_three = pd.concat(fcc_dropped_ur_two)
fcc_dropped_ur_three

Unnamed: 0,geo_id,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
4,0500000US06001,R,100.0,4987,2611,3362,2664,8637,13624,0.633955
5,0500000US06001,R,250.0,7174,3845,2605,0,6450,13624,0.473429
6,0500000US06001,R,1000.0,11003,2621,0,0,2621,13624,0.192381
11,0500000US06001,U,100.0,11860,104419,411197,1130229,1645845,1657705,0.992846
12,0500000US06001,U,250.0,61933,688352,738690,168730,1595772,1657705,0.962639
...,...,...,...,...,...,...,...,...,...,...
108,0500000US06111,U,25.0,0,1,1866,811160,813027,813027,1.000000
109,0500000US06113,R,25.0,0,1,7281,13110,20392,20392,1.000000
110,0500000US06113,U,25.0,0,0,5812,194296,200108,200108,1.000000
111,0500000US06115,R,25.0,0,0,343,23257,23600,23600,1.000000


In [19]:
# Order rows by county (ascending), then urban before rural, then highest access share first
fcc_dropped_ur_final = fcc_dropped_ur_three.sort_values(
    ['geo_id', 'urban_rural', 'total_access_pop_percent'],
    ascending=[True, False, False],
)
fcc_dropped_ur_final

Unnamed: 0,geo_id,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
1,0500000US06001,U,25.0,0,5,169,1657531,1657705,1657705,1.000000
11,0500000US06001,U,100.0,11860,104419,411197,1130229,1645845,1657705,0.992846
12,0500000US06001,U,250.0,61933,688352,738690,168730,1595772,1657705,0.962639
13,0500000US06001,U,1000.0,767327,804471,84097,1810,890378,1657705,0.537115
0,0500000US06001,R,25.0,0,3,937,12684,13624,13624,1.000000
...,...,...,...,...,...,...,...,...,...,...
790,0500000US06115,U,1000.0,53054,2014,0,0,2014,55068,0.036573
111,0500000US06115,R,25.0,0,0,343,23257,23600,23600,1.000000
781,0500000US06115,R,100.0,18644,4446,510,0,4956,23600,0.210000
782,0500000US06115,R,250.0,18832,4404,364,0,4768,23600,0.202034


In [20]:
# Urban-only view: keep the "U" rows, drop the now-constant urban_rural flag,
# and rename the access share so it can sit next to the rural one after merging
fcc_dropped_urban_one = fcc_dropped_ur_final[fcc_dropped_ur_final["urban_rural"].eq("U")]
fcc_dropped_urban_two = fcc_dropped_urban_one.drop("urban_rural", axis="columns")
fcc_dropped_urban_final = fcc_dropped_urban_two.rename(
    columns={'total_access_pop_percent': 'urban_access_pop_percent'}
)
fcc_dropped_urban_final.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,urban_access_pop_percent
1,0500000US06001,25.0,0,5,169,1657531,1657705,1657705,1.0
11,0500000US06001,100.0,11860,104419,411197,1130229,1645845,1657705,0.992846
12,0500000US06001,250.0,61933,688352,738690,168730,1595772,1657705,0.962639
13,0500000US06001,1000.0,767327,804471,84097,1810,890378,1657705,0.537115
4,0500000US06005,25.0,0,0,92,14676,14768,14768,1.0


In [21]:
# Rural-only view: keep the "R" rows, drop the now-constant urban_rural flag,
# and rename the access share so it can sit next to the urban one after merging
fcc_dropped_rural_one = fcc_dropped_ur_final[fcc_dropped_ur_final["urban_rural"].eq("R")]
fcc_dropped_rural_two = fcc_dropped_rural_one.drop("urban_rural", axis="columns")
fcc_dropped_rural_final = fcc_dropped_rural_two.rename(
    columns={'total_access_pop_percent': 'rural_access_pop_percent'}
)
fcc_dropped_rural_final.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,rural_access_pop_percent
0,0500000US06001,25.0,0,3,937,12684,13624,13624,1.0
4,0500000US06001,100.0,4987,2611,3362,2664,8637,13624,0.633955
5,0500000US06001,250.0,7174,3845,2605,0,6450,13624,0.473429
6,0500000US06001,1000.0,11003,2621,0,0,2621,13624,0.192381
2,0500000US06003,25.0,0,0,923,206,1129,1129,1.0


In [22]:
# County-level table: start from the condensed frame without the urban/rural
# flag and without the per-class access share (it is recomputed after re-aggregating)
fcc_condensed_total = fcc_condensed.drop(
    ["urban_rural", "total_access_pop_percent"], axis="columns"
)
fcc_condensed_total

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,0.2,0,3,0,13621,13624,13624
1,0500000US06001,4.0,0,3,648,12973,13624,13624
2,0500000US06001,10.0,0,3,648,12973,13624,13624
3,0500000US06001,25.0,0,3,937,12684,13624,13624
4,0500000US06001,100.0,4987,2611,3362,2664,8637,13624
...,...,...,...,...,...,...,...,...
786,0500000US06115,10.0,0,0,0,55068,55068,55068
787,0500000US06115,25.0,0,0,25,55043,55068,55068
788,0500000US06115,100.0,336,27940,26220,572,54732,55068
789,0500000US06115,250.0,541,52470,2057,0,54527,55068


In [23]:
# Add the urban and rural rows together so there is one row per county and speed
fcc_condensed_total = fcc_condensed_total.groupby(by=["geo_id", "dl_speed"]).agg("sum")
fcc_condensed_total.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,has_0,has_1,has_2,has_3more,has_access,total_pop
geo_id,dl_speed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0500000US06001,0.2,0,7,3,1671319,1671329,1671329
0500000US06001,4.0,0,8,817,1670504,1671329,1671329
0500000US06001,10.0,0,8,817,1670504,1671329,1671329
0500000US06001,25.0,0,8,1106,1670215,1671329,1671329
0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329


In [24]:
# Move the group keys (geo_id, dl_speed) out of the index and back into columns
fcc_condensed_total = fcc_condensed_total.reset_index()
fcc_condensed_total.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,0.2,0,7,3,1671319,1671329,1671329
1,0500000US06001,4.0,0,8,817,1670504,1671329,1671329
2,0500000US06001,10.0,0,8,817,1670504,1671329,1671329
3,0500000US06001,25.0,0,8,1106,1670215,1671329,1671329
4,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329


In [25]:
# County-wide share of the population with at least one provider (a 0-1 fraction)
fcc_condensed_total["total_access_pop_percent"] = fcc_condensed_total["has_access"].div(
    fcc_condensed_total["total_pop"]
)
fcc_condensed_total.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
0,0500000US06001,0.2,0,7,3,1671319,1671329,1671329,1.0
1,0500000US06001,4.0,0,8,817,1670504,1671329,1671329,1.0
2,0500000US06001,10.0,0,8,817,1670504,1671329,1671329,1.0
3,0500000US06001,25.0,0,8,1106,1670215,1671329,1671329,1.0
4,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,0.98992


In [26]:
# For each county (urban and rural combined), find the fastest download speed
# at which the whole population still has access:
#   1. keep only rows whose access fraction is exactly 1
#   2. take the maximum dl_speed per geo_id
#   3. turn the grouped result back into a flat dataframe
fcc_100ua_tot_one = fcc_condensed_total[fcc_condensed_total["total_access_pop_percent"].eq(1)]
fcc_100ua_tot_two = fcc_100ua_tot_one.groupby(["geo_id"])["dl_speed"].max()
fcc_100ua_tot_three = fcc_100ua_tot_two.reset_index()
fcc_100ua_tot_three.head()

Unnamed: 0,geo_id,dl_speed
0,0500000US06001,25.0
1,0500000US06003,25.0
2,0500000US06005,25.0
3,0500000US06007,25.0
4,0500000US06009,25.0


In [27]:
# County / speed rows where less than 100% of the population has access
fcc_not100ua_total_one = fcc_condensed_total[fcc_condensed_total["total_access_pop_percent"].lt(1)]
fcc_not100ua_total_one

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
4,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,0.989920
5,0500000US06001,250.0,69107,692197,741295,168730,1602222,1671329,0.958651
6,0500000US06001,1000.0,778330,807092,84097,1810,892999,1671329,0.534305
11,0500000US06003,100.0,997,130,2,0,132,1129,0.116918
12,0500000US06003,250.0,1100,29,0,0,29,1129,0.025686
...,...,...,...,...,...,...,...,...,...
397,0500000US06113,250.0,73307,118380,27999,814,147193,220500,0.667542
398,0500000US06113,1000.0,134185,68699,17616,0,86315,220500,0.391451
403,0500000US06115,100.0,18980,32386,26730,572,59688,78668,0.758733
404,0500000US06115,250.0,19373,56874,2421,0,59295,78668,0.753737


In [28]:
# Pull the full county-level row for each top 100%-access speed, then stack
# those rows with the below-100% rows. Slower speeds that also had 100% access
# are not carried over.
fcc_dropped_one = fcc_100ua_tot_three.merge(
    fcc_condensed_total,
    how="left",
    on=["geo_id", "dl_speed"],
)
fcc_dropped_two = [fcc_not100ua_total_one, fcc_dropped_one]
fcc_dropped_three = pd.concat(fcc_dropped_two)
fcc_dropped_three

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
4,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,0.989920
5,0500000US06001,250.0,69107,692197,741295,168730,1602222,1671329,0.958651
6,0500000US06001,1000.0,778330,807092,84097,1810,892999,1671329,0.534305
11,0500000US06003,100.0,997,130,2,0,132,1129,0.116918
12,0500000US06003,250.0,1100,29,0,0,29,1129,0.025686
...,...,...,...,...,...,...,...,...,...
53,0500000US06107,25.0,0,0,2406,463787,466193,466193,1.000000
54,0500000US06109,25.0,0,0,348,54130,54478,54478,1.000000
55,0500000US06111,25.0,0,3,12060,833943,846006,846006,1.000000
56,0500000US06113,25.0,0,1,13093,207406,220500,220500,1.000000


In [29]:
# Order rows by county (ascending), then highest access share first within each county
fcc_dropped_final = fcc_dropped_three.sort_values(
    ['geo_id', 'total_access_pop_percent'],
    ascending=[True, False],
)
fcc_dropped_final

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent
0,0500000US06001,25.0,0,8,1106,1670215,1671329,1671329,1.000000
4,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,0.989920
5,0500000US06001,250.0,69107,692197,741295,168730,1602222,1671329,0.958651
6,0500000US06001,1000.0,778330,807092,84097,1810,892999,1671329,0.534305
1,0500000US06003,25.0,0,0,923,206,1129,1129,1.000000
...,...,...,...,...,...,...,...,...,...
398,0500000US06113,1000.0,134185,68699,17616,0,86315,220500,0.391451
57,0500000US06115,25.0,0,0,368,78300,78668,78668,1.000000
403,0500000US06115,100.0,18980,32386,26730,572,59688,78668,0.758733
404,0500000US06115,250.0,19373,56874,2421,0,59295,78668,0.753737


In [30]:
# Urban vs rural population split by county.
# Only the 0.2 speed tier of the combined-technology rows is kept, so each
# population is counted once.
# NOTE(review): this assumes total_pop is identical across speed tiers - confirm.
urban_rural_by_county_one = fcc_df.drop("tribal_non", axis="columns")
urban_rural_by_county_two = urban_rural_by_county_one[
    urban_rural_by_county_one["dl_speed"].eq(0.2)
]
urban_rural_by_county_three = urban_rural_by_county_two[
    urban_rural_by_county_two["broadband_tech"].eq("All Broadband Connections")
]
urban_rural_by_county_three.head()

Unnamed: 0,geo_id,broadband_tech,urban_rural,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop
0,0500000US06001,All Broadband Connections,R,0.2,0,3,0,13621,13624,13624
1,0500000US06001,All Broadband Connections,U,0.2,0,4,3,1657698,1657705,1657705
2,0500000US06003,All Broadband Connections,R,0.2,0,0,0,962,962,962
3,0500000US06003,All Broadband Connections,R,0.2,0,0,0,167,167,167
4,0500000US06005,All Broadband Connections,R,0.2,0,1,0,24969,24970,24970


In [31]:
# Use groupby to consolidate the urban and rural population per county.
# total_pop is selected BEFORE aggregating so that only that column is summed;
# summing the whole frame first also aggregates the string column
# broadband_tech (concatenated on current pandas, dropped with a deprecation
# warning on older versions) only to throw the result away.
urban_rural_by_county_four = (
    urban_rural_by_county_three.groupby(["geo_id", "urban_rural"])["total_pop"].sum()
)
urban_rural_by_county_five = urban_rural_by_county_four.to_frame()
urban_rural_by_county_final = urban_rural_by_county_five.reset_index()
urban_rural_by_county_final.head()

Unnamed: 0,geo_id,urban_rural,total_pop
0,0500000US06001,R,13624
1,0500000US06001,U,1657705
2,0500000US06003,R,1129
3,0500000US06005,R,24983
4,0500000US06005,U,14768


In [32]:
# Urban population per county: keep the "U" rows, drop the flag, rename total_pop -> urban_pop
urban_total_pop_one = urban_rural_by_county_final[urban_rural_by_county_final["urban_rural"].eq("U")]
urban_total_pop_two = urban_total_pop_one.drop("urban_rural", axis="columns")
urban_total_pop_final = urban_total_pop_two.rename(columns={'total_pop': 'urban_pop'})
urban_total_pop_final.head()

Unnamed: 0,geo_id,urban_pop
1,0500000US06001,1657705
4,0500000US06005,14768
6,0500000US06007,172495
8,0500000US06009,11045
10,0500000US06011,14362


In [33]:
# Rural population per county: keep the "R" rows, drop the flag, rename total_pop -> rural_pop
rural_total_pop_one = urban_rural_by_county_final[urban_rural_by_county_final["urban_rural"].eq("R")]
rural_total_pop_two = rural_total_pop_one.drop("urban_rural", axis="columns")
rural_total_pop_final = rural_total_pop_two.rename(columns={'total_pop': 'rural_pop'})
rural_total_pop_final.head()

Unnamed: 0,geo_id,rural_pop
0,0500000US06001,13624
2,0500000US06003,1129
3,0500000US06005,24983
5,0500000US06007,46691
7,0500000US06009,34860


In [34]:
# Assemble the machine learning dataset: start from the county-level table and
# attach, in turn, the urban access share, the rural access share, the urban
# population and the rural population.
# NOTE(review): every join is an inner join (the pandas default), so a county
# or speed that lacks an urban or a rural row is dropped from the result -
# confirm this is intended before relying on county coverage.
merge_one = fcc_dropped_final.merge(
    fcc_dropped_urban_final[['geo_id', 'dl_speed', 'urban_access_pop_percent']],
    how='inner',
    on=['geo_id', 'dl_speed'],
)
merge_two = merge_one.merge(
    fcc_dropped_rural_final[['geo_id', 'dl_speed', 'rural_access_pop_percent']],
    how='inner',
    on=['geo_id', 'dl_speed'],
)
merge_three = merge_two.merge(
    urban_total_pop_final[['geo_id', 'urban_pop']],
    how='inner',
    on=['geo_id'],
)
fcc_ml_by_county = merge_three.merge(
    rural_total_pop_final[['geo_id', 'rural_pop']],
    how='inner',
    on=['geo_id'],
)
fcc_ml_by_county.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,total_access_pop_percent,urban_access_pop_percent,rural_access_pop_percent,urban_pop,rural_pop
0,0500000US06001,25.0,0,8,1106,1670215,1671329,1671329,1.0,1.0,1.0,1657705,13624
1,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,0.98992,0.992846,0.633955,1657705,13624
2,0500000US06001,250.0,69107,692197,741295,168730,1602222,1671329,0.958651,0.962639,0.473429,1657705,13624
3,0500000US06001,1000.0,778330,807092,84097,1810,892999,1671329,0.534305,0.537115,0.192381,1657705,13624
4,0500000US06005,25.0,0,1,957,38793,39751,39751,1.0,1.0,1.0,14768,24983


In [35]:
# Reorganize columns for the machine learning df: keys first, then provider
# counts, then populations, then the three access shares
ml_column_order = [
    "geo_id",
    "dl_speed",
    "has_0",
    "has_1",
    "has_2",
    "has_3more",
    "has_access",
    "total_pop",
    "urban_pop",
    "rural_pop",
    "total_access_pop_percent",
    "urban_access_pop_percent",
    "rural_access_pop_percent",
]
fcc_ml_by_county_final = fcc_ml_by_county.loc[:, ml_column_order]
fcc_ml_by_county_final.head()

Unnamed: 0,geo_id,dl_speed,has_0,has_1,has_2,has_3more,has_access,total_pop,urban_pop,rural_pop,total_access_pop_percent,urban_access_pop_percent,rural_access_pop_percent
0,0500000US06001,25.0,0,8,1106,1670215,1671329,1671329,1657705,13624,1.0,1.0,1.0
1,0500000US06001,100.0,16847,107030,414559,1132893,1654482,1671329,1657705,13624,0.98992,0.992846,0.633955
2,0500000US06001,250.0,69107,692197,741295,168730,1602222,1671329,1657705,13624,0.958651,0.962639,0.473429
3,0500000US06001,1000.0,778330,807092,84097,1810,892999,1671329,1657705,13624,0.534305,0.537115,0.192381
4,0500000US06005,25.0,0,1,957,38793,39751,39751,14768,24983,1.0,1.0,1.0


In [36]:
# Create the by-county target summary df and rename dl_speed.
# rename() without inplace returns a new frame. Plain assignment followed by an
# in-place rename would only alias fcc_100ua_tot_three and rename ITS column as
# well, so re-running the earlier merge on ["geo_id", "dl_speed"] would fail.
ml_summary_one = fcc_100ua_tot_three.rename(columns={'dl_speed': 'dl_speed_100_ua'})
ml_summary_one.head()

Unnamed: 0,geo_id,dl_speed_100_ua
0,0500000US06001,25.0
1,0500000US06003,25.0
2,0500000US06005,25.0
3,0500000US06007,25.0
4,0500000US06009,25.0


In [37]:
# Create the next download speed column: look up the tier directly above each
# county's top 100%-access speed. Speeds with no entry in the table (e.g. the
# 1000 tier, which has nothing above it) are left as NaN.
next_speed_tier = {
    0.2: 4.0,
    4.0: 10.0,
    10.0: 25.0,
    25.0: 100.0,
    100.0: 250.0,
    250.0: 1000.0,
}
# astype keeps the column float64 even when every row finds a match
ml_summary_one['dl_speed'] = ml_summary_one['dl_speed_100_ua'].map(next_speed_tier).astype('float64')
ml_summary_one.head()

Unnamed: 0,geo_id,dl_speed_100_ua,dl_speed
0,0500000US06001,25.0,100.0
1,0500000US06003,25.0,100.0
2,0500000US06005,25.0,100.0
3,0500000US06007,25.0,100.0
4,0500000US06009,25.0,100.0


In [38]:
# Check the dtypes of the ml summary df; `dl_speed` is one of the keys used to
# merge with fcc_dropped_final in the following cell.
ml_summary_one.dtypes

geo_id              object
dl_speed_100_ua    float64
dl_speed           float64
dtype: object

In [39]:
# Look up the access share at each county's next speed tier, then rename the
# merged columns to say that they describe the "level up" tier
ml_summary_final = ml_summary_one.merge(
    fcc_dropped_final[['geo_id', 'dl_speed', 'total_access_pop_percent']],
    on=['geo_id', 'dl_speed'],
).rename(
    columns={
        'dl_speed': 'dl_speed_level_up',
        'total_access_pop_percent': 'level_up_access_pop_percent',
    }
)
ml_summary_final.head()

Unnamed: 0,geo_id,dl_speed_100_ua,dl_speed_level_up,level_up_access_pop_percent
0,0500000US06001,25.0,100.0,0.98992
1,0500000US06003,25.0,100.0,0.116918
2,0500000US06005,25.0,100.0,0.325451
3,0500000US06007,25.0,100.0,0.878172
4,0500000US06009,25.0,100.0,0.812635


In [40]:
# Export the final dataframes as CSV files (row index omitted), in the order listed
csv_exports = {
    '../resources/fcc/urban_rural_by_county.csv': urban_rural_by_county_final,
    # '../resources/fcc/fcc_area_dec_2019.csv': fcc_df,   (export currently disabled)
    '../resources/fcc/fcc_ml_by_county.csv': fcc_ml_by_county_final,
    '../resources/fcc/fcc_ml_summary_final.csv': ml_summary_final,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)