## Imports

In [53]:
import io
import re
import json
import string
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import matplotlib.colors as colors
from pandas.io.json import json_normalize

# graphing 
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
%matplotlib inline


## Install Imports then Import them!

In [16]:
# import folium if cant install then import
try:
    import folium
except:
    !pip install folium
    print("installed {}".format('folium'))
    import folium
    
# import wikipedia if cant install then import
try:
    import wikipedia as wp
except:
    !pip install wikipedia
    print("installed {}".format('wikipedia'))
    import wikipedia

# zip code stuff
try:
    import uszipcode
except:
    !pip install uszipcode
    print("installed {}".format('uszipcode'))
    import uszipcode
    
from uszipcode import Zipcode
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)

# graph imports 
try:
    import geopy 
    from geopy.geocoders import Nominatim
except:
    !pip install geopy
    print("installed {}".format('geopy'))
    import geopy 
    from geopy.geocoders import Nominatim

# learn imports
try:
    import seaborn as sns
except:
    !pip install seaborn
    print("installed {}".format('seaborn'))
    import seaborn as sns
    
from sklearn.cluster import KMeans


### Math imports 

In [194]:
from statistics import mode
from statistics import mean
from statistics import median

### Math Functions

In [204]:
# gets the median
def getMedian(val):
    out = median(val)
    return out

# gets the mode
def getMode(val):
    out = mode(val)
    return out

# gets the mean
def getMean(val):
    out = mean(val)
    return out

# get the average
def getAverage(val):
    count = len(val)-1
    added = 0
    for v in val:
        added += v
    out = added / count
    return out

In [206]:
test1 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

getMedian(test1)

8.5

### Declare City List

In [38]:
"""
Get the 4 surrounding counties and 4 of the biggest cities in the state to get
a model to train the data to predict the growth of population over than next 10 years
"""
# this city list is the live data
city_list = [
    'Baltimore',
    'Baltimore County',
    'Richmond',
    'Henrico County',
    'Phoenix',
    'Maricopa County',
    'Houston',
    'Harris County'
]

# this city list is the test and train data
city_list_other = {
    'Baltimore':{
        'Counties':['Prince Georges', 'Calvert'],
        'Counties_cords':[[38.83, -76.85], [38.53, -76.53]],
        'Counties_wiki_page':["Prince George's County, Maryland", 'Calvert County, Maryland'],
        'Counties_pop_table_num':[4, 3],
        'Cities':['Columbia', 'Germantown'],
        'Cities_cords':[[39.203611, -76.856944], [39.183333, -77.266667]],
        'Cities_wiki_page':['Columbia, Maryland', 'Germantown, Maryland'],
        'Cities_pop_table_num':[2, 2],
        'State': 'Maryland'
    },
    "Richmond":{
        'Counties':['Fairfax', 'Alexandria'],
        'Counties_cords':[[38.83, -77.28], [38.804722, -77.047222]],
        'Counties_wiki_page':['Fairfax County, Virginia', 'Alexandria, Virginia'],
        'Counties_pop_table_num':[6, 1],
        'Cities':['Virginia Beach', 'Norfolk'],
        'Cities_cords':[[36.85, -75.977778], [36.916667, -76.2]],
        'Cities_wiki_page':['Virginia Beach, Virginia', 'Norfolk, Virginia'],
        'Cities_pop_table_num':[3, 2],
        'State': 'Virginia'
    },
    "Phoenix":{
        'Counties':['La Paz', 'Yuma'],
        'Counties_cords':[[33.840278, -113.942778], [32.786944, -113.982778]],
        'Counties_wiki_page':['La Paz County, Arizona', 'Yuma County, Arizona'],
        'Counties_pop_table_num':[1, 3],
        'Cities':['Tucson', 'Mesa'],
        'Cities_cords':[[32.221667, -110.926389], [33.422222, -111.822778]],
        'Cities_wiki_page':['Tucson, Arizona', 'Mesa, Arizona'],
        'Cities_pop_table_num':[2, 4],
        'State': 'Arizona'
    },
    "Houston":{
        'Counties':['Brazoria', 'Montgomery'],
        'Counties_cords':[[29.17, -95.44], [30.3, -95.5]],
        'Counties_wiki_page':['Brazoria County, Texas', 'Montgomery County, Texas'],
        'Counties_pop_table_num':[1, 1],
        'Cities':['Dallas', 'Arlington'],
        'Cities_cords':[[32.779167, -96.808889], [32.705, -97.122778]],
        'Cities_wiki_page':['Dallas', 'Arlington, Texas'],
        'Cities_pop_table_num':[5, 2],
        'State': 'Texas'
    }
}

city_list_other = pd.DataFrame.from_dict(city_list_other) #, orient='index')

In [40]:
city_list_other['Baltimore'].keys()


Index(['Cities', 'Cities_cords', 'Cities_pop_table_num', 'Cities_wiki_page',
       'Counties', 'Counties_cords', 'Counties_pop_table_num',
       'Counties_wiki_page', 'State'],
      dtype='object')

### Get Testing Data Function

In [282]:
"""
wikiName = name of the wiki page, is a string
dataL = the data location of the table, is a int
"""
def getPop(wikiName, dataL):
    s = "1234567890.%" # this gets rid of weird chars
    s2 = '1234567890'
    html = wp.page(wikiName).html().encode('UTF-8')
    print('Got the {} page'.format(wikiName))
    
    df = pd.read_html(html)[dataL]
    print(df.keys())
    k = df.keys()
    
    cen = df['Historical population']['Census'] # census year
    pop = df['Historical population']['Pop.'] # population
    perc = df['Historical population']['%Â±'] # percent change
    
    # NDF = new dataframe
    NDF = {
        'Census': cen,
        '{}-pop'.format(wikiName): pop,
        'Percent': perc 
    }

    # convert to a better dataframe
    dataOut = pd.DataFrame(NDF, columns = ["Census", '{}-pop'.format(wikiName), "Percent"])
    
    # delete the first and last row
    dataOut = dataOut.drop([len(dataOut)-1])
    dataOut = dataOut.drop([0])

    printable = set(string.printable)
    
    for index, row in dataOut.iterrows():
        row['Percent'] = singleToFloat(''.join(filter(lambda x: x in printable, row['Percent'])))
        if type(row['Census']) != int:
            row['Census'] = row['Census'].strip('Est. ')
    
    return dataOut

# does the same thing as the for loop above but before its added to the DF
def stripAndFix(per):
    print(per)
    s = "1234567890.%" # this gets rid of weird chars
    printable = set(string.printable)
    
    out = []
    
    for p in per:
        out.append(''.join(filter(lambda x: x in printable, p)))
    return out

### Get the Testing Data: Baltimore

In [368]:
# list format == [[wiki_page, location], ... ]

ALL_DFs = [] # all dataframe will be added to this list.

t1 = city_list_other['Baltimore']['Counties_wiki_page']
t2 = city_list_other['Baltimore']['Counties_pop_table_num']
t3 = city_list_other['Baltimore']['Cities_wiki_page']
t4 = city_list_other['Baltimore']['Cities_pop_table_num']

ml =[
    [t1[0], t2[0]],
    [t1[1], t2[1]],
    [t3[0], t4[0]],
    [t3[1], t4[1]],
]

for m in ml:
    tDF = getPop(m[0],m[1])
    ALL_DFs.append(tDF)

# all population names will be wiki-page-pop

Got the Prince George's County, Maryland page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Calvert County, Maryland page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Columbia, Maryland page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Germantown, Maryland page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])


In [278]:
ALL_DFs[0]

Unnamed: 0,Census,"Prince George's County, Maryland-pop",Percent
1,1800,21175,0.008
2,1810,20589,0.028
3,1820,20216,0.018
4,1830,20474,0.013
5,1840,19539,0.046
6,1850,21549,0.103
7,1860,23327,0.083
8,1870,21138,0.094
9,1880,26451,0.251
10,1890,26080,0.014


### Get the Testing Data: Richmond

In [369]:
t1 = city_list_other['Richmond']['Counties_wiki_page']
t2 = city_list_other['Richmond']['Counties_pop_table_num']
t3 = city_list_other['Richmond']['Cities_wiki_page']
t4 = city_list_other['Richmond']['Cities_pop_table_num']

ml =[
    [t1[0], t2[0]],
    [t1[1], t2[1]],
    #[t3[0], t4[0]],
    [t3[1], t4[1]]
]

for m in ml:
    tDF = getPop(m[0],m[1])
    ALL_DFs.append(tDF)

# all population names will be wiki-page-pop
body = client_b8dd31888675478381adf80bd9d2a977.get_object(Bucket='battleofneighborhoodsrestaurants-donotdelete-pr-rr6vu0kgfnvrbj',Key='norfolk2_pop.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

norfolk = pd.read_csv(body)
ALL_DFs.append(norfolk)

Got the Fairfax County, Virginia page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Alexandria, Virginia page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Norfolk, Virginia page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])


### Get the Testing Data: Phoenix

In [370]:
t1 = city_list_other['Phoenix']['Counties_wiki_page']
t2 = city_list_other['Phoenix']['Counties_pop_table_num']
t3 = city_list_other['Phoenix']['Cities_wiki_page']
t4 = city_list_other['Phoenix']['Cities_pop_table_num']

ml =[
    [t1[0], t2[0]],
    [t1[1], t2[1]],
    [t3[0], t4[0]],
    [t3[1], t4[1]]
]

for m in ml:
    tDF = getPop(m[0],m[1])
    ALL_DFs.append(tDF)

# all population names will be wiki-page-pop

Got the La Paz County, Arizona page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Yuma County, Arizona page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Tucson, Arizona page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Mesa, Arizona page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])


### Get the Testing Data: Houston

In [374]:
t1 = city_list_other['Houston']['Counties_wiki_page']
t2 = city_list_other['Houston']['Counties_pop_table_num']
t3 = city_list_other['Houston']['Cities_wiki_page']
t4 = city_list_other['Houston']['Cities_pop_table_num']

ml =[
    [t1[0], t2[0]],
    [t1[1], t2[1]],
    [t3[0], t4[0]],
    [t3[1], t4[1]]
]

for m in ml:
    tDF = getPop(m[0],m[1])
    ALL_DFs.append(tDF)

# all population names will be wiki-page-pop

Got the Brazoria County, Texas page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Montgomery County, Texas page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Dallas page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])
Got the Arlington, Texas page
MultiIndex(levels=[['Historical population'], ['%Â±', 'Census', 'Pop.', 'Unnamed: 2_level_1']],
           codes=[[0, 0, 0, 0], [1, 2, 3, 0]])


### Clustering Functions

In [150]:
def makeCluster(cl):
    header = ['Census']
    hp2 = ['Percent1', "Percent2", 'Percent3', 'Percent4']
    keys = [] # takes the column name from the population
    keyObjects = [] # will take the list from the population dataframe
    difPer = [] # all the different percent changes
    
    for c in cl:
        tk = c.keys()
        header.append(tk[1])
        keys.append(tk[1])
        temppop = c[tk[1]] # this migh need to be changed to a index val
        tempper = c['Percent']
        keyObjects.append(temppop)
        difPer.append(tempper)
        
    for h in hp2:
        header.append(h)
        
    CL = {
        'Census': list(cl[0]['Census']), # this is uniform
        header[1]: keyObjects[0],
        header[2]: keyObjects[1],
        header[3]: keyObjects[2],
        header[4]: keyObjects[3],
        hp2[0]: difPer[0],
        hp2[1]: difPer[1],
        hp2[2]: difPer[2],
        hp2[3]: difPer[3]
    }
    
    CL = pd.DataFrame(CL, columns = header)
    return CL
    

### Clean the Data OLD

In [358]:
# this function just gets the common range || as a string
def makeYearRangeSTR():
    start = 1800
    end = 2020
    nums = []
    for i in range(start, end, 10):
        nums.append(str(i))
    nums.append('2019')
    return nums

# this function just gets the common range || as a int
def makeYearRangeINT():
    start = 1800
    end = 2020
    nums = []
    for i in range(start, end, 10):
        nums.append(i)
    nums.append('2019')
    return nums

def getListOfZeroI(): # ints
    s = 0
    e = 23
    z = []
    for i in range(s,e, 1):
        z.append(0)
    return z

def getListOfZeroS(): # strings
    s = 0
    e = 23
    z = []
    for i in range(s,e, 1):
        z.append(str(0))
    return z

def getListOfZeroF(): # float
    s = 0
    e = 23
    z = []
    for i in range(s,e, 1):
        z.append(float(0.0))
    return z

def turnToFloat(per): # turns the value to a decimial, list
    convert = []
    for p in per:
        p = p.strip('%')
        convert.append(float(p)/100)
    return convert # set this equal in a for loop

def singleToFloat(val): # turns the value to a decimial, single val
    val = val.strip('%')
    return float(val)/100

def getChangePer(per): # get the percent change
    index = []
    for i in range(1, len(per)-1):
        index.append(i)
        
    periods = 23 - len(per)
    per = {
        #'index': index,
        'Percent': per
    }
    per = pd.DataFrame(per, columns=['Percent'])
    print(per.pct_change(periods = periods, axis=1))

    
def addMissingYear(df): # does this in the Census column
    oldL = list(df['Census'])
    refY = makeYearRangeSTR()
    toAdd = np.setdiff1d(refY, oldL)
    
    # get index locations
    IL = []
    for a in list(oldL):
        IL.append(refY.index(a))
    
    #print(toAdd)
    #print(IL)
    return toAdd, IL, refY

def cleanData1(cl): # takes the data set and normalizes 
    yearRange = makeYearRangeSTR()
    header = ['Census']
    hp2 = ['Percent1', "Percent2", 'Percent3', 'Percent4']
    keys = [] # takes the column name from the population
    keyObjects = [] # will take the population dataframe
    difPer = [] # all the different percent changes
    yr = [] # this will take all the years that the census took place
    
    tk = cl.keys()
    for c in cl:
        
        #header.append(tk[1])
        #keys.append(tk[1])
        temppop = c[1] # this migh need to be changed to a index val
        tempper = c[2]
        yr.append(c[0])
        keyObjects.append(1)
        difPer.append(tempper)
    
    needsClean = False
    x = 0
    #for k1 in yr: # the population, k1 is a ( one )
    if len(yr) != len(yearRange):
        needsClean = True
        #print('index {} has errors\n {} |vs| {}'.format(x, len(yr), len(yearRange)))
        #print('\n the differs \n')
        #print(set(k1) - set(yearRange))
        #print([item for item in yearRange if item not in k1])
        #x += 1

    if needsClean == True:
        process1(cl)

def process1(df):
    toAdd, IL, refY = addMissingYear(df)
    szi = getListOfZeroI()
    szs = getListOfZeroS()
    szf = getListOfZeroF()
    
    keys = list(df.keys())
    print(type(df))
    pop = df[key[1]]
    per = df[k]
    
    l1 = szs
    l2 = szi # pop
    l3 = szf # percent
    
    x = 0
    for i in IL:
        l2[i] = pop[x]
        if type(per[x]) == float:
            l3[i] = singleToFloat(per[x])
        else:
            l3[i] = per[x]
        x += 1
    
    print(l2, '\n', l3)

In [207]:
print(len(makeYearRangeSTR()))

23


### Create Clusters 

In [371]:
# baltimore cluster
BC = ALL_DFs[0:1]
#BC.append(ALL_DFs[])

# richmand cluster
RC = ALL_DFs[4:7]

# phoenix cluster
PC = ALL_DFs[8:12]

# Houston cluster
HC = ALL_DFs[12:16]

In [376]:
x = 0
gd = [] # good data
addL = [0, 1, 4, 5, 6, 7, 9, 10, 12, 13, 15]
print(len(addL))
for a in ALL_DFs:
    #print('{} || {}'.format(x, len(a)))
    #print('\n\n\n')
    #print(a)
    if x in addL:
        gd.append(a)
    x += 1
    
print(len(gd))

11
11


### TESTING

In [None]:
t1 = ALL_DFs[0:4]
for t in t1:
    cleanData1[t]

In [359]:
#k = ALL_DFs[0].keys()
#BC[1]['Census']
#list(BC[0]['Census'])
#k[1]
#makeCluster(BC)
#BC[1]['Census']
#BC[0].head()
#for b in BC:
#    print(b)
#getChangePer(list(BC[2]['Census']))
#type(list(BC[1]['Census']))
#t = list(BC[2]['Percent'])
temp = ALL_DFs[2]
cleanData1(temp)
#temp
#toAdd, IL, refY = addMissingYear(temp)
#print(temp['Census'])
#temp['Census'] = refY
#addZeros(temp, IL, toAdd)
#IL

<class 'pandas.core.frame.DataFrame'>


KeyError: 1

### Load Baltimore Population

In [9]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_b8dd31888675478381adf80bd9d2a977 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='rEw1LED0Rb2Cxl_9pbLj1c8XzOq1v7pwSa_vOCrhYiVu',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_b8dd31888675478381adf80bd9d2a977.get_object(Bucket='battleofneighborhoodsrestaurants-donotdelete-pr-rr6vu0kgfnvrbj',Key='balt_pop.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

balt_pop = pd.read_csv(body)
#balt_pop.head()

### Load Richmond Population

In [10]:
body = client_b8dd31888675478381adf80bd9d2a977.get_object(Bucket='battleofneighborhoodsrestaurants-donotdelete-pr-rr6vu0kgfnvrbj',Key='rich_pop.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

rich_pop = pd.read_csv(body)
#rich_pop.head()

### Load Houston Population

In [12]:
body = client_b8dd31888675478381adf80bd9d2a977.get_object(Bucket='battleofneighborhoodsrestaurants-donotdelete-pr-rr6vu0kgfnvrbj',Key='hou_pop.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

hou_pop = pd.read_csv(body)
#hou_pop.head()

### Load Phoenix Population

In [14]:
body = client_b8dd31888675478381adf80bd9d2a977.get_object(Bucket='battleofneighborhoodsrestaurants-donotdelete-pr-rr6vu0kgfnvrbj',Key='phx_pop.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

phx_pop = pd.read_csv(body)
#phx_pop.head()