# Start by getting the county information

All of my data is linked by the FIPS identifying code for each county.  
We will be using the FIPS code for plotting, for identifying county boundaries, and as a de facto index across all datasets.

We will also be using the county name for additional identification and the census tract area for each county.
The census area is the surveyed area for census purposes (where people live) within the bounds of the county.

In [19]:
# Get the county fips codes
import pandas as pd
import pickle
import json
import os
import sys
import geopandas as gpd



us_county_path = 'datasets/county_boundaries.json'

# The raw boundary file is read as ISO-8859-1 and re-saved as UTF-8 before it
# is handed to geopandas.  A context manager closes the input file promptly
# instead of leaving the handle to the garbage collector.
with open(us_county_path, encoding='ISO-8859-1') as raw_file:
    cur_json = json.load(raw_file)

path, ext = os.path.splitext(us_county_path)
new_path = path + "_new" + ext

with open(new_path, "w", encoding='utf-8') as jsonfile:
    json.dump(cur_json, jsonfile, ensure_ascii=False)

us_county = gpd.read_file(new_path, driver='GeoJSON')

# The county fips code is the STATE code followed by the COUNTY code
us_county['fips'] = us_county['STATE'] + us_county['COUNTY']
# Keep only rows whose state code is below 57
us_county = us_county[us_county['STATE'].apply(int) < 57]


NameError: name 'load_county' is not defined

In [2]:
# Plain table of the county attributes used later: codes, name, census area, fips
county_columns = ['STATE', 'COUNTY', 'NAME', 'CENSUSAREA', 'fips']
us_county_df = pd.DataFrame(us_county[county_columns])
us_county_df.head()

Unnamed: 0,STATE,COUNTY,NAME,CENSUSAREA,fips
0,1,1,Autauga,594.436,1001
1,1,9,Blount,644.776,1009
2,1,17,Chambers,596.531,1017
3,1,21,Chilton,692.854,1021
4,1,33,Colbert,592.619,1033


## 2018 Midterm Election Data

The election data is taken from https://electionlab.mit.edu/data.
The data has results for every election, both state and national.  It includes the vote total for each candidate and the candidate's party.

We will use the fips code for each county to act as an index for the more than 3000 counties contained in the dataset.  Fips codes have a two-digit state id followed by a three-digit county id code within the state.  States are listed in alphabetical order.  The dataset also contains information on territories (Guam, NMI, PR, etc.) but that will be filtered out to focus on the contiguous United States and Hawaii.  The boroughs of Alaska are unfortunately not part of this dataset, although after looking at the frequency of stores in Alaska, it might not be useful to add to the model as many of these stores are not present or sparse in our 49th state.  District of Columbia is unfortunately not included.  Adding the Alaskan boroughs and DC would be a recommendation for further study.

In [7]:
# read in election results from csv
# There was an encoding error that prevented the dataset from importing properly.
# It was imported, written back as a csv and imported again without problems.
df = pd.read_csv('datasets/county_2018.csv', encoding='latin')

# Eliminates territories (state codes past Wyoming, the last state alphabetically)
df = df.loc[df['state_fips'].map(int) < 57]
df['state_fips'].unique()


array([ 1,  4,  5,  6,  8,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
       40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56])

### Narrow the election data 
I chose to go with only the US House races.  
Senators tend to have long term support statewide (in some states) and might influence the model.

Further investigation: you could look at all races cumulative to determine red or blue counties.
Many local elections are decided cross party, unlike congressional elections.


In [15]:
# remove all lines that are not republican or democrat
df.party.unique()  # the raw data lists many parties; only two are kept below
is_two_party = df['party'].isin(['republican', 'democrat'])
df2 = df[is_two_party]  # df2 is two party only
df2.party.value_counts()


# I would like to see if the model performs better on national elections.  
# Might show partisan lean better than state and local
all_offices = ['Associate Justice of the Supreme Court, Place 1',
       'Associate Justice of the Supreme Court, Place 2',
       'Associate Justice of the Supreme Court, Place 3',
       'Associate Justice of the Supreme Court, Place 4',
       'Attorney General', 'Chief Justice of the Supreme Court',
       'Commissioner of Agriculture and Industries', 'Governor',
       'Lieutenant Governor', 'Public Service Commission, Place 1',
       'Public Service Commission, Place 2', 'Secretary of State',
       'State Auditor', 'State Representative', 'State Senator',
       'State Treasurer', 'US Representative',
       'State Board of Education Member', 'Corporation Commissioner',
       'State Mine Inspector', 'Superintendant of Public Instruction',
       'US Senator', 'Auditor of State', 'Commissioner of State Lands',
       'State Senate', 'Board of Equalization Member', 'Controller',
       'Insurance Commissioner', 'State Assembly Member', 'Treasurer',
       'Governor/Lieutenant Governor',
       'Regent of the University of Colorado', 'Comptroller',
       'Governor and Lieutenant Governor', 'Secretary of the State',
       'Auditor of Accounts', 'Chief Financial officer',
       'Commissioner of Agriculture', 'State Attorney',
       'Commissioner of Insurance', 'Commissioner of Labor',
       'Public Service Commission, District 3 - Metro-Atlanta',
       'Public Service Commission, District 5 - Western',
       'State School Superintendent', 'State Controller',
       'State Representative A', 'State Representative B',
       'Superintendent of Public Instruction', 'Treasurer of State',
       'Secretary of Agriculture', 'Governor / Lt. Governor',
       'House of Delegates Member', 'Auditor', "Governor's Council",
       'Secretary of the Commonwealth',
       'Member of the State Board of Education',
       'Regent of the University of Michigan',
       'State Representative (Partial Term Ending 01/01/2019)',
       'State Senator (Partial Term Ending 01/01/2019)',
       'US Representative (Partial Term Ending 01/03/2019)',
       'Governor & Lt Governor', 'Auditor of Public Accounts',
       'Governor and Lt. Governor', 'Public Service Commissioner',
       'Executive Council', 'Commissioner of Public Lands',
       'Justice of the Supreme Court', 'Supreme Court Justice',
       'NC Supreme Court Associate Justice Seat 1',
       'Agriculture Commissioner', 'For Attorney General',
       'For Corporation Commissioner', 'For Insurance Commissioner',
       'State Auditor and Inspector', 'General Treasurer',
       'Comptroller General', 'State Superintendent of Education',
       'Commissioner School Public Lands',
       'Public Utilities Commissioner',
       'Commissioner of the General Land office',
       'Comptroller of Public Accounts',
       'Judge, Court of Criminal Appeals Place 7',
       'Judge, Court of Criminal Appeals Place 8',
       'Justice, Supreme Court, Place 2',
       'Justice, Supreme Court, Place 4',
       'Justice, Supreme Court, Place 6',
       'Presiding Judge, Court of Criminal Appeals',
       'Railroad Commissioner', 'Member, State Board of Education',
       'State Representative Pos. 1', 'State Representative Pos. 2',
       'State House Delegate', 'State Assembly Representative']

major_offices = ['Governor',
                 'US Representative',
                 'US Senator',
                 'Governor/Lieutenant Governor',
                 'Governor and Lieutenant Governor',
                 'Governor / Lt. Governor',
                 'US Representative (Partial Term Ending 01/03/2019)',
                 'Governor & Lt Governor', 
                 'Governor and Lt. Governor', 
                 'For Attorney General'
                ]

congress =      [
                 'US Representative',
                 'US Senator',
                 'US Representative (Partial Term Ending 01/03/2019)',
                ]

house = [
                 'US Representative',
                 'US Representative (Partial Term Ending 01/03/2019)',
        ]

# Choose which races feed the model by changing the list passed to isin():
# house, congress, major_offices or all_offices.  This cell uses all_offices.
df_temp = df2.copy()  # copy of the two-party frame taken before the office filter

# Keep only rows for the selected offices and, in the same step, drop the
# columns that are not needed afterwards.
office_selected = df2['office'].isin(all_offices)
df2 = df2.loc[
    office_selected,
    ['state', 'county', 'state_fips', 'party', 'candidatevotes', 'totalvotes', 'office'],
]

df2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 110492 entries, 6 to 177663
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   state           110492 non-null  object 
 1   county          110482 non-null  object 
 2   state_fips      110492 non-null  int64  
 3   party           110492 non-null  object 
 4   candidatevotes  110058 non-null  float64
 5   totalvotes      108089 non-null  float64
 6   office          110492 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 6.7+ MB


In [18]:
df.party.unique() # two party data only, there are lots of them 


array([nan, 'republican', 'democrat', 'libertarian', 'independent',
       'green', 'nonpartisan', 'no party preference', 'unity',
       'american constitution', 'approval voting', 'unaffiliated',
       'working families', 'amigo constitution liberty',
       'griebel frank for ct', 'petitioning candidate',
       'no party affiliation', 'reform party of florida',
       'libertarian party of florida', 'green party', 'constitution',
       'conservative', 'downstate united', 'clear water',
       'legal medical now', 'no party', 'green independent',
       'candid common sense', 'common sense independent',
       "people's unenrolled independent", 'independent for maine',
       'unenrolled', 'maine socialist party', 'green-rainbow',
       'massachusetts independent', 'second american revolution',
       'cooperative green economy', 'independent and veteran',
       'independent progressive', 'no  affiliation', 'us taxpayers',
       'natural law', 'working class', 'democratic-farme

In [17]:
# We decided to just count up all of the votes for each party in every election.
# If we chose to include local elections or major elections, this would
# weight congressional elections the same as local, but is aimed at getting a sense of how much
# the county leans red or blue without individual politicians affecting the categorization
#
# Only the two vote-count columns are summed.  Naming them explicitly keeps the
# text columns (state, office) out of the aggregation on every pandas version;
# a bare .sum() relies on older pandas silently dropping them.
vote_columns = ['candidatevotes', 'totalvotes']
df2_grouped = (
    df2.groupby(by=['state_fips', 'county', 'party'])[vote_columns]
    .sum()
    .reset_index()
)

# This drops out territories by chopping off everything after Wyoming
df2_grouped = df2_grouped[df2_grouped['state_fips'].apply(int) < 57]
df2_grouped.info()

# we are left with vote totals for GOP and Dem for each county


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6098 entries, 0 to 6097
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state_fips      6098 non-null   int64  
 1   county          6098 non-null   object 
 2   party           6098 non-null   object 
 3   candidatevotes  6098 non-null   float64
 4   totalvotes      6098 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 285.8+ KB


## Cleaning up the county data
County naming is not standardized.  There were lots of exceptions in the more than 3000 counties that had to be addressed.  The data in the us_county dataset was not named the same as election data.  Each .replace was a manual change to get the two datasets to conform.  The election data did not have FIPS data initially included.  That is unfortunate.

In [None]:
# Make a new col named fips which is initially filled with zeroes
df2_grouped_fips = df2_grouped.copy()
# Assign the scalar directly.  A freshly built pd.Series carries its own
# 0..n-1 index and is aligned on assignment, so any row whose label falls
# outside that range would receive NaN instead of 0.
df2_grouped_fips['fips'] = 0


# The code below ensures that county names match exactly so we can populate the fips column
# this will make fips a key used for election, county, and population datasets
# Each .replace() is applied in the order written; later replacements see the
# output of earlier ones, so the order must not be shuffled.
us_county_df['NAME'] = us_county_df['NAME'].apply(lambda x: x.upper())
us_county_df['NAME'] = us_county_df['NAME'].apply(lambda x: x.replace('.', '')
                                                              .replace("''", '')
                                                              .replace("DE WITT", 'DEWITT')
                                                              .replace('LA SALLE', 'LASALLE')
                                                              .replace("DE KALB", 'DEKALB')
                                                              .replace(" CITY", '')
                                                              .strip())




df2_grouped_fips['county'] = df2_grouped_fips['county'].apply(lambda x: x.upper())
df2_grouped_fips['county'] = df2_grouped_fips['county'].apply(lambda x: x.replace('COUNTY', '')
                                                                          .replace('.', '')
                                                                          .replace(' CITY', '')
                                                                          .replace('MEEER', 'MEEKER')
                                                                          .replace('JODAVIESS', 'JO DAVIESS')
                                                                          .replace('&', 'AND')
                                                                          .replace('DE WITT', 'DEWITT')
                                                                          .replace('DE KALB', 'DEKALB')
                                                                          .replace('LAC QUI PARTE', 'LAC QUI PARLE')
                                                                          .replace('OGLALA LAKOTA', 'OGLALA')
                                                                          .replace('CHENAGO', 'CHENANGO')
                                                                          .replace('LA SALLE', 'LASALLE')
                                                                          .replace('DONA ANA', 'DOÑA ANA')
                                                                          .strip())

# These are bad county names we dropped
# They include UOCAVA (overseas votes), Oglala Lakota and some other NON-COUNTY data
df2_grouped_fips = df2_grouped_fips.loc[~df2_grouped_fips['county'].isin(['STATE TOTALS', 
                                                                            'STATE UOCAVA', 
                                                                            'TOTAL VOTES BY CANDIDATE',
                                                                            'TOTAL VOTES BY PARTY',
                                                                            'FEDERAL PRECINCT',
                                                                            'KANSAS',
                                                                            'OGLALA',
                                                                        ])]

# Print both frames' column summaries before converting
us_county_df.info()
df2_grouped_fips.info()
# Force numeric values so STATE and fips compare as numbers against the election data
us_county_df["STATE"] = pd.to_numeric(us_county_df["STATE"])
us_county_df["fips"] = pd.to_numeric(us_county_df["fips"])




In [None]:
# I used this loop to go back and correct the county names between the two datasets
errors = 0  # track the dumped counties

# Build a (state, county name) -> fips lookup in one pass over the county
# table.  If a pair occurs more than once the later row overwrites the earlier
# one, i.e. the last matching row wins.
fips_by_state_and_name = {
    (county_state, county_name): county_fips
    for county_state, county_name, county_fips in zip(
        us_county_df['STATE'], us_county_df['NAME'], us_county_df['fips'])
}

# Address the target column by name instead of assuming it is the last column
fips_position = df2_grouped_fips.columns.get_loc('fips')

for i, (state_code, county_name) in enumerate(
        zip(df2_grouped_fips['state_fips'], df2_grouped_fips['county'])):
    state = int(state_code)  # get state number (1 to 57 numeric)
    county = county_name.strip()  # get county name
    fip = fips_by_state_and_name.get((state, county))
    if fip is None:
        # no county matched: print it out for troubleshooting and leave the
        # row's fips value untouched
        print(county, state, i)
        errors += 1
        print()
        continue
    # i is the row's position (from enumerate), so .iloc is the right accessor
    df2_grouped_fips.iloc[i, fips_position] = fip

print(errors)


In [None]:
# reset_index() gives a fresh 0..n-1 index, which the idxmax lookup below relies on
# (it failed at index 414 when the index had gaps)
df_final = df2_grouped_fips.copy().reset_index()


# get only highest (DEM or GOP)
# This eliminates the 'loser' of each county (this is aggregate votes, not individual elections)
winning_rows = df_final.groupby('fips')['candidatevotes'].idxmax()
df_final = df_final.loc[winning_rows]

df_final.describe()


# FINAL TALLY
# US HAS 3141 total counties.

# Missing counties
# DC has no counties.  Not sure how to handle that
# MISSING OGLALA LAKOTA county (Native American lands)
# MISSING the 19 ALASKA boroughs data
# MAY BE MISSING MORE COUNTY EQUIVALENTS

# Data is missing for Iowa US Congressional race, must use all offices instead.  bummer

In [None]:
#!pip install geopandas

# Create a county election map
County boundaries found at https://eric.clst.org/tech/usgeojson/
There was an encoding error which was fixed using instructions from the page.

In [None]:
# import pickle
# import json
# import os
# import geopandas as gpd

# us_county_path = 'datasets/county_boundaries.json'

# cur_json = json.load(open(us_county_path, encoding='ISO-8859-1'))
# path,ext = os.path.splitext(us_county_path)

# new_path =path+"_new"+ext
# with open(new_path,"w", encoding='utf-8') as jsonfile:
#     json.dump(cur_json,jsonfile,ensure_ascii=False)

# us_county = gpd.read_file(new_path, driver='GeoJSON')

# us_county['fips'] = us_county['STATE'] + us_county['COUNTY']


# type(us_county)


In [None]:
df2_final = df_final.copy()

# MAKE MY FIPS COMPATIBLE WITH THE GEODATA: zero-pad every code to five characters
df2_final['fips'] = df2_final['fips'].map('{:05}'.format)

# blue is 0 for republican rows and 1 for every other row
df2_final['blue'] = df2_final['party'].ne('republican').astype('int64')

In [None]:
df2_final.info()


In [None]:
# import folium
# import json
# import numpy as np

# center = [37.0373, -95.6164]
# bins = list(df2_final['candidatevotes'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1]))  # <<<<<<< ADD THIS LINE


# # Initialize Folium Map again (same as before)
# m = folium.Map(location=center, 
#                zoom_start=5,
#                tiles='Stamen Toner')


# # Create choropleth map  
# folium.Choropleth(
#     geo_data=us_county,
#     name='choropleth',
#     data=df2_final,
#     key_on='feature.properties.fips',
#     columns=['fips', 'blue'],
#     fill_color='Spectral',
#     fill_opacity=0.5,
#     nan_fill_opacity=0.5,
#     line_opacity=1,
#     legend_name='2018 Midterm Election',
#     us
    
# ).add_to(m)


# m.save('county_choropleth.html')

In [None]:
us_county.head()
#!pip install plotly

In [None]:
#us_county.to_file("datasets/counties_fixed.geojson", driver='GeoJSON')


In [None]:
# import json
# import plotly.express as px
# from urllib.request import urlopen


# # with open("datasets/counties_fixed.geojson") as f:
# #     us_county_fix = json.load(f)

# with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
#     us_county_fix = json.load(response)
    
# fig = px.choropleth(df2_final, 
#                     geojson=us_county_fix, 
#                     locations='fips', 
#                     color='party',
#                     color_discrete_sequence=px.colors.qualitative.Set1,
#                     #color_continuous_scale=["red", 'blue'],
#                     #range_color=(0, 1),
#                     scope="usa",
#                     hover_name='county'
#                     )
#fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
#fig.show()
          
# don't have fips in yet

In [None]:
df2_final.describe()
#df2_final.head()
#df2_final.columns


All datasets are stored in the /datasets folder which contains:
- academy.csv			
- county_boundaries_new.json	
- grainger.csv			
- rei.csv				
- walmart.pkl
- basspro.csv			
- county_fips_master.csv		
- harley.csv			
- starbucks.csv			
- wholefoods.csv
- chickfila.csv			
- crackerbarrel.pkl		
- hm.pkl				
- store_df.pkl
- counties_fixed.geojson		
- dicks.csv			
- hobbylobby.pkl			
- target.csv
- county_2018.csv			
- district_geojson.pkl		
- llbean.csv			
- tractorsupply.csv
- county_boundaries.json		
- dollartree.csv			
- potterybarn.csv			
- traderjoes.csv

In [None]:



# Add in the chain stores
# COMMENTED OUT CODE TO IMPORT STORES.  TRYING TO DO IT ONE AT A TIME
# Each dataset was a little bit different, so I decided not to automate with functions

# # WHOLE FOODS 
# wf_df = pd.read_csv('datasets/wholefoods.csv', encoding='latin')
# # wf_df.to_csv('datasets/wholefoods.csv')  # fixing the encoding issue
# wf_df['longlat'] = list(zip(wf_df['long'], wf_df['lat']))
# wf_df['points'] = wf_df['longlat'].apply(make_point)


# # TRACTOR SUPPLY
# ts_df = pd.read_csv('datasets/tractorsupply.csv')
# ts_df.info()
# ts_df = add_longlat(ts_df)
# ts_df.describe()


# # STARBUCKS
# sb_df = pd.read_csv('datasets/starbucks.csv')
# sb_df = sb_df.rename(columns={'Longitude': "long", "Latitude":'lat'})
# sb_df = sb_df[sb_df['Country']=='US']
# sb_df = add_longlat(sb_df)


# # ACADEMY SPORTS
# as_df = pd.read_csv('datasets/academy.csv')
# as_df = add_longlat(as_df)
# as_df.describe()

# # DICK'S SPORTS
# ds_df = pd.read_csv('datasets/dicks.csv')
# ds_df = add_longlat(ds_df)
# ds_df.describe()


# # BASS PRO SHOPS
# bp_df = pd.read_csv('datasets/basspro.csv')
# bp_df = add_longlat(bp_df)
# bp_df.describe()

# # DOLLAR TREE STORE
# dt_df = pd.read_csv('datasets/dollartree.csv')
# dt_df = add_longlat(dt_df)
# dt_df.describe()

# # GRAINGER STORE
# grg_df = pd.read_csv('datasets/grainger.csv')
# grg_df.info()
# grg_df = add_longlat(grg_df)  # this one had a bunch of garbage and extra commas
# grg_df.describe()


# # CHICK FIL A
# cf_df = pd.read_csv('datasets/chickfila.csv', encoding='latin')
# cf_df.info()
# #cf_df.to_csv('datasets/chickfila.csv')
# #time.sleep(5)
# #cf_df = pd.read_csv('datasets/chickfila.csv', encoding='latin')
# cf_df = add_longlat(cf_df)  # format was wrong, had to resave as utf-8
# cf_df.describe()


# # CRACKER BARREL
# # This one was pickled from scraping
# with open('datasets/crackerbarrel.pkl','rb') as f:
#     cb_df = pickle.load(f)

# cb_df = pd.DataFrame(cb_df)
# cb_df.columns = ['long', 'lat']
# cb_df = add_longlat(cb_df)
# cb_df.info()


# # HARLEY DAVIDSON
# hd_df = pd.read_csv('datasets/harley.csv', encoding='latin')
# #dt_hd.to_csv('datasets/harley.csv')  # did this to fix encoding
# hd_df = add_longlat(hd_df)
# hd_df.describe()


# # H&M
# # This one was pickled from scraping
# with open('datasets/hm.pkl','rb') as f:
#     hm_df = pickle.load(f)

# hm_df = pd.DataFrame(hm_df)
# hm_df.columns = ['long', 'lat']
# hm_df = add_longlat(hm_df)
# hm_df.info()


# # Hobby Lobby
# # This one was pickled from scraping
# with open('datasets/hobbylobby.pkl','rb') as f:
#     hl_df = pickle.load(f)

# hl_df = pd.DataFrame(hl_df)
# hl_df.columns = ['long', 'lat']
# hl_df = add_longlat(hl_df)
# hl_df.info()

# # LL Bean
# ll_df = pd.read_csv('datasets/llbean.csv')
# ll_df.info()
# ll_df = add_longlat(ll_df)  
# ll_df.describe()


# # Pottery Barn
# pb_df = pd.read_csv('datasets/potterybarn.csv')
# pb_df.info()
# pb_df = add_longlat(pb_df)  
# pb_df.describe()


# # REI
# rei_df = pd.read_csv('datasets/rei.csv')
# rei_df.info()
# rei_df = add_longlat(rei_df)  
# rei_df.describe()


# # Target
# tg_df = pd.read_csv('datasets/target.csv', encoding='latin')
# tg_df = tg_df.rename(columns={'Address.Longitude': "long", "Address.Latitude":'lat'})
# tg_df.info()
# #tg_df.to_csv('datasets/target.csv')  # did this to fix encoding
# tg_df = add_longlat(tg_df)  
# tg_df.describe()


# # Trader Joe's
# tj_df = pd.read_csv('datasets/traderjoes.csv')
# tj_df.info()
# tj_df = add_longlat(tj_df)  
# tj_df.describe()

# # WalMart. 
# # This one was pickled from scraping
# with open('datasets/walmart.pkl','rb') as f:
#     wal_df = pickle.load(f)

# wal_df = pd.DataFrame(wal_df)

# wal_df
# wal_df.columns = ['long', 'lat']
# wal_df = add_longlat(wal_df)
# wal_df.info()




In [None]:
# might not need all of these imports
from shapely.geometry import Polygon, Point, MultiPolygon, shape, GeometryCollection


def get_county(point, geo_df):
    """Return the fips code of the first county whose geometry contains point.

    Parameters
    ----------
    point : object
        Passed to each geometry's ``contains`` method.
    geo_df : DataFrame-like
        Must provide a 'geometry' column (objects with a ``contains`` method)
        and a 'fips' column.

    Returns
    -------
    The 'fips' value of the first row whose geometry contains ``point``,
    or None when no row matches.
    """
    # Read the code from geo_df itself rather than a module-level frame, so
    # the answer always comes from the same rows that were searched.
    for poly, fips in zip(geo_df['geometry'], geo_df['fips']):
        if poly.contains(point):
            return fips
    return None
        
def get_stores_by_county(store_df, us_county):
    """Count how many stores fall in each county.

    Every entry of ``store_df['points']`` is passed to ``get_county`` together
    with ``us_county``; the result is the ``value_counts()`` of what
    ``get_county`` returned.
    """
    county_of_store = store_df['points'].apply(lambda pt: get_county(pt, us_county))
    return county_of_store.value_counts()

# HIGH RESOURCE CODE >>>>>
#wf_counts = get_stores_by_county(wf_df, us_county)  # WHOLE FOODS
#ts_counts = get_stores_by_county(ts_df, us_county)  # TRACTOR SUPPLY STORE
#sb_counts = get_stores_by_county(sb_df, us_county)  # STARBUCKS COFFEE 
#as_counts = get_stores_by_county(as_df, us_county)  # ACADEMY SPORTS  
#ds_counts = get_stores_by_county(ds_df, us_county)  # DICK'S SPORTS
#bp_counts = get_stores_by_county(bp_df, us_county)  # BASS PRO
#dt_counts = get_stores_by_county(dt_df, us_county)  # DOLLAR TREE
#grg_counts = get_stores_by_county(grg_df, us_county)  # GRAINGER
#cf_counts = get_stores_by_county(cf_df, us_county)  # CHICK-FIL-A
#cb_counts = get_stores_by_county(cb_df, us_county)  # CRACKER BARREL
#hd_counts = get_stores_by_county(hd_df, us_county)  # HARLEY DAVIDSON
#hm_counts = get_stores_by_county(hm_df, us_county)  # H and M
#hl_counts = get_stores_by_county(hl_df, us_county)  # HOBBY LOBBY
#ll_counts = get_stores_by_county(ll_df, us_county)  # LL BEAN
#pb_counts = get_stores_by_county(pb_df, us_county)   # POTTERY BARN
#rei_counts = get_stores_by_county(rei_df, us_county)   # REI
#tg_counts = get_stores_by_county(tg_df, us_county)   # TARGET
#tj_counts = get_stores_by_county(tj_df, us_county)   # TRADER JOE'S
#wal_counts = get_stores_by_county(wal_df, us_county)   # WALMART



# USE THIS TO IMPORT FROM PICKLE
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open('datasets/store_df.pkl', 'rb') as store_file:
    df_stores = pickle.load(store_file)

# you can't copy party/blue straight across from df2_final until the two
# indexes are aligned, so each store row is matched to df2_final by fips instead

# Build the fips -> blue lookup once instead of filtering df2_final per row.
blue_by_fips = dict(zip(df2_final['fips'], df2_final['blue']))

# NOTE(review): column position 6 of df_stores is presumably 'fips' — confirm.
# A fips missing from df2_final raises a KeyError naming the code.
blue = [int(blue_by_fips[str(store_fips)]) for store_fips in df_stores.iloc[:, 6]]

df_stores['blue'] = blue
df_stores.info()

In [None]:
df_stores.describe()

In [None]:
import pickle

def return_count(fips, wf_counts):
    """Return the count stored under ``fips`` in ``wf_counts``, or 0 if absent.

    Parameters
    ----------
    fips : hashable
        Key to look up.
    wf_counts : mapping or pandas Series
        Anything supporting ``wf_counts[fips]``.

    Only a missing key/label is treated as "no stores"; any other error
    (for example passing an object that is not subscriptable) propagates.
    """
    try:
        return wf_counts[fips]
    except LookupError:  # KeyError / IndexError: fips not present
        return 0

# df_stores['WholeFoods'] = df_stores.fips.apply(return_count, args=[wf_counts])
# df_stores['TractorSupply'] = df_stores.fips.apply(return_count, args=[ts_counts])
# df_stores['Starbucks'] = df_stores.fips.apply(return_count, args=[sb_counts])
# df_stores['AcademySports'] = df_stores.fips.apply(return_count, args=[as_counts])
# df_stores['DicksSports'] = df_stores.fips.apply(return_count, args=[ds_counts])
# df_stores['BassPro'] = df_stores.fips.apply(return_count, args=[bp_counts])
# df_stores['DollarTree'] = df_stores.fips.apply(return_count, args=[dt_counts])
# df_stores['Grainger'] = df_stores.fips.apply(return_count, args=[grg_counts])
# df_stores['ChickFila'] = df_stores.fips.apply(return_count, args=[cf_counts])
# df_stores['CrackerBarrel'] = df_stores.fips.apply(return_count, args=[cb_counts])
# df_stores['HarleyDavidson'] = df_stores.fips.apply(return_count, args=[hd_counts])
# df_stores['HM'] = df_stores.fips.apply(return_count, args=[hm_counts])
# df_stores['HobbyLobby'] = df_stores.fips.apply(return_count, args=[hl_counts])
#df_stores['LLBean'] = df_stores.fips.apply(return_count, args=[ll_counts])
#df_stores['PotteryBarn'] = df_stores.fips.apply(return_count, args=[pb_counts])
#df_stores['REI'] = df_stores.fips.apply(return_count, args=[rei_counts])
#df_stores['Target'] = df_stores.fips.apply(return_count, args=[tg_counts])
#df_stores['TraderJoes'] = df_stores.fips.apply(return_count, args=[tj_counts])
#df_stores['Walmart'] = df_stores.fips.apply(return_count, args=[wal_counts])

#df_stores.to_pickle("datasets/store_df.pkl")

In [None]:
#df_stores.to_pickle("datasets/store_df.pkl")



In [None]:


    
#make_choro(df_stores1, us_county_fix, 'WholeFoods', ['lightgray', 'green'], [0, 4])
#make_choro(df_stores2, us_county_fix, 'TractorSupply', ['lightgray', 'red'], [0, 4])
#make_choro(df_stores5, us_county_fix, 'Starbucks', ['white', 'green'], [0, 50])
#make_choro(df_stores5, us_county_fix, 'AcademySports', ['white', 'blue'], [0, 2])
#make_choro(df_stores5, us_county_fix, 'DicksSports', ['white', 'orange'], [0, 2])
#make_choro(df_stores7, us_county_fix, 'BassPro', ['lightyellow', 'red'], [0, 2])
#make_choro(df_stores7, us_county_fix, 'DollarTree', ['white', 'green'], [0, 50])
#make_choro(df_stores, us_county_fix, 'Grainger', ['lightgray', 'red'], [0, 3])
#make_choro(df_stores, us_county_fix, 'ChickFila', ['lightgray', 'red'], [0, 20])
#make_choro(df_stores, us_county_fix, 'CrackerBarrel', ['lightgray', 'gold'], [0, 4])
#make_choro(df_stores, us_county_fix, 'HarleyDavidson', ['lightgray', 'orange'], [0, 2])
#make_choro(df_stores, us_county_fix, 'HM', ['white', 'red'], [0, 2])
#make_choro(df_stores, us_county_fix, 'HobbyLobby', ['white', 'orange'], [0, 3])
#make_choro(df_stores, us_county_fix, 'LLBean', ['white', 'darkgreen'], [0, 1])
#make_choro(df_stores, us_county_fix, 'PotteryBarn', ['white', 'blue'], [0, 1])
#make_choro(df_stores, us_county_fix, 'REI', ['white', 'darkgreen'], [0, 1])
#make_choro(df_stores, us_county_fix, 'Target', ['white', 'red'], [0, 10])
#make_choro(df_stores, us_county_fix, 'TraderJoes', ['white', 'red'], [0, 5])
make_choro(df2_final, us_county_fix, 'blue', ['red', 'blue'], [0, 1])
#make_choro(df_stores, us_county_fix, 'blue', ['red', 'blue'], [0, 1])


In [None]:
# To complete my df_store and make it final, I want to do a couple experiments
# I want to simply add a population col
# I would like to make a second model using stores/person for each column

# Add population data
pop_df = pd.read_csv('datasets/county_pop.csv', encoding='latin')
#pop_df.to_csv('datasets/county_pop.csv')

# Keep STATE, COUNTY, CTYNAME, POPESTIMATE2018
# .copy() makes the subset an independent frame, so adding the fips column
# below does not write into a slice of the original.
pop_df = pop_df[['STATE', 'COUNTY', 'CTYNAME', 'POPESTIMATE2018']].copy()

# Build the fips code: state zero-padded to 2 digits, then county zero-padded to 3
fips_list = [
    "{:02}{:03}".format(state, county)
    for state, county in zip(pop_df['STATE'], pop_df['COUNTY'])
]

# Assigning a plain list sets values row by row in order, with no index alignment
pop_df['fips'] = fips_list

pop_df.head()


In [None]:
us_county.columns

In [None]:
def get_pop(fips, pop_df):
    """Return the POPESTIMATE2018 value for the county with the given fips.

    Parameters
    ----------
    fips : value compared against ``pop_df['fips']``
    pop_df : DataFrame with 'fips' and 'POPESTIMATE2018' columns

    Returns
    -------
    The first matching POPESTIMATE2018 value.  When no row matches, the fips
    is printed for troubleshooting and 0 is returned.  A missing column is a
    programming error and raises KeyError instead of being reported as 0.
    """
    matches = pop_df.loc[pop_df['fips'] == fips, 'POPESTIMATE2018']
    if matches.empty:
        print(fips)
        return 0
    return matches.values[0]

def get_area(fips, us_county_df):
    """Return the CENSUSAREA of the county with the given fips as a float.

    Parameters
    ----------
    fips : value compared against ``us_county_df['fips']``
    us_county_df : DataFrame with 'fips' and 'CENSUSAREA' columns

    Returns
    -------
    The area as a float when exactly one row matches.  When zero or several
    rows match, or the value cannot be converted to float, the fips and the
    reason are printed and 0 is returned.
    """
    matches = us_county_df.loc[us_county_df['fips'] == fips, 'CENSUSAREA']
    if len(matches) != 1:
        print(fips, "expected exactly one matching county, found {}".format(len(matches)))
        return 0
    try:
        # Convert the single scalar; calling float() on a whole Series is deprecated
        return float(matches.iloc[0])
    except (TypeError, ValueError) as e:
        print(fips, e)
        return 0
    
# fips is compared as a string in both lookups below
us_county['fips'] = us_county['fips'].astype(str)
# population comes from pop_df; area comes from the geo frame us_county
df_stores['population'] = df_stores['fips'].apply(lambda code: get_pop(code, pop_df))
df_stores['area'] = df_stores['fips'].apply(lambda code: get_area(code, us_county))

In [None]:
us_county.head()
df_stores.head()

In [None]:
# 6181
# the population of bedford city county va (fips 51515) is 6181.  Let's fill it in manually
# Select the row by its fips code rather than by a hard-coded index label:
# .loc with a label that does not exist would silently append a new row.
is_bedford_city = df_stores['fips'] == '51515'
df_stores.loc[is_bedford_city, 'population'] = 6181
df_stores.loc[is_bedford_city]  # fixed


In [None]:
# Looks good, let's pickle it and start our ML project
#df_stores.to_pickle("datasets/house_df.pkl")
df_stores.to_pickle('datasets/all_df.pkl')
#df_stores.to_pickle('datasets/major_df.pkl')

In [None]:
# little extra to fix the df before finally sending it to pickle
# This code was run once only to fix the dataset

# import pickle
# import pandas as pd

# # import data

# with open('datasets/final_df.pkl', 'rb') as f:
#     df = pickle.load(f)
    
# #had an extra column and used this to fix it
# df = df.drop(df.columns[-1], axis=1)


# df.party = df.party.apply(lambda x: x.title())
# df.county = df.county.apply(lambda x: x.title())
# # df = df.drop(columns=['totalvotes'], axis=1)

# df.head()
# #df.to_pickle('datasets/final_df.pkl')
# df.to_pickle('datasets/house_df.pkl')

In [None]:
df_stores.head()