# County Source Data

Using Pandas to import csv file of Zipcodes to identify counties associated with cities in the Instacart dataset.  

The reason for this is that the Food Desert data is based on county level information and the only way to compare the areas is to utilize the data set and join in MongoDB.

### Import dependencies

In [1]:
import os
import pandas as pd
from pandas import DataFrame
import pymongo

### Setup DB connection and establish collection for storage

In [2]:
# Setup connection to MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Create Collection to receive data
db = client.food_desert_db
collection = db.zipcode_county_data

### Import and read file

In [4]:
csv_path = os.path.join('Data', 'zip_code_database.csv')
#zips_df = pd.read_csv(csv_path)
#zips_df.head(10)

In [5]:
pd.read_csv(csv_path, nrows=1).columns

Index(['zip', 'type', 'decommissioned', 'primary_city', 'acceptable_cities',
       'unacceptable_cities', 'state', 'county', 'timezone', 'area_codes',
       'world_region', 'country', 'latitude', 'longitude',
       'irs_estimated_population_2015'],
      dtype='object')

In [6]:
data = pd.read_csv(csv_path)[['primary_city', 'acceptable_cities','state', 'county']]
data.head(10)

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,,NY,Suffolk County
1,Holtsville,,NY,Suffolk County
2,Adjuntas,,PR,Adjuntas Municipio
3,Aguada,,PR,Aguada Municipio
4,Aguadilla,Ramey,PR,Aguadilla Municipio
5,Aguadilla,Ramey,PR,
6,Aguadilla,,PR,
7,Maricao,,PR,Maricao Municipio
8,Anasco,,PR,Anasco Municipio
9,Angeles,,PR,


In [7]:
data.count()

primary_city         42632
acceptable_cities     9023
state                42632
county               41790
dtype: int64

In [8]:
data.loc[data.isnull().any(axis=1)]

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,,NY,Suffolk County
1,Holtsville,,NY,Suffolk County
2,Adjuntas,,PR,Adjuntas Municipio
3,Aguada,,PR,Aguada Municipio
5,Aguadilla,Ramey,PR,
6,Aguadilla,,PR,
7,Maricao,,PR,Maricao Municipio
8,Anasco,,PR,Anasco Municipio
9,Angeles,,PR,
10,Arecibo,,PR,Arecibo Municipio


In [9]:
data.dropna(subset=['county'], inplace=True)

In [10]:
data['acceptable_cities'].fillna(data['primary_city'],inplace=True)
data.head()

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,Holtsville,NY,Suffolk County
1,Holtsville,Holtsville,NY,Suffolk County
2,Adjuntas,Adjuntas,PR,Adjuntas Municipio
3,Aguada,Aguada,PR,Aguada Municipio
4,Aguadilla,Ramey,PR,Aguadilla Municipio


In [11]:
data_duplicates_removed = DataFrame.drop_duplicates(data).copy()
data_duplicates_removed.count()

primary_city         32033
acceptable_cities    32033
state                32033
county               32033
dtype: int64

In [12]:
data_duplicates_removed.head()

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,Holtsville,NY,Suffolk County
2,Adjuntas,Adjuntas,PR,Adjuntas Municipio
3,Aguada,Aguada,PR,Aguada Municipio
4,Aguadilla,Ramey,PR,Aguadilla Municipio
7,Maricao,Maricao,PR,Maricao Municipio


### Add column to insert city, state so that it can be used to match up with the instacart data and drop unnecessary columns

In [13]:
data_duplicates_removed['primary_city-state'] = data_duplicates_removed['primary_city'] +', '+ data_duplicates_removed['state']
data_duplicates_removed['acceptable_city-state'] = data_duplicates_removed['acceptable_cities'] +', '+ data_duplicates_removed['state']
data_duplicates_removed['primary_city-state/county'] = data_duplicates_removed['primary_city'] +', '+ data_duplicates_removed['state'] +'/'+ data_duplicates_removed['county']
data_duplicates_removed['acceptable_city-state/county'] = data_duplicates_removed['acceptable_cities'] +', '+ data_duplicates_removed['state'] +'/'+ data_duplicates_removed['county']
data_duplicates_removed.head(10)

Unnamed: 0,primary_city,acceptable_cities,state,county,primary_city-state,acceptable_city-state,primary_city-state/county,acceptable_city-state/county
0,Holtsville,Holtsville,NY,Suffolk County,"Holtsville, NY","Holtsville, NY","Holtsville, NY/Suffolk County","Holtsville, NY/Suffolk County"
2,Adjuntas,Adjuntas,PR,Adjuntas Municipio,"Adjuntas, PR","Adjuntas, PR","Adjuntas, PR/Adjuntas Municipio","Adjuntas, PR/Adjuntas Municipio"
3,Aguada,Aguada,PR,Aguada Municipio,"Aguada, PR","Aguada, PR","Aguada, PR/Aguada Municipio","Aguada, PR/Aguada Municipio"
4,Aguadilla,Ramey,PR,Aguadilla Municipio,"Aguadilla, PR","Ramey, PR","Aguadilla, PR/Aguadilla Municipio","Ramey, PR/Aguadilla Municipio"
7,Maricao,Maricao,PR,Maricao Municipio,"Maricao, PR","Maricao, PR","Maricao, PR/Maricao Municipio","Maricao, PR/Maricao Municipio"
8,Anasco,Anasco,PR,Anasco Municipio,"Anasco, PR","Anasco, PR","Anasco, PR/Anasco Municipio","Anasco, PR/Anasco Municipio"
10,Arecibo,Arecibo,PR,Arecibo Municipio,"Arecibo, PR","Arecibo, PR","Arecibo, PR/Arecibo Municipio","Arecibo, PR/Arecibo Municipio"
13,Bajadero,Bajadero,PR,Arecibo Municipio,"Bajadero, PR","Bajadero, PR","Bajadero, PR/Arecibo Municipio","Bajadero, PR/Arecibo Municipio"
14,Barceloneta,Barceloneta,PR,Barceloneta Municipio,"Barceloneta, PR","Barceloneta, PR","Barceloneta, PR/Barceloneta Municipio","Barceloneta, PR/Barceloneta Municipio"
15,Boqueron,Boqueron,PR,Cabo Rojo Municipio,"Boqueron, PR","Boqueron, PR","Boqueron, PR/Cabo Rojo Municipio","Boqueron, PR/Cabo Rojo Municipio"


In [14]:
data_duplicates_removed['primary_city-state/county'].value_counts()

Pittsburgh, PA/Allegheny County              41
Miami, FL/Miami-Dade County                  35
Cincinnati, OH/Hamilton County               32
Los Angeles, CA/Los Angeles County           29
Louisville, KY/Jefferson County              26
Fort Lauderdale, FL/Broward County           22
Saint Louis, MO/St. Louis County             21
Minneapolis, MN/Hennepin County              20
Cleveland, OH/Cuyahoga County                20
Indianapolis, IN/Marion County               19
Salt Lake City, UT/Salt Lake County          18
Birmingham, AL/Jefferson County              17
San Antonio, TX/Bexar County                 17
San Juan, PR/San Juan Municipio              17
New York, NY/New York County                 16
Orlando, FL/Orange County                    15
Buffalo, NY/Erie County                      15
Fort Worth, TX/Tarrant County                15
Tampa, FL/Hillsborough County                15
Milwaukee, WI/Milwaukee County               14
Dayton, OH/Montgomery County            

In [16]:
data_duplicates_removed['acceptable_city-state/county'].value_counts()

Flushing, NY/Queens County                                              12
Boston, MA/Suffolk County                                               11
Baltimore, MD/Baltimore County                                          11
Jamaica, NY/Queens County                                               10
Cleveland, OH/Cuyahoga County                                           10
Detroit, MI/Wayne County                                                 7
Richmond, RI/Washington County                                           6
Gallup, NM/McKinley County                                               6
Fairbanks, AK/Fairbanks North Star Borough                               5
Kingman, AZ/Mohave County                                                5
West Deptford, NJ/Gloucester County                                      5
Chinle, AZ/Apache County                                                 5
Flagstaff, AZ/Coconino County                                            5
Severance, CO/Weld County

In [18]:
primary_cities = data_duplicates_removed[['primary_city-state/county']]
primary_cities.rename(columns = {'primary_city-state/county': 'city-state/county'}, 
                                 inplace = True) 
primary_cities.head()

Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
2,"Adjuntas, PR/Adjuntas Municipio"
3,"Aguada, PR/Aguada Municipio"
4,"Aguadilla, PR/Aguadilla Municipio"
7,"Maricao, PR/Maricao Municipio"


In [19]:
acceptable_cities = data_duplicates_removed[['acceptable_city-state/county']]
acceptable_cities.rename(columns = {'acceptable_city-state/county': 'city-state/county'}, 
                                 inplace = True) 
acceptable_cities.head()

Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
2,"Adjuntas, PR/Adjuntas Municipio"
3,"Aguada, PR/Aguada Municipio"
4,"Ramey, PR/Aguadilla Municipio"
7,"Maricao, PR/Maricao Municipio"


In [20]:
all_cities = pd.merge(primary_cities, acceptable_cities, on='city-state/county', how='outer')
all_cities.head()

Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
1,"Adjuntas, PR/Adjuntas Municipio"
2,"Aguada, PR/Aguada Municipio"
3,"Aguadilla, PR/Aguadilla Municipio"
4,"Maricao, PR/Maricao Municipio"


In [21]:
all_cities.count()

city-state/county    40652
dtype: int64

In [23]:
all_cities[['city-state','county']] = all_cities['city-state/county'].str.split('/', n=1, expand=True)
all_cities.head()

Unnamed: 0,city-state/county,city-state,county
0,"Holtsville, NY/Suffolk County","Holtsville, NY",Suffolk County
1,"Adjuntas, PR/Adjuntas Municipio","Adjuntas, PR",Adjuntas Municipio
2,"Aguada, PR/Aguada Municipio","Aguada, PR",Aguada Municipio
3,"Aguadilla, PR/Aguadilla Municipio","Aguadilla, PR",Aguadilla Municipio
4,"Maricao, PR/Maricao Municipio","Maricao, PR",Maricao Municipio


In [24]:
collection.insert_many(all_cities.to_dict('records'))

<pymongo.results.InsertManyResult at 0x1d77fb9e7c8>

In [25]:
listings = db.zipcode_county_data.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5e1a1b6e591e2ce07ac9b253'), 'city-state/county': 'Holtsville, NY/Suffolk County', 'city-state': 'Holtsville, NY', 'county': 'Suffolk County'}
{'_id': ObjectId('5e1a1b6e591e2ce07ac9b254'), 'city-state/county': 'Adjuntas, PR/Adjuntas Municipio', 'city-state': 'Adjuntas, PR', 'county': 'Adjuntas Municipio'}
{'_id': ObjectId('5e1a1b6e591e2ce07ac9b255'), 'city-state/county': 'Aguada, PR/Aguada Municipio', 'city-state': 'Aguada, PR', 'county': 'Aguada Municipio'}
{'_id': ObjectId('5e1a1b6e591e2ce07ac9b256'), 'city-state/county': 'Aguadilla, PR/Aguadilla Municipio', 'city-state': 'Aguadilla, PR', 'county': 'Aguadilla Municipio'}
{'_id': ObjectId('5e1a1b6e591e2ce07ac9b257'), 'city-state/county': 'Maricao, PR/Maricao Municipio', 'city-state': 'Maricao, PR', 'county': 'Maricao Municipio'}
{'_id': ObjectId('5e1a1b6e591e2ce07ac9b258'), 'city-state/county': 'Anasco, PR/Anasco Municipio', 'city-state': 'Anasco, PR', 'county': 'Anasco Municipio'}
{'_id': ObjectId('5e1a1b6e591e2ce07a

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




{'_id': ObjectId('5e1a1b6e591e2ce07aca37a5'), 'city-state/county': 'White Riv Jct, VT/Windsor County', 'city-state': 'White Riv Jct, VT', 'county': 'Windsor County'}
{'_id': ObjectId('5e1a1b6e591e2ce07aca37a6'), 'city-state/county': 'Brdgewtr Cors, Bridgewtr Cor, VT/Windsor County', 'city-state': 'Brdgewtr Cors, Bridgewtr Cor, VT', 'county': 'Windsor County'}
{'_id': ObjectId('5e1a1b6e591e2ce07aca37a7'), 'city-state/county': 'Ryegate, VT/Caledonia County', 'city-state': 'Ryegate, VT', 'county': 'Caledonia County'}
{'_id': ObjectId('5e1a1b6e591e2ce07aca37a8'), 'city-state/county': 'Hartland Cors, VT/Windsor County', 'city-state': 'Hartland Cors, VT', 'county': 'Windsor County'}
{'_id': ObjectId('5e1a1b6e591e2ce07aca37a9'), 'city-state/county': 'Mc Indoe Fls, VT/Caledonia County', 'city-state': 'Mc Indoe Fls, VT', 'county': 'Caledonia County'}
{'_id': ObjectId('5e1a1b6e591e2ce07aca37aa'), 'city-state/county': 'N Hartland, VT/Windsor County', 'city-state': 'N Hartland, VT', 'county': 'Wi