# County Source Data

Using Pandas to import csv file of Zipcodes to identify counties associated with cities in the Instacart dataset.  

The reason for this is that the Food Desert data is based on county level information and the only way to compare the areas is to utilize the data set and join in MongoDB.

### Import dependencies

In [1]:
import os
import pandas as pd
from pandas import DataFrame
import pymongo

### Setup DB connection and establish collection for storage

In [2]:
# Setup connection to MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Create Collection to receive data (first drop and then )
db = client.food_desert_db
collection = db.zipcode_county_data
collection.drop()

In [4]:
collection = db.zipcode_county_data

### Import and read file

Bringing in path for datafile as well as reference file of states for Instacart Service area

In [5]:
csv_path = os.path.join('Data', 'zip_code_database.csv')
states_path = os.path.join('Data','instacart_states.csv')

In [6]:
states = pd.read_csv(states_path)
states.head()

Unnamed: 0.1,Unnamed: 0,state,counts
0,0,NY,851
1,1,PA,850
2,2,CA,704
3,3,TX,524
4,4,OH,513


In [7]:
pd.read_csv(csv_path, nrows=1).columns

Index(['zip', 'type', 'decommissioned', 'primary_city', 'acceptable_cities',
       'unacceptable_cities', 'state', 'county', 'timezone', 'area_codes',
       'world_region', 'country', 'latitude', 'longitude',
       'irs_estimated_population_2015'],
      dtype='object')

In [8]:
data = pd.read_csv(csv_path)[['primary_city', 'acceptable_cities','state', 'county']]
data.head(10)

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,,NY,Suffolk County
1,Holtsville,,NY,Suffolk County
2,Adjuntas,,PR,Adjuntas Municipio
3,Aguada,,PR,Aguada Municipio
4,Aguadilla,Ramey,PR,Aguadilla Municipio
5,Aguadilla,Ramey,PR,
6,Aguadilla,,PR,
7,Maricao,,PR,Maricao Municipio
8,Anasco,,PR,Anasco Municipio
9,Angeles,,PR,


In [9]:
data.count()

primary_city         42632
acceptable_cities     9023
state                42632
county               41790
dtype: int64

In [10]:
data.loc[data.isnull().any(axis=1)]

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,,NY,Suffolk County
1,Holtsville,,NY,Suffolk County
2,Adjuntas,,PR,Adjuntas Municipio
3,Aguada,,PR,Aguada Municipio
5,Aguadilla,Ramey,PR,
6,Aguadilla,,PR,
7,Maricao,,PR,Maricao Municipio
8,Anasco,,PR,Anasco Municipio
9,Angeles,,PR,
10,Arecibo,,PR,Arecibo Municipio


In [11]:
data.dropna(subset=['county'], inplace=True)

In [12]:
#data.drop(data.loc[data['state']=="PR"].index, inplace=True)
data = data[data['state'].isin(states['state'])]

In [13]:
data['acceptable_cities'].fillna(data['primary_city'],inplace=True)
data.head()

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,Holtsville,NY,Suffolk County
1,Holtsville,Holtsville,NY,Suffolk County
194,Agawam,Agawam,MA,Hampden County
195,Amherst,"Cushman, Pelham",MA,Hampshire County
196,Amherst,Amherst,MA,Hampshire County


In [14]:
data_duplicates_removed = DataFrame.drop_duplicates(data).copy()
data_duplicates_removed.count()

primary_city         31912
acceptable_cities    31912
state                31912
county               31912
dtype: int64

In [15]:
data_duplicates_removed.head()

Unnamed: 0,primary_city,acceptable_cities,state,county
0,Holtsville,Holtsville,NY,Suffolk County
194,Agawam,Agawam,MA,Hampden County
195,Amherst,"Cushman, Pelham",MA,Hampshire County
196,Amherst,Amherst,MA,Hampshire County
198,Barre,Barre,MA,Worcester County


### Add column to insert city, state so that it can be used to match up with the instacart data and drop unnecessary columns

In [16]:
data_duplicates_removed['primary_city-state'] = data_duplicates_removed['primary_city'] +', '+ data_duplicates_removed['state']
data_duplicates_removed['acceptable_city-state'] = data_duplicates_removed['acceptable_cities'] +', '+ data_duplicates_removed['state']
data_duplicates_removed['primary_city-state/county'] = data_duplicates_removed['primary_city'] +', '+ data_duplicates_removed['state'] +'/'+ data_duplicates_removed['county']
data_duplicates_removed['acceptable_city-state/county'] = data_duplicates_removed['acceptable_cities'] +', '+ data_duplicates_removed['state'] +'/'+ data_duplicates_removed['county']
data_duplicates_removed.head(10)

Unnamed: 0,primary_city,acceptable_cities,state,county,primary_city-state,acceptable_city-state,primary_city-state/county,acceptable_city-state/county
0,Holtsville,Holtsville,NY,Suffolk County,"Holtsville, NY","Holtsville, NY","Holtsville, NY/Suffolk County","Holtsville, NY/Suffolk County"
194,Agawam,Agawam,MA,Hampden County,"Agawam, MA","Agawam, MA","Agawam, MA/Hampden County","Agawam, MA/Hampden County"
195,Amherst,"Cushman, Pelham",MA,Hampshire County,"Amherst, MA","Cushman, Pelham, MA","Amherst, MA/Hampshire County","Cushman, Pelham, MA/Hampshire County"
196,Amherst,Amherst,MA,Hampshire County,"Amherst, MA","Amherst, MA","Amherst, MA/Hampshire County","Amherst, MA/Hampshire County"
198,Barre,Barre,MA,Worcester County,"Barre, MA","Barre, MA","Barre, MA/Worcester County","Barre, MA/Worcester County"
199,Belchertown,Belchertown,MA,Hampshire County,"Belchertown, MA","Belchertown, MA","Belchertown, MA/Hampshire County","Belchertown, MA/Hampshire County"
200,Blandford,Blandford,MA,Hampden County,"Blandford, MA","Blandford, MA","Blandford, MA/Hampden County","Blandford, MA/Hampden County"
201,Bondsville,Bondsville,MA,Hampden County,"Bondsville, MA","Bondsville, MA","Bondsville, MA/Hampden County","Bondsville, MA/Hampden County"
202,Brimfield,Brimfield,MA,Hampden County,"Brimfield, MA","Brimfield, MA","Brimfield, MA/Hampden County","Brimfield, MA/Hampden County"
203,Chester,Chester,MA,Hampden County,"Chester, MA","Chester, MA","Chester, MA/Hampden County","Chester, MA/Hampden County"


In [17]:
data_duplicates_removed['primary_city-state/county'].value_counts()

Pittsburgh, PA/Allegheny County           41
Miami, FL/Miami-Dade County               35
Cincinnati, OH/Hamilton County            32
Los Angeles, CA/Los Angeles County        29
Louisville, KY/Jefferson County           26
Fort Lauderdale, FL/Broward County        22
Saint Louis, MO/St. Louis County          21
Cleveland, OH/Cuyahoga County             20
Minneapolis, MN/Hennepin County           20
Indianapolis, IN/Marion County            19
Salt Lake City, UT/Salt Lake County       18
San Antonio, TX/Bexar County              17
Birmingham, AL/Jefferson County           17
New York, NY/New York County              16
Fort Worth, TX/Tarrant County             15
Buffalo, NY/Erie County                   15
Tampa, FL/Hillsborough County             15
Orlando, FL/Orange County                 15
Milwaukee, WI/Milwaukee County            14
Dayton, OH/Montgomery County              14
Atlanta, GA/Fulton County                 13
Saint Petersburg, FL/Pinellas County      13
Seattle, W

In [18]:
data_duplicates_removed['acceptable_city-state/county'].value_counts()

Flushing, NY/Queens County                                                                 12
Boston, MA/Suffolk County                                                                  11
Baltimore, MD/Baltimore County                                                             11
Jamaica, NY/Queens County                                                                  10
Cleveland, OH/Cuyahoga County                                                              10
Detroit, MI/Wayne County                                                                    7
Richmond, RI/Washington County                                                              6
Gallup, NM/McKinley County                                                                  6
Kingman, AZ/Mohave County                                                                   5
Flagstaff, AZ/Coconino County                                                               5
Chinle, AZ/Apache County                                    

In [19]:
primary_cities = data_duplicates_removed[['primary_city-state/county']]
primary_cities.rename(columns = {'primary_city-state/county': 'city-state/county'}, inplace = True) 
primary_cities.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
194,"Agawam, MA/Hampden County"
195,"Amherst, MA/Hampshire County"
196,"Amherst, MA/Hampshire County"
198,"Barre, MA/Worcester County"


In [20]:
acceptable_cities = data_duplicates_removed[['acceptable_city-state/county']]
acceptable_cities.rename(columns = {'acceptable_city-state/county': 'city-state/county'}, inplace = True) 
acceptable_cities.head()

Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
194,"Agawam, MA/Hampden County"
195,"Cushman, Pelham, MA/Hampshire County"
196,"Amherst, MA/Hampshire County"
198,"Barre, MA/Worcester County"


In [21]:
all_cities = pd.merge(primary_cities, acceptable_cities, on='city-state/county', how='outer')
all_cities.head()

Unnamed: 0,city-state/county
0,"Holtsville, NY/Suffolk County"
1,"Agawam, MA/Hampden County"
2,"Amherst, MA/Hampshire County"
3,"Amherst, MA/Hampshire County"
4,"Amherst, MA/Hampshire County"


In [22]:
all_cities.count()

city-state/county    40503
dtype: int64

In [23]:
all_cities[['city-state','county']] = all_cities['city-state/county'].str.split('/', n=1, expand=True)
all_cities.head()

Unnamed: 0,city-state/county,city-state,county
0,"Holtsville, NY/Suffolk County","Holtsville, NY",Suffolk County
1,"Agawam, MA/Hampden County","Agawam, MA",Hampden County
2,"Amherst, MA/Hampshire County","Amherst, MA",Hampshire County
3,"Amherst, MA/Hampshire County","Amherst, MA",Hampshire County
4,"Amherst, MA/Hampshire County","Amherst, MA",Hampshire County


In [24]:
collection.insert_many(all_cities.to_dict('records'))

<pymongo.results.InsertManyResult at 0x251544aa888>

In [25]:
listings = db.zipcode_county_data.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5e1a977963f1268ef8667c9d'), 'city-state/county': 'Holtsville, NY/Suffolk County', 'city-state': 'Holtsville, NY', 'county': 'Suffolk County'}
{'_id': ObjectId('5e1a977963f1268ef8667c9e'), 'city-state/county': 'Agawam, MA/Hampden County', 'city-state': 'Agawam, MA', 'county': 'Hampden County'}
{'_id': ObjectId('5e1a977963f1268ef8667c9f'), 'city-state/county': 'Amherst, MA/Hampshire County', 'city-state': 'Amherst, MA', 'county': 'Hampshire County'}
{'_id': ObjectId('5e1a977963f1268ef8667ca0'), 'city-state/county': 'Amherst, MA/Hampshire County', 'city-state': 'Amherst, MA', 'county': 'Hampshire County'}
{'_id': ObjectId('5e1a977963f1268ef8667ca1'), 'city-state/county': 'Amherst, MA/Hampshire County', 'city-state': 'Amherst, MA', 'county': 'Hampshire County'}
{'_id': ObjectId('5e1a977963f1268ef8667ca2'), 'city-state/county': 'Amherst, MA/Hampshire County', 'city-state': 'Amherst, MA', 'county': 'Hampshire County'}
{'_id': ObjectId('5e1a977963f1268ef8667ca3'), 'city-stat

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'_id': ObjectId('5e1a977a63f1268ef866f6c0'), 'city-state/county': 'San Luis Obispo, CA/San Luis Obispo County', 'city-state': 'San Luis Obispo, CA', 'county': 'San Luis Obispo County'}
{'_id': ObjectId('5e1a977a63f1268ef866f6c1'), 'city-state/county': 'San Luis Obispo, CA/San Luis Obispo County', 'city-state': 'San Luis Obispo, CA', 'county': 'San Luis Obispo County'}
{'_id': ObjectId('5e1a977a63f1268ef866f6c2'), 'city-state/county': 'Los Osos, CA/San Luis Obispo County', 'city-state': 'Los Osos, CA', 'county': 'San Luis Obispo County'}
{'_id': ObjectId('5e1a977a63f1268ef866f6c3'), 'city-state/county': 'Arroyo Grande, CA/San Luis Obispo County', 'city-state': 'Arroyo Grande, CA', 'county': 'San Luis Obispo County'}
{'_id': ObjectId('5e1a977a63f1268ef866f6c4'), 'city-state/county': 'Atascadero, CA/San Luis Obispo County', 'city-state': 'Atascadero, CA', 'county': 'San Luis Obispo County'}
{'_id': ObjectId('5e1a977a63f1268ef866f6c5'), 'city-state/county': 'Avila Beach, CA/San Luis Obisp