In [1]:
#Dependencies
import os
import csv
import pandas as pd

In [2]:
# Save path to data set in a variable
population_file = "sub-est2016_all.csv"
# Source: https://www2.census.gov/programs-surveys/popest/datasets/2010-2016/cities/totals/sub-est2016_all.csv

states_file = "50_us_states_all_data.csv"
# Source: https://scottontechnology.com/list-of-50-us-states-in-excel/

In [3]:
# Use Pandas to read data
population_pd = pd.read_csv(population_file, encoding="ISO-8859-1")
population_pd.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016
0,40,1,0,0,0,0,0,A,Alabama,Alabama,4779736,4780131,4785492,4799918,4815960,4829479,4843214,4853875,4863300
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,2688,2688,2683,2685,2647,2631,2619,2616,2603
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,4522,4522,4517,4495,4472,4447,4428,4395,4360
3,162,1,0,484,0,0,0,A,Addison town,Alabama,758,756,754,753,748,748,747,740,738
4,162,1,0,676,0,0,0,A,Akron town,Alabama,356,356,355,345,345,342,337,337,334


In [4]:
# Clean Up by referencing multiple columns within a DataFrame
population_df = population_pd[["NAME", "STNAME", "POPESTIMATE2016"]]
population_df.head()

Unnamed: 0,NAME,STNAME,POPESTIMATE2016
0,Alabama,Alabama,4863300
1,Abbeville city,Alabama,2603
2,Adamsville city,Alabama,4360
3,Addison town,Alabama,738
4,Akron town,Alabama,334


In [5]:
# Use Pandas to read list of States and Abbreviation
states_pd = pd.read_csv(states_file, encoding="ISO-8859-1", header = None)
states_pd.head()

Unnamed: 0,0,1,2,3
0,ALABAMA,Alabama,AL,Ala.
1,ALASKA,Alaska,AK,Alaska
2,ARIZONA,Arizona,AZ,Ariz.
3,ARKANSAS,Arkansas,AR,Ark.
4,CALIFORNIA,California,CA,Calif.


In [6]:
# Rename header for list of States and Abbreviation
states_pd.columns=['States_CAP',
                   'STNAME',
                   'State',
                   'Abbrev']

states_pd.head()

Unnamed: 0,States_CAP,STNAME,State,Abbrev
0,ALABAMA,Alabama,AL,Ala.
1,ALASKA,Alaska,AK,Alaska
2,ARIZONA,Arizona,AZ,Ariz.
3,ARKANSAS,Arkansas,AR,Ark.
4,CALIFORNIA,California,CA,Calif.


In [7]:
# Clean Up by referencing multiple columns within a DataFrame
states_df = states_pd[["STNAME", "State"]]
states_df.head()

Unnamed: 0,STNAME,State
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [8]:
# Merge two dataframes using a left join
merged_df = pd.merge(states_df, population_df, on="STNAME", how="right")
merged_df.tail()

Unnamed: 0,STNAME,State,NAME,POPESTIMATE2016
81580,Wyoming,WY,Balance of Weston County,2595
81581,District of Columbia,,District of Columbia,681170
81582,District of Columbia,,Washington city,681170
81583,District of Columbia,,District of Columbia,681170
81584,District of Columbia,,Washington city,681170


In [9]:
# Clean Up data by filling in Abbrev Column with NAN values as DC 
merged_df = merged_df.fillna("DC")
merged_df.tail()

Unnamed: 0,STNAME,State,NAME,POPESTIMATE2016
81580,Wyoming,WY,Balance of Weston County,2595
81581,District of Columbia,DC,District of Columbia,681170
81582,District of Columbia,DC,Washington city,681170
81583,District of Columbia,DC,District of Columbia,681170
81584,District of Columbia,DC,Washington city,681170


In [10]:
# Convert NAME State & City Column to all uppercase

merged_df["NAME"] = merged_df["NAME"].str.upper()
merged_df.head()

Unnamed: 0,STNAME,State,NAME,POPESTIMATE2016
0,Alabama,AL,ALABAMA,4863300
1,Alabama,AL,ABBEVILLE CITY,2603
2,Alabama,AL,ADAMSVILLE CITY,4360
3,Alabama,AL,ADDISON TOWN,738
4,Alabama,AL,AKRON TOWN,334


In [11]:
# Push the merged Population DataFrame to a new CSV file
merged_df.to_csv("population_clean.csv",
                  encoding="utf-8", index=True, header=True)

In [12]:
##########################  TEST CODES ##########################

In [13]:
# TEST CODE to grab the data contained within the "NAME" column in population_clean.csv using RED WING (city where Mayo clinic is located)
test = merged_df[merged_df["NAME"].str.contains("RED WING")]
test

Unnamed: 0,STNAME,State,NAME,POPESTIMATE2016
33176,Minnesota,MN,RED WING CITY,16526
34705,Minnesota,MN,RED WING CITY,16526
34736,Minnesota,MN,RED WING CITY,16526
34737,Minnesota,MN,RED WING CITY,16526


In [31]:
test.columns

Index(['STNAME', 'State', 'NAME', 'POPESTIMATE2016'], dtype='object')

In [53]:
POP = merged_df[['State', 'STNAME', 'NAME', 'POPESTIMATE2016']]
POP.head()

Unnamed: 0,State,STNAME,NAME,POPESTIMATE2016
0,AL,Alabama,ALABAMA,4863300
1,AL,Alabama,ABBEVILLE CITY,2603
2,AL,Alabama,ADAMSVILLE CITY,4360
3,AL,Alabama,ADDISON TOWN,738
4,AL,Alabama,AKRON TOWN,334


In [49]:
# Save path to TEST data frame in a variable
df_file = "test_df.csv"
df_file

'test_df.csv'

In [50]:
# Read TEST data frame
test_pd = pd.read_csv(df_file)
test_pd

Unnamed: 0,Hospital,Location,City,City Population,County,County Population,Mortality Rates (by county),State,Survey Rating,Surveys Completed,Survey Response Rate (%)
0,MAYO CLINIC HEALTH SYSTEM - RED WING,"701 HEWITT BOULEVARD, PO BOX 95\nRED WING, MN\...",RED WING,,GOODHUE,,,MN,,347,33
1,PROMEDICA HERRICK HOSPITAL,"500 E POTTAWATAMIE STREET\nTECUMSEH, MI\n(42.0...",TECUMSEH,,LENAWEE,,,MI,,Not Available,Not Available
2,BRONSON LAKEVIEW HOSPITAL,"408 HAZEN STREET\nPAW PAW, MI\n(42.221009, -85...",PAW PAW,,VAN BUREN,,,MI,,130,30
3,MAYO CLINIC HOSPITAL ROCHESTER,"1216 SECOND STREET SOUTHWEST\nROCHESTER, MN\n(...",ROCHESTER,,OLMSTED,,,MN,,,


In [51]:
test_pd.columns

Index(['Hospital', 'Location', 'City', 'City Population', 'County',
       'County Population', 'Mortality Rates (by county)', 'State',
       'Survey Rating', 'Surveys Completed', 'Survey Response Rate (%)'],
      dtype='object')

In [52]:
PE = test_pd[['State', 'City','City Population','Hospital', 'Location',  'County',
       'County Population', 'Mortality Rates (by county)', 'Survey Rating', 'Surveys Completed', 'Survey Response Rate (%)']]
PE.head()

Unnamed: 0,State,City,City Population,Hospital,Location,County,County Population,Mortality Rates (by county),Survey Rating,Surveys Completed,Survey Response Rate (%)
0,MN,RED WING,,MAYO CLINIC HEALTH SYSTEM - RED WING,"701 HEWITT BOULEVARD, PO BOX 95\nRED WING, MN\...",GOODHUE,,,,347,33
1,MI,TECUMSEH,,PROMEDICA HERRICK HOSPITAL,"500 E POTTAWATAMIE STREET\nTECUMSEH, MI\n(42.0...",LENAWEE,,,,Not Available,Not Available
2,MI,PAW PAW,,BRONSON LAKEVIEW HOSPITAL,"408 HAZEN STREET\nPAW PAW, MI\n(42.221009, -85...",VAN BUREN,,,,130,30
3,MN,ROCHESTER,,MAYO CLINIC HOSPITAL ROCHESTER,"1216 SECOND STREET SOUTHWEST\nROCHESTER, MN\n(...",OLMSTED,,,,,


In [60]:
grouped_PE = PE.groupby(['State', 'City'])

# The object returned is a "GroupBy" object and cannot be viewed normally...
print(grouped_PE)

# In order to be visualized, a data function must be used...
grouped_PE.count().head()

<pandas.core.groupby.DataFrameGroupBy object at 0x00000254DB3DB4E0>


Unnamed: 0_level_0,Unnamed: 1_level_0,City Population,Hospital,Location,County,County Population,Mortality Rates (by county),Survey Rating,Surveys Completed,Survey Response Rate (%)
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MI,PAW PAW,0,1,1,1,0,0,0,1,1
MI,TECUMSEH,0,1,1,1,0,0,0,1,1
MN,RED WING,0,1,1,1,0,0,0,1,1
MN,ROCHESTER,0,1,1,1,0,0,0,0,0


In [69]:
# Loop through PE (df2) and POP (df1) dfs

# for index, row in grouped_PE.apply():

# 1) Match by State
    if PE["State"] == POP["State"]
       print("Yes")
    
        else:
       print("No")

   # 2) If PE['City'] is in POP['City'], 
        # add the corresponding value of POP['POPESTIMATE2016'] to the row of PE['City Population'] where there is a match.

# xx=POP.set_index('City').values.tolist()
# df2['New']=df2.code.apply(lambda x : [y for y in xx if x in y] )
# df2=df2[df2.New.apply(len)>0]
# df2['New']=df2.New.apply(pd.Series)[0].apply(lambda x : ','.join(x))
# df2

# What I need to do is, if df2['code'] is in df1['allcodes'], 
# add the corresponding value of allcodes to the end of a row in df2 where there is a match.

#https://stackoverflow.com/questions/46591928/python-pandas-add-value-from-one-df-to-end-of-row-in-another-df-if-there-is-a-m


IndentationError: unexpected indent (<ipython-input-69-2f9721e7cefc>, line 6)