# Transportation in California Counties 

## Preliminaries

In [1]:
# Dependencies
import pylab as plt
import pandas as pd
import numpy as np
import os
import time

## Extract

In [2]:
# Locations of the two source CSVs inside the Resources folder
traffic_modes_path, traffic_injuries_path = (
    os.path.join("Resources", csv_name)
    for csv_name in ("transportation-to-work.csv", "road-traffic-injuries.csv")
)

# Load each CSV into its own DataFrame; low_memory=False makes pandas read
# each file in a single pass instead of in chunks
traffic_modes_df, traffic_injuries_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (traffic_modes_path, traffic_injuries_path)
)

## Transform
### Group Datasets to Build Category Specific Tables

In [5]:
# Build reference tables organized by county, region, transportation mode,
# and race/ethnicity. Each table is the per-group row count wrapped in a
# DataFrame: the grouping keys form the index and the count is the only column.

### Tables for traffic_modes_df
TM_counties_df = pd.DataFrame(
    traffic_modes_df.groupby(['county_name', 'county_fips', 'region_code']).size()
)
TM_regions_df = pd.DataFrame(
    traffic_modes_df.groupby(['region_name', 'region_code']).size()
)
TM_transport_modes_df = pd.DataFrame(
    traffic_modes_df.groupby(['mode', 'mode_name']).size()
)
TM_race_eth_df = pd.DataFrame(
    traffic_modes_df.groupby(['race_eth_name', 'race_eth_code']).size()
)

### Tables for traffic_injuries_df
TI_counties_df = pd.DataFrame(
    traffic_injuries_df.groupby(['county_name', 'county_fips', 'region_code']).size()
)
TI_regions_df = pd.DataFrame(
    traffic_injuries_df.groupby(['region_name', 'region_code']).size()
)
# Grouped by 'mode' alone (no 'mode_name' key is used for this dataset)
TI_transport_modes_df = pd.DataFrame(
    traffic_injuries_df.groupby(['mode']).size()
)
TI_race_eth_df = pd.DataFrame(
    traffic_injuries_df.groupby(['race_eth_name', 'race_eth_code']).size()
)

In [6]:
# Delete unnecessary columns and reset indices: the group keys become ordinary
# columns again, and the column labelled 0 is dropped from every table.
df_list = [
    TM_counties_df, TM_regions_df, TM_transport_modes_df, TM_race_eth_df,
    TI_counties_df, TI_regions_df, TI_transport_modes_df, TI_race_eth_df,
]

for df in df_list:
    # Both operations mutate the table in place, so the TM_*/TI_* names
    # keep pointing at the cleaned-up frames.
    df.reset_index(inplace=True)
    df.drop(columns=[0], inplace=True)

In [40]:
# Join the traffic_modes tables to the traffic_injuries tables for every
# category except transportation modes. Outer joins keep keys that appear
# in only one of the two datasets.
counties_joined = TM_counties_df.merge(
    TI_counties_df,
    on=['county_name', 'county_fips', 'region_code'],
    how='outer',
)

regions_joined = TM_regions_df.merge(
    TI_regions_df,
    on=['region_name', 'region_code'],
    how='outer',
)

race_eth_joined = TM_race_eth_df.merge(
    TI_race_eth_df,
    on=['race_eth_name', 'race_eth_code'],
    how='outer',
)

In [41]:
# Total population per county; only traffic_injuries_df carries 'totalpop'.
# NOTE(review): this adds up 'totalpop' over every row belonging to a county.
# If a county appears on many rows (the resulting totals are in the hundreds
# of millions for a single county), the sum over-counts the population --
# confirm whether a filter or a different aggregate is intended.
total_pop_per_county = (
    traffic_injuries_df[['county_name', 'totalpop']]
    .groupby(['county_name'])
    .sum()
    # Bring 'county_name' back from the index so it is a regular column
    .reset_index()
)
total_pop_per_county.head()

Unnamed: 0,county_name,totalpop
0,Alameda,486198500.0
1,Alpine,162887.7
2,Amador,6461525.0
3,Butte,54740990.0
4,Calaveras,7369064.0


In [43]:
# Attach the per-county 'totalpop' figure to counties_joined (outer join on
# the county name)
counties_pop_joined = counties_joined.merge(
    total_pop_per_county, on='county_name', how='outer'
)

# Express 'totalpop' in units of 10^5, rounded to two decimals
counties_pop_joined['totalpop'] = (counties_pop_joined['totalpop'] / 10**5).round(2)

counties_pop_joined.head()

Unnamed: 0,county_name,county_fips,region_code,totalpop
0,Alameda,6001.0,1.0,4861.99
1,Alpine,6003.0,3.0,1.63
2,Amador,6005.0,3.0,64.62
3,Butte,6007.0,2.0,547.41
4,Calaveras,6009.0,3.0,73.69


### Filter Datasets for Desired Statistics

In [122]:
# Create a table that documents workers' modes of transportation
# and is linked to the counties/race_eth tables.
#
# We are only concerned with data corresponding to counties, so rows without
# a 'county_fips' value (other geotypes) are discarded. The expression is
# wrapped in parentheses because a comment cannot be placed inside a
# backslash-continued statement.
modes_of_transportation = (
    traffic_modes_df.loc[
        traffic_modes_df['county_fips'].notna(),
        ['mode', 'mode_name', 'pop_mode', 'county_fips', 'race_eth_code'],
    ]
    # Renumber the rows from 0 and discard the old row labels
    .reset_index(drop=True)
    # NaN in 'pop_mode' corresponds to 0 workers, so replace it.
    # NOTE(review): fillna applies to every selected column, not only
    # 'pop_mode' -- confirm that is intended.
    .fillna(value=0)
)

modes_of_transportation.head()

Unnamed: 0,mode,mode_name,pop_mode,county_fips,race_eth_code
0,WALK,Walked to work,0.0,6003.0,5
1,ATHOME,Worked at home,54.0,6001.0,1
2,ATHOME,Worked at home,3001.0,6001.0,2
3,ATHOME,Worked at home,2287.0,6001.0,3
4,ATHOME,Worked at home,2080.0,6001.0,4


In [110]:
# Create a table that documents road traffic injuries
# and is linked to the counties/race_eth tables.
#
# We are only concerned with data corresponding to counties, so rows without
# a 'county_fips' value (other geotypes) are discarded. The expression is
# wrapped in parentheses because a comment cannot be placed inside a
# backslash-continued statement.
traffic_injuries = (
    traffic_injuries_df.loc[
        traffic_injuries_df['county_fips'].notna(),
        ['severity', 'injuries', 'poprate', 'mode', 'county_fips', 'race_eth_code'],
    ]
    # Renumber the rows from 0 and discard the old row labels
    .reset_index(drop=True)
    # Replace every remaining NaN with 0.
    # NOTE(review): this also turns a missing 'poprate' into a rate of 0.0
    # and fills non-numeric columns -- confirm that is intended.
    .fillna(value=0)
)
traffic_injuries.head()

Unnamed: 0,severity,injuries,poprate,mode,county_fips,race_eth_code
0,Killed,1.0,0.0,Vehicles,6057.0,9
1,Killed,7.0,0.0,All modes,6037.0,9
2,Severe Injury,44.0,0.0,All modes,6037.0,9
3,Severe Injury,1.0,0.0,Bicyclist,6037.0,9
4,Killed,3.0,0.0,Car/Pickup,6037.0,9


### Rename Tables and Columns 
- Make sure the DataFrame and column names match the SQL schema

In [123]:
# Final `counties` table: same data as counties_pop_joined with two columns
# renamed; rename() returns a new DataFrame, the source frame is untouched.
counties = counties_pop_joined.rename(
    columns={
        'county_fips': 'county_fips_code',
        'totalpop': 'totalpop (10^5)',
    }
)
counties.head()

Unnamed: 0,county_name,county_fips_code,region_code,totalpop (10^5)
0,Alameda,6001.0,1.0,4861.99
1,Alpine,6003.0,3.0,1.63
2,Amador,6005.0,3.0,64.62
3,Butte,6007.0,2.0,547.41
4,Calaveras,6009.0,3.0,73.69


In [124]:
# Expose the joined regions table under its final name. This binds a second
# name to the same DataFrame object; no copy is made.
regions = regions_joined

In [125]:
# Expose the joined race/ethnicity table under its final name. This binds a
# second name to the same DataFrame object; no copy is made.
ethnicity = race_eth_joined

In [126]:
# Give the transportation-mode columns their final names; 'race_eth_code'
# is not listed and therefore keeps its name.
modes_of_transportation = modes_of_transportation.rename(
    columns={
        'mode': 'transport_name',
        'mode_name': 'transport_description',
        'pop_mode': 'num_workers_per_transport',
        'county_fips': 'county_fips_code',
    }
)
modes_of_transportation.head()

Unnamed: 0,transport_name,transport_description,num_workers_per_transport,county_fips_code,race_eth_code
0,WALK,Walked to work,0.0,6003.0,5
1,ATHOME,Worked at home,54.0,6001.0,1
2,ATHOME,Worked at home,3001.0,6001.0,2
3,ATHOME,Worked at home,2287.0,6001.0,3
4,ATHOME,Worked at home,2080.0,6001.0,4


In [127]:
# Give the injury columns their final names; 'severity', 'injuries' and
# 'race_eth_code' are not listed and therefore keep their names.
# (The stray `traffic_injuries.columns` expression was removed: it was not
# the last statement of the cell, so it displayed nothing and had no effect.)
traffic_injuries = traffic_injuries.rename(
    columns={
        'poprate': 'rate_of_injuries_in_pop',
        'mode': 'injury_transport_mode',
        'county_fips': 'county_fips_code',
    }
)
traffic_injuries.head()

Unnamed: 0,severity,injuries,rate_of_injuries_in_pop,injury_transport_mode,county_fips_code,race_eth_code
0,Killed,1.0,0.0,Vehicles,6057.0,9
1,Killed,7.0,0.0,All modes,6037.0,9
2,Severe Injury,44.0,0.0,All modes,6037.0,9
3,Severe Injury,1.0,0.0,Bicyclist,6037.0,9
4,Killed,3.0,0.0,Car/Pickup,6037.0,9


## Load
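- I've pasted in the sample code from Austin's lecture as a starting point; the database, table, and DataFrame names still refer to the lecture example and need to be adapted to this project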

### Connect to local database

In [None]:
# Build the PostgreSQL connection URL and create the database engine.
# NOTE(review): `create_engine` is not imported in the visible code --
# presumably `from sqlalchemy import create_engine` is required; confirm.
# NOTE(review): the user name / password placeholders must be filled in, and
# 'customer_db' looks like a sample database name -- confirm the target.
rds_connection_string = "<insert user name>:<insert password>@localhost:5432/customer_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [None]:
# List the tables that exist in the connected database.
# NOTE(review): if `engine` is a SQLAlchemy Engine, `table_names()` is
# deprecated since 1.4 and removed in 2.0; the replacement is
# `sqlalchemy.inspect(engine).get_table_names()` -- confirm the installed version.
engine.table_names()

### Use pandas to load the CSV-derived DataFrames into the database

In [None]:
# Append the DataFrame's rows to the 'customer_name' table.
# The original line was cut off after `ind`; it is completed here as
# `index=True`, which is the pandas default (the DataFrame index is written
# as a column).
# NOTE(review): `new_customer_data_df` and 'customer_name' are not defined in
# the visible code -- presumably they are to be replaced by this notebook's
# own tables (counties, regions, ethnicity, ...); confirm.
new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=True)

### Query tables to confirm data has been added

In [None]:
# Read the 'customer_name' table back through the engine and show its first
# rows to confirm the load worked.
pd.read_sql_query('select * from customer_name', con=engine).head()