# Transportation in California Counties 

## Preliminaries

In [None]:
# Dependencies
import pylab as plt
import pandas as pd
import numpy as np
import os
import time
from sqlalchemy import create_engine

## Extract

In [None]:
# Each source CSV lives in the local "Resources" directory. low_memory=False
# makes pandas parse each file in one pass instead of in chunks, which avoids
# mixed-dtype inference within a column.

# Commute-mode dataset
traffic_modes_path = os.path.join("Resources", "transportation-to-work.csv")
traffic_modes_df = pd.read_csv(traffic_modes_path, low_memory=False)

# Road-traffic-injury dataset
traffic_injuries_path = os.path.join("Resources", "road-traffic-injuries.csv")
traffic_injuries_df = pd.read_csv(traffic_injuries_path, low_memory=False)

## Transform
### Group Datasets to Build Category Specific Tables

In [None]:
# Build reference tables organized by county, region, transportation mode,
# and race/ethnicity. Grouping on the identifying columns and calling size()
# yields one row per distinct key combination: the keys sit in the index and
# the row count lands in a single column labelled 0.

### Tables for traffic_modes_df
TM_counties_df = (
    traffic_modes_df
    .groupby(['county_name', 'county_fips', 'region_code'])
    .size()
    .to_frame()
)

TM_regions_df = (
    traffic_modes_df
    .groupby(['region_name', 'region_code'])
    .size()
    .to_frame()
)

TM_transport_modes_df = (
    traffic_modes_df
    .groupby(['mode', 'mode_name'])
    .size()
    .to_frame()
)

TM_race_eth_df = (
    traffic_modes_df
    .groupby(['race_eth_name', 'race_eth_code'])
    .size()
    .to_frame()
)

### Tables for traffic_injuries_df
TI_counties_df = (
    traffic_injuries_df
    .groupby(['county_name', 'county_fips', 'region_code'])
    .size()
    .to_frame()
)

TI_regions_df = (
    traffic_injuries_df
    .groupby(['region_name', 'region_code'])
    .size()
    .to_frame()
)

# The injuries data is grouped on 'mode' alone (no 'mode_name' key is used here).
TI_transport_modes_df = (
    traffic_injuries_df
    .groupby(['mode'])
    .size()
    .to_frame()
)

TI_race_eth_df = (
    traffic_injuries_df
    .groupby(['race_eth_name', 'race_eth_code'])
    .size()
    .to_frame()
)

In [None]:
# Strip the unnecessary count column from every reference table and turn the
# grouping keys back into ordinary columns.
df_list = [
    TM_counties_df,
    TM_regions_df,
    TM_transport_modes_df,
    TM_race_eth_df,
    TI_counties_df,
    TI_regions_df,
    TI_transport_modes_df,
    TI_race_eth_df,
]

for df in df_list:
    # Move the group keys out of the index; each table is modified in place.
    df.reset_index(inplace=True)
    # Column 0 holds the group sizes, which the reference tables do not need.
    df.drop(columns=[0], inplace=True)

In [None]:
# Join the traffic_modes tables to the traffic_injuries tables for every
# category except transportation modes. An outer join keeps key combinations
# that appear in either dataset.
counties_joined = TM_counties_df.merge(
    TI_counties_df,
    on=['county_name', 'county_fips', 'region_code'],
    how='outer',
)

regions_joined = TM_regions_df.merge(
    TI_regions_df,
    on=['region_name', 'region_code'],
    how='outer',
)

race_eth_joined = TM_race_eth_df.merge(
    TI_race_eth_df,
    on=['race_eth_name', 'race_eth_code'],
    how='outer',
)

In [None]:
# Total population per county; of the two datasets only traffic_injuries_df
# carries a 'totalpop' column.
# NOTE(review): this sums 'totalpop' over every row that shares a county_name.
# If the CSV repeats a county's population across rows (e.g. one row per
# mode / severity / race group), the result is a multiple of the real
# population - confirm against the data before relying on it.
total_pop_per_county = (
    traffic_injuries_df[['county_name', 'totalpop']]
    .groupby(['county_name'])
    .sum()
    # Bring 'county_name' back out of the index so it is a regular column.
    .reset_index()
)

total_pop_per_county.head()

In [None]:
# Attach the per-county population total to the joined counties table.
counties_pop_joined = counties_joined.merge(
    total_pop_per_county,
    on='county_name',
    how='outer',
)

# Express 'totalpop' in units of 100,000, rounded to two decimal places.
counties_pop_joined['totalpop'] = (
    counties_pop_joined['totalpop'] / (10**5)
).round(2)

counties_pop_joined.head()

### Filter Datasets for Desired Statistics

In [None]:
# Build the table that documents workers' modes of transportation; its
# 'county_fips' and 'race_eth_code' columns link it to the counties and
# race/ethnicity tables.
### Only county-level data is wanted, so rows without a 'county_fips' value
### (data for other geotypes) are discarded.
modes_of_transportation = (
    traffic_modes_df
    .loc[
        traffic_modes_df['county_fips'].notna(),
        ['mode', 'mode_name', 'pop_mode', 'county_fips', 'race_eth_code'],
    ]
    # Renumber the surviving rows from 0, discarding the old row labels.
    .reset_index(drop=True)
    # A NaN 'pop_mode' corresponds to 0 workers. Note that fillna is applied
    # to every selected column, not only to 'pop_mode'.
    .fillna(value=0)
)

modes_of_transportation.head()

In [None]:
# Build the table that documents road traffic injuries; its 'county_fips'
# and 'race_eth_code' columns link it to the counties and race/ethnicity
# tables.
### Only county-level data is wanted, so rows without a 'county_fips' value
### (data for other geotypes) are discarded.
traffic_injuries = (
    traffic_injuries_df
    .loc[
        traffic_injuries_df['county_fips'].notna(),
        ['severity', 'injuries', 'poprate', 'mode', 'county_fips', 'race_eth_code'],
    ]
    # Renumber the surviving rows from 0, discarding the old row labels.
    .reset_index(drop=True)
    # Replace every remaining NaN with 0; this applies to all selected
    # columns (severity, injuries, poprate, mode, county_fips, race_eth_code).
    .fillna(value=0)
)

traffic_injuries.head()

### Rename Tables and Columns 
- Make sure DataFrame and column names match the SQL schema

In [None]:
# Final 'counties' table: rename() returns a new DataFrame, so
# counties_pop_joined itself is left unchanged.
counties = counties_pop_joined.rename(
    columns={
        'county_fips': 'county_fips_code',
        'totalpop': 'totalpop',  # identity mapping: the column keeps its name
    },
)

counties.head()

In [None]:
# Expose the joined regions table under the name 'regions', the table name
# used when it is loaded into the database. This is a plain assignment, so
# both names refer to the same DataFrame object (no copy is made).
regions = regions_joined
# Preview the first rows.
regions.head()

In [None]:
# Expose the joined race/ethnicity table under the name 'ethnicity', the
# table name used when it is loaded into the database. This is a plain
# assignment, so both names refer to the same DataFrame object (no copy).
ethnicity = race_eth_joined

In [None]:
# Rename the commute-mode columns; columns not listed in the mapping
# (e.g. 'race_eth_code') keep their current names.
modes_of_transportation = modes_of_transportation.rename(
    columns={
        'mode': 'transport_name',
        'mode_name': 'transport_description',
        'pop_mode': 'num_workers_per_transport',
        'county_fips': 'county_fips_code',
    },
)

modes_of_transportation.head()

In [None]:
# Rename the injury columns; columns not listed in the mapping
# ('severity', 'injuries', 'race_eth_code') keep their current names.
# (The stray `traffic_injuries.columns` expression that used to open this
# cell was removed: it was not the cell's last expression, so it displayed
# nothing and had no effect.)
traffic_injuries = traffic_injuries.rename(
    columns={
        'poprate': 'rate_of_injuries_in_pop',
        'mode': 'injury_transport_mode',
        'county_fips': 'county_fips_code',
    },
)

traffic_injuries.head()

## Load

### Connect to local database

In [None]:
# Credentials for the DB connection, in "username:password" form.
# They are read from the ETL_DB_CREDENTIALS environment variable so that real
# secrets do not have to be saved inside the notebook. When the variable is
# not set, the original placeholder is used and must be replaced by hand.
db_credentials = os.environ.get("ETL_DB_CREDENTIALS", "<USERNAME:PASSWORD>")

# Host, port and database name of the local PostgreSQL instance.
rds_connection_string = f"{db_credentials}@localhost:5432/etl_project2"
engine = create_engine(f'postgresql://{rds_connection_string}')


### Check for tables

In [None]:
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0.
# The Inspector API returns the same list of table names and is available on
# both old and new SQLAlchemy versions.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

### Use pandas to load the CSV-derived DataFrames into the database

In [None]:
# Append the rows of `regions` to the 'regions' table. The DataFrame index is
# not written, and re-running this cell inserts the same rows again.
regions.to_sql(
    name='regions',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the rows of `counties` to the 'counties' table. The DataFrame index
# is not written, and re-running this cell inserts the same rows again.
counties.to_sql(
    name='counties',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the rows of `ethnicity` to the 'ethnicity' table. The DataFrame index
# is not written, and re-running this cell inserts the same rows again.
ethnicity.to_sql(
    name='ethnicity',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the rows of `traffic_injuries` to the 'traffic_injuries' table. The
# DataFrame index is not written, and re-running this cell inserts the same
# rows again.
traffic_injuries.to_sql(
    name='traffic_injuries',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the rows of `modes_of_transportation` to the
# 'modes_of_transportation' table. The DataFrame index is not written, and
# re-running this cell inserts the same rows again.
modes_of_transportation.to_sql(
    name='modes_of_transportation',
    con=engine,
    if_exists='append',
    index=False,
)

### Query tables to confirm data has been added

In [None]:
# Read the 'regions' table back from the database and preview its first rows
# to confirm the load succeeded.
pd.read_sql_query(
    'select * from regions',
    con=engine,
).head()