In [15]:
# import dependencies
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

from config import pg_password

## Read the CSV into a DataFrame and export it to JSON

In [16]:
# Load the source CSV into a DataFrame
raw_csv_path = r"date,county,state,fips,cases,deaths.csv"
covid_data_df = pd.read_csv(raw_csv_path)

# Write the same data back out as JSON, then display the frame
json_out_path = r"covid_data.json"
covid_data_df.to_json(path_or_buf=json_out_path)
covid_data_df

Unnamed: 0,date,county,State,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
785515,2020-12-01,Ontonagon,Michigan,26131.0,280,14.0
785516,2020-12-01,Osceola,Michigan,26133.0,559,10.0
785517,2020-12-01,Oscoda,Michigan,26135.0,190,8.0
785518,2020-12-01,Otsego,Michigan,26137.0,874,18.0


## Clean and organize data

In [17]:
# Keep only the rows whose State column is exactly "California"
is_california = covid_data_df["State"].eq("California")
covid_cali_df = covid_data_df[is_california]
covid_cali_df.head()

Unnamed: 0,date,county,State,fips,cases,deaths
5,2020-01-25,Orange,California,6059.0,1,0.0
9,2020-01-26,Los Angeles,California,6037.0,1,0.0
10,2020-01-26,Orange,California,6059.0,1,0.0
14,2020-01-27,Los Angeles,California,6037.0,1,0.0
15,2020-01-27,Orange,California,6059.0,1,0.0


In [18]:
# List the distinct county values present in the California frame
covid_cali_df["county"].unique()

array(['Orange', 'Los Angeles', 'Santa Clara', 'San Francisco',
       'San Diego', 'Humboldt', 'Sacramento', 'Solano', 'Marin', 'Napa',
       'Sonoma', 'Alameda', 'Placer', 'San Mateo', 'Contra Costa', 'Yolo',
       'Fresno', 'Madera', 'Riverside', 'Santa Cruz', 'Shasta',
       'San Joaquin', 'Ventura', 'Stanislaus', 'Tulare', 'San Benito',
       'San Luis Obispo', 'San Bernardino', 'Santa Barbara', 'Nevada',
       'Kern', 'Monterey', 'Mendocino', 'Amador', 'Imperial', 'Butte',
       'El Dorado', 'Siskiyou', 'Yuba', 'Unknown', 'Calaveras', 'Merced',
       'Mono', 'Inyo', 'Sutter', 'Colusa', 'Kings', 'Glenn', 'Tuolumne',
       'Alpine', 'Plumas', 'Del Norte', 'Tehama', 'Lake', 'Mariposa',
       'Trinity', 'Sierra', 'Lassen', 'Modoc'], dtype=object)

In [19]:
# Drop the rows whose county is the placeholder value 'Unknown'
has_known_county = covid_cali_df["county"].ne('Unknown')
clean_covid_df = covid_cali_df.loc[has_known_county]
clean_covid_df.head()

Unnamed: 0,date,county,State,fips,cases,deaths
5,2020-01-25,Orange,California,6059.0,1,0.0
9,2020-01-26,Los Angeles,California,6037.0,1,0.0
10,2020-01-26,Orange,California,6059.0,1,0.0
14,2020-01-27,Los Angeles,California,6037.0,1,0.0
15,2020-01-27,Orange,California,6059.0,1,0.0


In [20]:
# Count the distinct counties left after cleaning (California has 58)
clean_covid_df["county"].nunique()

58

## Create DataFrames for SoCal and NorCal counties

In [21]:
# Counties this notebook treats as Southern California
socalArray = [
    "Inyo",
    "Kern",
    "San Luis Obispo",
    "Santa Barbara",
    "Ventura",
    "Los Angeles",
    "San Bernardino",
    "Orange",
    "Riverside",
    "San Diego",
    "Imperial",
]

# Build a boolean mask of rows whose county is in the list and keep only those rows
in_socal = clean_covid_df["county"].isin(socalArray)
socal_data_df = clean_covid_df[in_socal]
socal_data_df.head()

Unnamed: 0,date,county,State,fips,cases,deaths
5,2020-01-25,Orange,California,6059.0,1,0.0
9,2020-01-26,Los Angeles,California,6037.0,1,0.0
10,2020-01-26,Orange,California,6059.0,1,0.0
14,2020-01-27,Los Angeles,California,6037.0,1,0.0
15,2020-01-27,Orange,California,6059.0,1,0.0


In [22]:
# Distinct-value count per column; the `county` entry should read 11 for SoCal
socal_data_df.nunique(axis=0)

date       312
county      11
State        1
fips        11
cases     2332
deaths     905
dtype: int64

In [23]:
# Northern California is every county that is not in socalArray, so select the
# complement of the SoCal mask directly.
# This replaces an inner merge that passed both left_on and left_index=True
# (a combination recent pandas versions reject with MergeError) and then dropped
# rows from clean_covid_df by the merged frame's index.
norcal_data_df = clean_covid_df.loc[~clean_covid_df["county"].isin(socalArray)]
norcal_data_df.head()

Unnamed: 0,date,county,State,fips,cases,deaths
36,2020-01-31,Santa Clara,California,6085.0,1,0.0
42,2020-02-01,Santa Clara,California,6085.0,1,0.0
49,2020-02-02,San Francisco,California,6075.0,2,0.0
50,2020-02-02,Santa Clara,California,6085.0,2,0.0
57,2020-02-03,San Francisco,California,6075.0,2,0.0


In [24]:
# Distinct-value count per column; the `county` entry should read 47 for NorCal
norcal_data_df.nunique(axis=0)

date       306
county      47
State        1
fips        47
cases     4297
deaths     465
dtype: int64

## Make connection to PostgreSQL (managed through pgAdmin)

In [25]:
# Optional check, left disabled: uncomment to display the column dtypes of clean_covid_df.
#clean_covid_df.dtypes

In [26]:
# Create the SQLAlchemy engine for the covid_mask_effect_db database on the local
# PostgreSQL server (user "postgres", port 5432).
# The password is imported from the config module rather than hardcoded here, and is
# percent-encoded so characters such as "@", ":" or "/" cannot corrupt the URL.
connection_string = f"postgres:{quote_plus(pg_password)}@localhost:5432/covid_mask_effect_db"
engine = create_engine(f'postgresql://{connection_string}')

In [27]:
# List the tables that currently exist in the connected database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector returned by inspect(engine) gives the same list of names.
inspect(engine).get_table_names()

[]

## Send DataFrames to PostgreSQL

In [28]:
# Upload the cleaned all-California frame to the cali_data table (DataFrame index not written).
# if_exists='replace' keeps this cell idempotent: with 'append', every re-run of the
# notebook would add another full copy of the rows to the table.
clean_covid_df.to_sql(name='cali_data', con=engine, if_exists='replace', index=False)

In [29]:
# Upload the Southern California frame to the socal_data table (DataFrame index not written).
# if_exists='replace' keeps this cell idempotent: with 'append', every re-run of the
# notebook would add another full copy of the rows to the table.
socal_data_df.to_sql(name='socal_data', con=engine, if_exists='replace', index=False)

In [30]:
# Upload the Northern California frame to the nocal_data table (DataFrame index not written).
# if_exists='replace' keeps this cell idempotent: with 'append', every re-run of the
# notebook would add another full copy of the rows to the table.
norcal_data_df.to_sql(name='nocal_data', con=engine, if_exists='replace', index=False)

## Read the tables back from PostgreSQL to confirm the data was received correctly

In [31]:
# Read every row of cali_data back from the database to check the upload
cali_query = 'SELECT * FROM cali_data'
pd.read_sql_query(cali_query, con=engine)

Unnamed: 0,date,county,State,fips,cases,deaths
0,2020-01-25,Orange,California,6059.0,1,0.0
1,2020-01-26,Los Angeles,California,6037.0,1,0.0
2,2020-01-26,Orange,California,6059.0,1,0.0
3,2020-01-27,Los Angeles,California,6037.0,1,0.0
4,2020-01-27,Orange,California,6059.0,1,0.0
...,...,...,...,...,...,...
15047,2020-12-01,Tulare,California,6107.0,21332,312.0
15048,2020-12-01,Tuolumne,California,6109.0,1104,9.0
15049,2020-12-01,Ventura,California,6111.0,20308,185.0
15050,2020-12-01,Yolo,California,6113.0,4930,77.0


In [32]:
# Read every row of socal_data back from the database to check the upload
socal_query = 'SELECT * FROM socal_data'
pd.read_sql_query(socal_query, con=engine)

Unnamed: 0,date,county,State,fips,cases,deaths
0,2020-01-25,Orange,California,6059.0,1,0.0
1,2020-01-26,Los Angeles,California,6037.0,1,0.0
2,2020-01-26,Orange,California,6059.0,1,0.0
3,2020-01-27,Los Angeles,California,6037.0,1,0.0
4,2020-01-27,Orange,California,6059.0,1,0.0
...,...,...,...,...,...,...
3007,2020-12-01,San Bernardino,California,6071.0,94106,1129.0
3008,2020-12-01,San Diego,California,6073.0,83484,1019.0
3009,2020-12-01,San Luis Obispo,California,6079.0,6345,38.0
3010,2020-12-01,Santa Barbara,California,6083.0,11631,137.0


In [33]:
# Read every row of nocal_data back from the database to check the upload
nocal_query = 'SELECT * FROM nocal_data'
pd.read_sql_query(nocal_query, con=engine)

Unnamed: 0,date,county,State,fips,cases,deaths
0,2020-01-31,Santa Clara,California,6085.0,1,0.0
1,2020-02-01,Santa Clara,California,6085.0,1,0.0
2,2020-02-02,San Francisco,California,6075.0,2,0.0
3,2020-02-02,Santa Clara,California,6085.0,2,0.0
4,2020-02-03,San Francisco,California,6075.0,2,0.0
...,...,...,...,...,...,...
12035,2020-12-01,Trinity,California,6105.0,163,0.0
12036,2020-12-01,Tulare,California,6107.0,21332,312.0
12037,2020-12-01,Tuolumne,California,6109.0,1104,9.0
12038,2020-12-01,Yolo,California,6113.0,4930,77.0


In [None]:
# Export the cleaned all-California frame to CSV.
# index=False matches the to_sql uploads: the index only holds row labels left over
# from filtering the full dataset, and writing it would add an unnamed first column
# to the file.
clean_covid_df.to_csv('../output/clean_covid_df.csv', index=False)

In [None]:
# Export the Southern California frame to CSV.
# index=False matches the to_sql uploads: the index only holds row labels left over
# from filtering the full dataset, and writing it would add an unnamed first column
# to the file.
socal_data_df.to_csv('../output/socal_data_df.csv', index=False)

In [None]:
# Export the Northern California frame to CSV.
# index=False matches the to_sql uploads: the index only holds row labels left over
# from filtering the full dataset, and writing it would add an unnamed first column
# to the file.
norcal_data_df.to_csv('../output/norcal_data_df.csv', index=False)