In [1]:
# Import dependencies
import requests
import json
import ijson

In [2]:
# get the The dataset from the json url
r=requests.get('https://data.cdc.gov/api/views/ks3g-spdg/rows.json?')
data_json = r.json()

#preview json contents
# print(data_json)

In [3]:
# determine type of json object
print(type(data_json))

<class 'dict'>


In [4]:
# use json module to clean the data and transform it into a more readeable format
#data_str = json.dumps(data_json, indent=4, sort_keys=True)
data_str = json.dumps(data_json, indent=2)
# print(data_str)

In [5]:
print(data_json.keys())

dict_keys(['meta', 'data'])


In [6]:
data_json['meta']["view"]['columns']

[{'id': -1,
  'name': 'sid',
  'dataTypeName': 'meta_data',
  'fieldName': ':sid',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'id',
  'dataTypeName': 'meta_data',
  'fieldName': ':id',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'position',
  'dataTypeName': 'meta_data',
  'fieldName': ':position',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'created_at',
  'dataTypeName': 'meta_data',
  'fieldName': ':created_at',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'created_meta',
  'dataTypeName': 'meta_data',
  'fieldName': ':created_meta',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'updated_at',
  'dataTypeName': 'meta_data',
  'fieldName': ':updated_at'

In [8]:
import json
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data_json, f, ensure_ascii=False, indent=4)

In [9]:

import ijson
# specify file path we want to extract data from, then specify the keys to extract:
filename = "data.json"
with open(filename, 'r') as f:
    objects = ijson.items(f, 'meta.view.columns.item')
    columns = list(objects)
    print(columns[0])

{'id': -1, 'name': 'sid', 'dataTypeName': 'meta_data', 'fieldName': ':sid', 'position': 0, 'renderTypeName': 'meta_data', 'format': {}, 'flags': ['hidden']}


In [10]:
# extract a list of the columns
column_names = [col["fieldName"] for col in columns]
print (column_names)

[':sid', ':id', ':position', ':created_at', ':created_meta', ':updated_at', ':updated_meta', ':meta', 'data_as_of', 'start_week', 'end_week', 'state', 'age_group', 'race_and_hispanic_origin', 'covid_19_deaths', 'total_deaths', 'pneumonia_deaths', 'pneumonia_and_covid_19_deaths', 'influenza_deaths', 'pneumonia_influenza_or_covid', 'footnote']


In [11]:
import time
# filter data for the data fields we want
# we can use the column names we just extracted to grab the columns that are relevant.
data_columns = [
    "state",
    "age_group",    
    "race_and_hispanic_origin",
    "covid_19_deaths",
    "pneumonia_deaths",
    "influenza_deaths",
    "total_deaths",    
   ]

data = []
filename = "data.json"
with open(filename, 'r') as f:
    objects = ijson.items(f, 'data.item')
    for row in objects:
        selected_row = []
        for item in data_columns:
            selected_row.append(row[column_names.index(item)])
            data.append(selected_row)
time.sleep(5)
# data

In [12]:
data[0]

['United States',
 'All Ages',
 'All Race-Hisp',
 '54861',
 '78266',
 '6110',
 '835607']

In [13]:
# Convert to DataFrame
import pandas as pd

Covid_df = pd.DataFrame(data, columns=data_columns)

In [14]:
# Column Reordering
covid_df = Covid_df.rename(columns={
    "State": "state",
    "Age_Group": "age_group", 
    "Race_and_Hispanic_Origin": "race_and_hispanic_origin",
    "Covid_19_Deaths": "covid_19_deaths",
    "Pneumonia_Deaths": "pneumonia_deaths",
    "influenza_deaths": "influenza_deaths",
    "Total_Deaths": "total_deaths"
})

covid_df.head(20)

Unnamed: 0,state,age_group,race_and_hispanic_origin,covid_19_deaths,pneumonia_deaths,influenza_deaths,total_deaths
0,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
1,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
2,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
3,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
4,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
5,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
6,United States,All Ages,All Race-Hisp,54861,78266,6110,835607
7,United States,All Ages,Non-Hispanic White,28701,52954,4077,620534
8,United States,All Ages,Non-Hispanic White,28701,52954,4077,620534
9,United States,All Ages,Non-Hispanic White,28701,52954,4077,620534


In [15]:
# Create a filtered dataframe from specific columns
# covid_df.drop('United States') 
covid_df = covid_df[covid_df.state != 'United States']
covid_df.reset_index(drop= True, inplace=True)
covid_df

Unnamed: 0,state,age_group,race_and_hispanic_origin,covid_19_deaths,pneumonia_deaths,influenza_deaths,total_deaths
0,Alabama,Under 1 year,Non-Hispanic White,0,0,0,27
1,Alabama,Under 1 year,Non-Hispanic White,0,0,0,27
2,Alabama,Under 1 year,Non-Hispanic White,0,0,0,27
3,Alabama,Under 1 year,Non-Hispanic White,0,0,0,27
4,Alabama,Under 1 year,Non-Hispanic White,0,0,0,27
...,...,...,...,...,...,...,...
32643,Puerto Rico,85 years and over,Unknown,0,0,0,
32644,Puerto Rico,85 years and over,Unknown,0,0,0,
32645,Puerto Rico,85 years and over,Unknown,0,0,0,
32646,Puerto Rico,85 years and over,Unknown,0,0,0,


In [16]:
# Save as a csv
# Note to avoid any issues later, use encoding="utf-8"
covid_df.to_csv("covid_data_race.csv", encoding="utf-8", index=False)

# CSV DATA

In [17]:
import pandas as pd
from sqlalchemy import create_engine
import requests

In [18]:
csv_file = r"C:\Users\micha\OneDrive\Desktop\Proj\novel-corona-virus-2019-dataset\COVID-19.csv"
covid_data_df = pd.read_csv(csv_file)
covid_data_df.head()

Unnamed: 0,Data as of,Start week,End Week,State,Sex,Age group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
0,5/6/2020,2/1/2020,5/2/2020,Alabama,Male,Under 1 year,0.0,39.0,0.0,0.0,,,
1,5/6/2020,2/1/2020,5/2/2020,Alabama,Male,1-4 years,0.0,,0.0,0.0,0.0,0.0,
2,5/6/2020,2/1/2020,5/2/2020,Alabama,Male,5-14 years,0.0,17.0,0.0,0.0,0.0,0.0,
3,5/6/2020,2/1/2020,5/2/2020,Alabama,Male,15-24 years,0.0,95.0,,0.0,,,
4,5/6/2020,2/1/2020,5/2/2020,Alabama,Male,25-34 years,,169.0,,0.0,,,


In [19]:
# Drop Columns

new_covid_data_df = covid_data_df[['State', 'Sex', 'Age group', 'COVID-19 Deaths', 'Pneumonia Deaths', 'Influenza Deaths', 'Total Deaths']].copy()
new_covid_data_df.head()

Unnamed: 0,State,Sex,Age group,COVID-19 Deaths,Pneumonia Deaths,Influenza Deaths,Total Deaths
0,Alabama,Male,Under 1 year,0.0,0.0,,39.0
1,Alabama,Male,1-4 years,0.0,0.0,0.0,
2,Alabama,Male,5-14 years,0.0,0.0,0.0,17.0
3,Alabama,Male,15-24 years,0.0,,,95.0
4,Alabama,Male,25-34 years,,,,169.0


In [20]:
# Rename Columns

covid_data_df2 = new_covid_data_df.rename(columns={"State": "state",
                                                         "Sex": "sex",
                                                         "Age group": "age",
                                                         "COVID-19 Deaths": "covid19_deaths",
                                                         "Pneumonia Deaths": "pneumonia_deaths",
                                                         "Influenza Deaths": "influenza_deaths",
                                                         "Total Deaths": "total_deaths"})
covid_data_df2.head()

Unnamed: 0,state,sex,age,covid19_deaths,pneumonia_deaths,influenza_deaths,total_deaths
0,Alabama,Male,Under 1 year,0.0,0.0,,39.0
1,Alabama,Male,1-4 years,0.0,0.0,0.0,
2,Alabama,Male,5-14 years,0.0,0.0,0.0,17.0
3,Alabama,Male,15-24 years,0.0,,,95.0
4,Alabama,Male,25-34 years,,,,169.0


In [21]:
# Cleaning data

covid_data_df2.fillna(0)

Unnamed: 0,state,sex,age,covid19_deaths,pneumonia_deaths,influenza_deaths,total_deaths
0,Alabama,Male,Under 1 year,0.0,0.0,0.0,39.0
1,Alabama,Male,1-4 years,0.0,0.0,0.0,0.0
2,Alabama,Male,5-14 years,0.0,0.0,0.0,17.0
3,Alabama,Male,15-24 years,0.0,0.0,0.0,95.0
4,Alabama,Male,25-34 years,0.0,0.0,0.0,169.0
...,...,...,...,...,...,...,...
1411,0,0,0,0.0,0.0,0.0,0.0
1412,0,0,0,0.0,0.0,0.0,0.0
1413,0,0,0,0.0,0.0,0.0,0.0
1414,0,0,0,0.0,0.0,0.0,0.0


In [22]:
#Create database connection
from sqlalchemy import create_engine

connection_string = "postgres:ophellia@localhost:5432/Total_Deaths_DB"
engine = create_engine(f'postgresql://{connection_string}')

In [23]:
#Load Json into database
covid_df.to_sql(name='covid19', con=engine, if_exists='append', index=False)

In [24]:
# Use pandas to load csv converted DataFrame into database

covid_data_df2.to_sql(name='covid_death', con=engine, if_exists='append', index=False)