In [13]:
import pandas as pd
import pandasql as ps
import datetime

import os

cwd = os.getcwd()
filepath = cwd + '/data/combined_ts.csv'
populationfilepath = cwd + '/data/populationdata.csv'
outputfilepath = cwd + '/data/coronavirus_final.csv'


df = pd.read_csv(filepath, delimiter=',')
population_df = pd.read_csv(populationfilepath, delimiter='|')

# Replace days with no results with 0s
df = df.fillna(0)

## Add datetime and date fields
df['datetime'] = pd.to_datetime(df['seconds_since_Epoch'], unit='s')
df['date'] = df['datetime'].dt.date




In [14]:
# Make it so there is only 1 record per day for each state 
query0 = """
Select abbr, date
    ,max(tested) as tested
    ,max(positive) as positive
    ,max(deaths) as deaths
    ,count(*) as daily_records
From df
group by abbr, date
"""

cleaned = ps.sqldf(query0, locals())

cleaned.head()


Unnamed: 0,abbr,date,tested,positive,deaths,daily_records
0,AK,2020-03-08,23.0,0.0,0.0,2
1,AK,2020-03-09,32.0,0.0,0.0,1
2,AK,2020-03-10,47.0,0.0,0.0,1
3,AK,2020-03-12,60.0,0.0,0.0,1
4,AK,2020-03-13,144.0,1.0,0.0,2


In [15]:
# Now we need to create a date table and join it to the population table
# We need to do this to ensure every state has a record for each day. 
now = datetime.date.today()
current_date = now.strftime("%Y-%m-%d")
date_df = pd.DataFrame({"Date": pd.date_range('2020-03-08',current_date)})
date_df['Date'] = date_df['Date'].dt.strftime("%Y-%m-%d")

# cross join these two tables
query_pop = """
select * from population_df, date_df
"""
population_cleaned = ps.sqldf(query_pop, locals())

population_cleaned.head()

Unnamed: 0,abbr,state,population,Date
0,AL,Alabama,4903185,2020-03-08
1,AL,Alabama,4903185,2020-03-09
2,AL,Alabama,4903185,2020-03-10
3,AL,Alabama,4903185,2020-03-11
4,AL,Alabama,4903185,2020-03-12


In [18]:
# Adding population to the Dataset and calculating the per million values
#pm means per million
# if a day doesn't have values for tested, positive, or deaths, we replace them with the previous days values
query1 = """
SELECT q2.abbr
    , q2.date
    , coalesce(q1.daily_records,0)
    , coalesce(q1.tested, lag(q1.tested,1) over (partition by q2.abbr order by q2.date )) as tested 
    , coalesce(q1.positive, lag(q1.positive,1) over (partition by q2.abbr order by q2.date )) as positive 
    , coalesce(q1.deaths, lag(q1.deaths,1) over (partition by q2.abbr order by q2.date )) as deaths 
    , q2.population 
    , 1000000/q2.population as multipliler
    , coalesce(q1.tested, lag(q1.tested,1) over (partition by q2.abbr order by q2.date )) * 1000000/q2.population as tested_pm
    , coalesce(q1.positive, lag(q1.positive,1) over (partition by q2.abbr order by q2.date )) * 1000000/q2.population as positive_pm
    , coalesce(q1.deaths, lag(q1.deaths,1) over (partition by q2.abbr order by q2.date )) * 1000000/q2.population as deaths_pm
FROM population_cleaned as q2
left outer join cleaned as q1
on q1.abbr = q2.abbr and q1.date=q2.date
"""

#creates the df from query 1 
result = ps.sqldf(query1, locals())

result.head()


Unnamed: 0,abbr,Date,"coalesce(q1.daily_records,0)",tested,positive,deaths,population,multipliler,tested_pm,positive_pm,deaths_pm
0,AK,2020-03-08,2,23.0,0.0,0.0,731545,1,31.440308,0.0,0.0
1,AK,2020-03-09,1,32.0,0.0,0.0,731545,1,43.743037,0.0,0.0
2,AK,2020-03-10,1,47.0,0.0,0.0,731545,1,64.247586,0.0,0.0
3,AK,2020-03-11,0,47.0,0.0,0.0,731545,1,64.247586,0.0,0.0
4,AK,2020-03-12,1,60.0,0.0,0.0,731545,1,82.018194,0.0,0.0
