In [301]:
"""
Author: Seung Won Joeng
Modifier: Kwanyeob Jung, Jaeshin Cho
"""
import numpy as np
import pandas as pd
import datetime as dt
%matplotlib inline                                  
import matplotlib.pyplot as plt


In [302]:
# Read csv from current path.
def read_csv(symptoms_path='2020_US_weekly_symptoms_dataset.csv',
             aggregated_path='aggregated_cc_by.csv'):
    """Load the two input CSV files into DataFrames.

    Parameters
    ----------
    symptoms_path : str or path-like, optional
        Location of the weekly symptoms CSV. Defaults to
        '2020_US_weekly_symptoms_dataset.csv', a relative path.
    aggregated_path : str or path-like, optional
        Location of the aggregated CSV. Defaults to 'aggregated_cc_by.csv',
        a relative path.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The symptoms frame followed by the aggregated frame.
    """
    df1 = pd.read_csv(symptoms_path)
    df2 = pd.read_csv(aggregated_path)
    return df1, df2

# Load both datasets, then report the type and dimensions of each frame.
df1, df2 = read_csv()
print('Type of df1: ', type(df1))
print('Type of df2: ', type(df2))
print('Shape of df1: ', df1.shape)
print('Shape of df2: ', df2.shape)

Type of df1:  <class 'pandas.core.frame.DataFrame'>
Type of df2:  <class 'pandas.core.frame.DataFrame'>
Shape of df1:  (624, 430)
Shape of df2:  (98434, 62)


  if (await self.run_code(code, result,  async_=asy)):


In [303]:
# 2020_US_weekly_symptoms_dataset.csv only covers regions in the USA,
# so keep only the records for those regions from the aggregated dataframe.
def extract_from_aggregated(df1, df2):
    """Restrict df2 to the regions that appear in df1.

    Rows of df2 are kept when their 'open_covid_region_code' is one of the
    codes found in df1's 'open_covid_region_code' column. The result is
    re-indexed from 0 and narrowed to four columns: region code, region
    name, date and 'hospitalized_new'.
    """
    wanted_columns = ['open_covid_region_code', 'region_name', 'date', 'hospitalized_new']
    known_codes = df1.open_covid_region_code.unique()
    in_scope = df2['open_covid_region_code'].isin(known_codes)
    return df2.loc[in_scope, wanted_columns].reset_index(drop=True)

# Keep only the regions present in the symptoms data, then preview the result.
df2 = extract_from_aggregated(df1, df2)
print('Shape of df2: ', df2.shape)
print(df2.head(100))


Shape of df2:  (3458, 4)
   open_covid_region_code region_name        date  hospitalized_new
0                   US-WY     Wyoming  2020/03/07               0.0
1                   US-WY     Wyoming  2020/03/08               0.0
2                   US-WY     Wyoming  2020/03/09               0.0
3                   US-WY     Wyoming  2020/03/10               0.0
4                   US-WY     Wyoming  2020/03/11               0.0
5                   US-WY     Wyoming  2020/03/12               0.0
6                   US-WY     Wyoming  2020/03/13               0.0
7                   US-WY     Wyoming  2020/03/14               0.0
8                   US-WY     Wyoming  2020/03/15               0.0
9                   US-WY     Wyoming  2020/03/16               0.0
10                  US-WY     Wyoming  2020/03/17               0.0
11                  US-WY     Wyoming  2020/03/18               0.0
12                  US-WY     Wyoming  2020/03/19               0.0
13                  US-

In [304]:
# Drop columns and rows that have too few non-null values (thresholds are 3% of the row and column counts).
def clean_dataframe(df, thresh_frac=0.03):
    """Drop sparse columns, then sparse rows, from a DataFrame.

    Both thresholds are derived from the shape of ``df`` itself, measured
    before anything is dropped:

    * a column is kept only if it has at least
      ``int(num_rows * thresh_frac)`` non-null values;
    * a row is kept only if it has at least
      ``int(num_cols * thresh_frac)`` non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    thresh_frac : float, optional
        Fraction of the row/column count used as the minimum number of
        non-null values. Defaults to 0.03.

    Returns
    -------
    pandas.DataFrame
        A new frame with sparse columns and rows removed and the index
        reset to 0..n-1.
    """
    # Use the frame being cleaned, not a global, to size the thresholds.
    num_rows, num_cols = df.shape
    thresh_rows = int(num_rows * thresh_frac)
    thresh_cols = int(num_cols * thresh_frac)

    # Columns first, then rows; the row threshold still refers to the
    # column count measured before any column was dropped.
    df = df.dropna(axis=1, thresh=thresh_rows)
    df = df.dropna(axis=0, thresh=thresh_cols)
    df = df.reset_index(drop=True)
    return df

# df1 = clean_dataframe(df1)
# df2 = clean_dataframe(df2)

# print('Shape of df1: ', df1.shape);
# print('Shape of df2: ', df2.shape);
# print(df2)

In [308]:
def convert_to_datetime(df1, df2):
    """Parse the 'date' column of both frames into datetimes.

    Each frame's 'date' column is overwritten in place with the result of
    pd.to_datetime, and the same two frame objects are handed back.
    """
    for frame in (df1, df2):
        frame['date'] = pd.to_datetime(frame.date)
    return df1, df2

def daily_to_weekly(df2):
    """Aggregate daily 'hospitalized_new' values into weekly sums per region.

    Every date is moved back by 7 days and the rows are then grouped by
    'open_covid_region_code' and by weekly bins anchored on Monday
    (freq='W-MON'). 'hospitalized_new' is summed within each group.

    The shift is applied to a copy, so the caller's frame is left
    untouched and calling the function repeatedly on the same frame gives
    the same result.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Daily records with the columns 'open_covid_region_code', 'date'
        (datetime dtype) and 'hospitalized_new'.

    Returns
    -------
    pandas.DataFrame
        One row per (region, week) with the columns
        'open_covid_region_code', 'date' and 'hospitalized_new', sorted by
        region and then date.
    """
    # NOTE(review): with pandas' default right-closed, right-labelled weekly
    # bins, a row labelled with Monday M appears to sum the daily records
    # from Tuesday M+1 through Monday M+7 — confirm that this is the window
    # intended to line up with the weekly symptoms dates.
    shifted = df2.assign(date=df2['date'] - pd.to_timedelta(7, unit='D'))
    weekly = (
        shifted
        .groupby(['open_covid_region_code', pd.Grouper(key='date', freq='W-MON')])['hospitalized_new']
        .sum()
        .reset_index()
        .sort_values(['open_covid_region_code', 'date'])
    )
    return weekly

def merge_two_dfs(df1, df2):
    """Inner-join df1 and df2 on region code and date.

    Only (open_covid_region_code, date) pairs present in both frames
    survive the join.
    """
    join_keys = ['open_covid_region_code', 'date']
    return pd.merge(df1, df2, how='inner', on=join_keys)

# Parse the date columns, collapse the daily hospitalization records into
# weekly totals, and join them onto the weekly symptoms data.
df1, df2 = convert_to_datetime(df1, df2)
df2 = daily_to_weekly(df2)
df_final = merge_two_dfs(df1, df2)
print(df_final.head(10))

print('--------------------------------')

# Drop sparse columns/rows from the merged frame and preview it again.
df_final = clean_dataframe(df_final)
print(df_final.head(10))

  open_covid_region_code country_region_code country_region sub_region_1  \
0                  US-AK                  US  United States       Alaska   
1                  US-AK                  US  United States       Alaska   
2                  US-AK                  US  United States       Alaska   
3                  US-AK                  US  United States       Alaska   
4                  US-AK                  US  United States       Alaska   
5                  US-AK                  US  United States       Alaska   
6                  US-AK                  US  United States       Alaska   
7                  US-AK                  US  United States       Alaska   
8                  US-AK                  US  United States       Alaska   
9                  US-AK                  US  United States       Alaska   

  sub_region_1_code  sub_region_2  sub_region_2_code       date  \
0             US-AK           NaN                NaN 2020-02-10   
1             US-AK          

In [291]:
"""
Function merge_data takes two dataframes and merges them into a single dataframe of weekly records.
Parameter df1 corresponds to 2020_US_weekly_symptoms_dataset.csv, which contains weekly records.
Parameter df2 corresponds to aggregated_cc_by.csv, which contains daily records.
"""

# def same_week(d1, d2):
#     return (d1.isocalendar()[1] == d2.isocalendar()[1])

# def convert_datetime(df1, df2):
#     df1['date'] = pd.to_datetime(df1.date)
#     df2['date'] = pd.to_datetime(df2.date)
#     return df1,df2

# def daily_to_weekly(df2):
#     df2['date'] = df2['date'] - pd.to_timedelta(7, unit='days')
#     df2 = df2.groupby(['open_covid_region_code', pd.Grouper(key='date', freq='W-MON')])['hospitalized_new'].sum().reset_index().sort_values(['open_covid_region_code', 'date'])
#     return df2

# def clean_date(df1):
#     values = ['2020-01-06', '2020-01-13', '2020-01-20', '2020-01-27', '2020-02-03']
#     #result = df2[~df2['date'].isin(values)]
#     return result
#     # df2[df2['open_covid_region_code'].isin(region_codes)]


# def merge_data(df1, df2):
#     result = df1.merge(df2, how='inner', on=['open_covid_region_code', 'date'])
#     return result

# df1, df2 = convert_datetime(df1,df2)
# df2 = daily_to_weekly(df2)
# df1 = clean_date(df1)
# print('--------------------------------')
# print(df2)
# print('--------------------------------')
# print(df1)
# print('--------------------------------')
# final_data = pd.merge(df1, df2, how='left', on=['open_covid_region_code', 'date'])
# print(final_data.head(15))


NameError: name 'result' is not defined