In [234]:
"""
Author: Seung Won Joeng
Modifier: Kwanyeob Jung, Jaeshin Cho
"""
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
%matplotlib inline                                  
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing


In [235]:
"""
            TASK 1-2
"""

# Read csv from current path.
def read_csv():
    df1 = pd.read_csv('2020_US_weekly_symptoms_dataset.csv')
    df2 = pd.read_csv('aggregated_cc_by.csv')
    return df1, df2

df1, df2 = read_csv()
print(df1.shape)
print(df2.shape)

(624, 430)
(98434, 62)


  if (await self.run_code(code, result,  async_=asy)):


In [236]:
"""
            TASK 1-3
"""

# Since we are going to handle regions in USA from 2020_US_weekly_symptoms_dataset.csv
# To extract the records of USA only from dataframe
def extract_from_aggregated(df1, df2):
    # Get all region codes 
    region_codes = df1.open_covid_region_code.unique()
    result = df2[df2['open_covid_region_code'].isin(region_codes)]
    result = result.reset_index(drop = True)
    result = result[['open_covid_region_code', 'region_name', 'date', 'hospitalized_new']]
    return result

df2 = extract_from_aggregated(df1,df2)
print(df2.shape)

(3458, 4)


In [237]:
# Clean data in threshold num_rows * 0.05 and num_cols * 0.05
def clean_dataframe(df):
    num_rows,num_cols = df1.shape
    thresh_rows = int(num_rows * 0.05);
    thresh_cols = int(num_cols * 0.05);
    df = df.dropna(axis = 1, thresh = thresh_rows)
    df = df.dropna(axis = 0, thresh = thresh_cols)
    df = df.reset_index(drop = True)
    return df;

df1 = clean_dataframe(df1)
print('after clean: ', df1.shape)
print(df1.head())

after clean:  (429, 127)
  open_covid_region_code country_region_code country_region sub_region_1  \
0                  US-AK                  US  United States       Alaska   
1                  US-AK                  US  United States       Alaska   
2                  US-AK                  US  United States       Alaska   
3                  US-AK                  US  United States       Alaska   
4                  US-AK                  US  United States       Alaska   

  sub_region_1_code        date  symptom:Adrenal crisis  symptom:Ageusia  \
0             US-AK  2020-01-06                   12.69              NaN   
1             US-AK  2020-01-13                    9.56              NaN   
2             US-AK  2020-01-20                     NaN              NaN   
3             US-AK  2020-01-27                   15.31             7.47   
4             US-AK  2020-02-03                    8.81              NaN   

   symptom:Allergic conjunctivitis  symptom:Amblyopia  ...  \

In [238]:
def convert_to_datetime(df1, df2):
    df1['date'] = pd.to_datetime(df1.date)
    df2['date'] = pd.to_datetime(df2.date)
    return df1,df2

df1, df2 = convert_to_datetime(df1,df2)

In [239]:
def daily_to_weekly(df2):
    df2['date'] = df2['date'] - pd.to_timedelta(7, unit='d')
    df2 = df2.groupby(['open_covid_region_code', pd.Grouper(key='date', freq='W-MON')])['hospitalized_new'].sum().reset_index().sort_values(['open_covid_region_code', 'date'])
    return df2

print(df2['date'])
df2 = daily_to_weekly(df2)
print(df2['date'])

0      2020-03-07
1      2020-03-08
2      2020-03-09
3      2020-03-10
4      2020-03-11
          ...    
3453   2020-10-03
3454   2020-10-04
3455   2020-10-05
3456   2020-10-06
3457   2020-10-07
Name: date, Length: 3458, dtype: datetime64[ns]
0     2020-03-02
1     2020-03-09
2     2020-03-16
3     2020-03-23
4     2020-03-30
         ...    
508   2020-09-07
509   2020-09-14
510   2020-09-21
511   2020-09-28
512   2020-10-05
Name: date, Length: 513, dtype: datetime64[ns]


In [240]:
"""
            TASK 1-4
"""
# Merge two data, delete unnecessary columns
def merge_two_dfs(df1, df2):
    result = df1.merge(df2, how='inner', on=['open_covid_region_code', 'date'])
    result = result.reset_index(drop = True)
    result.drop('country_region', axis='columns', inplace=True)
    result = result.fillna(0)
    return result

final = merge_two_dfs(df1,df2)
print(final.head())

  open_covid_region_code country_region_code sub_region_1 sub_region_1_code  \
0                  US-AK                  US       Alaska             US-AK   
1                  US-AK                  US       Alaska             US-AK   
2                  US-AK                  US       Alaska             US-AK   
3                  US-AK                  US       Alaska             US-AK   
4                  US-AK                  US       Alaska             US-AK   

        date  symptom:Adrenal crisis  symptom:Ageusia  \
0 2020-03-02                   14.62             0.00   
1 2020-03-09                   10.60             0.00   
2 2020-03-16                   11.69             0.00   
3 2020-03-23                   11.15            16.57   
4 2020-03-30                    8.96             8.96   

   symptom:Allergic conjunctivitis  symptom:Amblyopia  symptom:Amenorrhea  \
0                             0.00                0.0                9.97   
1                           

In [241]:
#### TASK 2!!!!!!!!!!

In [242]:
"""
            TASK 3
"""

# df: mereged data
# Case1) To keep all data from some regions in the val set and train on the rest
def split_data_regions(df):
    regions = df.open_covid_region_code.unique()
    col_name = 'open_covid_region_code'
    kf = KFold()
    for train_index, val_index in kf.split(regions):
        
        train, validation = regions[train_index], regions[val_index]
        
        train_df = df[df[col_name].isin(train)]
        val_df = df[df[col_name].isin(validation)]
        
        X_train = train_df.iloc[:, 5:-1]
        Y_train = train_df['hospitalized_new']
#         print("X_train: \n", X_train.head(3))
#         print("Y_train: \n", Y_train.head(3))
        
        X_val = val_df.iloc[:, 5:-1]
        Y_val = val_df['hospitalized_new']
#         print("X_validation: \n", X_val.head(3))
#         print("Y_validation: \n", Y_val.head(3))
#         X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
#         X_val = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
#         Y_train = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
#         Y_val = preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float))
        yield X_train, Y_train, X_val, Y_val, train

    
# Case2) To keep data for the last couple of timepoints (keep data after 2020-08-10) from all regions in the va
# set and train on the rest
# df: merged data // date: date as string
def split_data_time(df, d = '2020-08-01'):
    # Convert to datetime from String
    date = datetime.strptime(d, '%Y-%m-%d')
    
    train_df = df[df["date"] <= date]
    val_df = df[df["date"] > date]
    
    X_train = train_df.iloc[:, 5:-1]
    Y_train = train_df['hospitalized_new']
    
    X_val = val_df.iloc[:, 5:-1]
    Y_val = val_df['hospitalized_new']
    
    yield X_train, Y_train, X_val, Y_val, train_df['date']



In [243]:
"""
            TASK 3: KNeighborsRegressor
            @Params: k:int, X_train, Y_train, X_val: dataframe
"""

def KNN_regression(k, X_train, Y_train, X_val):
    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(X_train, Y_train)
    pred = neigh.predict(X_val)
    return pred


"""
            TASK 3: DecisionTreeRegressor
            @Params: l:int, X_train, Y_train, X_val: dataframe
"""
def DecisionTree_regression(l, X_train, Y_train, X_val):
    model = DecisionTreeRegressor(min_samples_leaf=l)
    model.fit(X_train, Y_train)
    pred = model.predict(X_val)
    return pred


In [244]:
"""
            TASK 3: KNeighborsRegressor based on regions
"""

errors_knn = []
fold_regions = []

for n in range(150):
    for X_train, Y_train, X_val, Y_val, train in split_data_regions(final):
        pred = KNN_regression(n+1, X_train, Y_train, X_val)
        mse = mean_squared_error(Y_val, pred)
        errors_knn.append(mse)
        fold_regions.append(train)

err = np.array(errors_knn)
min_index = np.argmin(err)
k = (min_index // 5) + 1
min_err = err[min_index]
min_reg = fold_regions[min_index]

print('[BASED ON REGION] K: ', k)
print('[BASED ON REGION] Minimum MSE: ', min_err)
print('[BASED ON REGION] Regions used in Train: \n', min_reg)



# def KNN_regression_regions(df):
#     k = [i for i in range(1, 151)]
#     errors_knn = np.zeros(150)
#     fold_regions = []
    
#     for n in k:
#         for X_train, Y_train, X_val, Y_val, train in split_data_regions(df):
#             pred = KNN_regression(n, X_train, Y_train, X_val)
#             mse = mean_squared_error(Y_val, pred)
#             errors_knn[n-1] = mse
#             fold_regions.append(train)
    
#     index = np.argmin(errors_knn)
#     min_mse = errors_knn[index]
#     min_region_train = fold_regions[index]
#     print(errors_knn)

#     return index, min_mse, min_region_train


"""
            TASK 3: KNeighborsRegressor based on times
"""

errors_knn = []
fold_times = []

for n in range(150):
    for X_train, Y_train, X_val, Y_val, train in split_data_time(final):
        pred = KNN_regression(n+1, X_train, Y_train, X_val)
        mse = mean_squared_error(Y_val, pred)
        errors_knn.append(mse)
        fold_times.append(train)

err = np.array(errors_knn)
min_index = np.argmin(err)
k = (min_index // 5) + 1
min_err = err[min_index]
min_reg = fold_regions[min_index]

print('\n')
print('[BASED ON TIME] K: ', k)
print('[BASED ON TIME] Minimum MSE: ', min_err)
print('[BASED ON TIME] Dates used in Train: \n', min_reg)

# def KNN_regression_times(df):
#     k = [i for i in range(1, 151)]
#     errors_knn = np.zeros(150)
#     fold_regions = []
    
#     for n in k:
#         for X_train, Y_train, X_val, Y_val, train in split_data_time(df):
#             pred = KNN_regression(n, X_train, Y_train, X_val)
#             mse = mean_squared_error(Y_val, pred)
#             errors_knn[n-1] = mse
#             fold_regions.append(train)
    
#     index = np.argmin(errors_knn)
#     min_mse = errors_knn[index]
#     min_date_train = fold_regions[index] 
    
#     return index, min_mse, min_date_train


# index, min_mse, regions_train = KNN_regression_regions(final)
# print('[BASED ON REGION] K: ', index + 1)
# print('[BASED ON REGION] Minimum MSE: \n', min_mse)
# print('[BASED ON REGION] Regions used in Train: \n', regions_train)

# index, min_mse, dates_train = KNN_regression_times(final)
# print('[BASED ON TIME] K: ', index + 1)
# print('[BASED ON TIME] Minimum MSE: \n', min_mse)
# print('[BASED ON TIME] Regions used in Train: \n', dates_train)


[BASED ON REGION] K:  150
[BASED ON REGION] Minimum MSE:  473.2164243727599
[BASED ON REGION] Regions used in Train: 
 ['US-AK' 'US-DC' 'US-DE' 'US-ND' 'US-RI' 'US-SD' 'US-VT' 'US-WV' 'US-WY']


[BASED ON TIME] K:  2
[BASED ON TIME] Minimum MSE:  1090.1551515151514
[BASED ON TIME] Dates used in Train: 
 ['US-AK' 'US-DC' 'US-DE' 'US-ME' 'US-MT' 'US-ND' 'US-RI' 'US-SD' 'US-VT']


In [247]:
"""
            TASK 3: DecisionTreeRegressor based on regions
"""

errors_dt = []
fold_regions = []

for l in range(150):
    for X_train, Y_train, X_val, Y_val, train in split_data_regions(final):
        pred = DecisionTree_regression(l+1, X_train, Y_train, X_val)
        mse = mean_squared_error(Y_val, pred)
        errors_dt.append(mse)
        fold_regions.append(train)

err = np.array(errors_dt)
min_index = np.argmin(err)
l = (min_index // 5) + 1
min_err = err[min_index]
min_reg = fold_regions[min_index]

print('[BASED ON REGION] L: ', l)
print('[BASED ON REGION] Minimum MSE: ', min_err)
print('[BASED ON REGION] Regions used in Train: \n', min_reg)



"""
            TASK 3: DecisionTreeRegressor based on times
"""

errors_dt = []
fold_regions = []

for l in range(150):
    for X_train, Y_train, X_val, Y_val, train in split_data_time(final):
        pred = DecisionTree_regression(l+1, X_train, Y_train, X_val)
        mse = mean_squared_error(Y_val, pred)
        errors_dt.append(mse)
        fold_regions.append(train)

err = np.array(errors_dt)
min_index = np.argmin(err)
l = (min_index // 5) + 1
min_err = err[min_index]
min_reg = fold_regions[min_index]

print('\n')
print('[BASED ON TIME] L: ', l)
print('[BASED ON TIME] Minimum MSE: ', min_err)
print('[BASED ON TIME] Regions used in Train: \n', min_reg)

[BASED ON REGION] L:  15
[BASED ON REGION] Minimum MSE:  425.3120482471815
[BASED ON REGION] Regions used in Train: 
 ['US-AK' 'US-DC' 'US-DE' 'US-ND' 'US-RI' 'US-SD' 'US-VT' 'US-WV' 'US-WY']


[BASED ON TIME] L:  7
[BASED ON TIME] Minimum MSE:  1080.2631105429753
[BASED ON TIME] Regions used in Train: 
 0     2020-03-02
1     2020-03-09
2     2020-03-16
3     2020-03-23
4     2020-03-30
         ...    
328   2020-06-29
329   2020-07-06
330   2020-07-13
331   2020-07-20
332   2020-07-27
Name: date, Length: 243, dtype: datetime64[ns]
