# Data cleaning
This notebook goes through some pretty typical grunt work of getting the data into a format that's close to useable for ML. You'll have to drop the date columns, but at the end everything is encoded as needed.

The only bit of trouble I had was deciding how to impute the missing dates in the first-positive-specimen and onset date columns. There are a lot of blanks in those columns. After a failed attempt at using the KNN imputer, I decided to go a simpler route and just take the median number of days between the CDC report and the onset and positive-sample dates... I'm not sure if this was a good decision or not, but it's the best guess I could come up with.

In [None]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import regex as re
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.impute import KNNImputer
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import timedelta

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# These helper and data cleaning functions are from the old fast.ai course
# The repository is here: https://github.com/fastai/fastai/tree/master/old
def display_all(df):
    """Show `df` with the pandas row and column display limits raised to 1000.

    The raised limits apply only while `df` is being displayed.
    """
    wide_view = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with wide_view:
        display(df)
        
def make_date(df, date_field:str):
    """Make sure `df[date_field]` holds datetimes, converting the column in place.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        date_field: Name of the column to check and, if needed, convert.

    Columns that are already datetime64 (timezone-naive or timezone-aware) are
    left untouched. Anything else is parsed with `pd.to_datetime`, which raises
    on values it cannot parse.
    """
    # `is_datetime64_any_dtype` understands pandas extension dtypes (string,
    # category, tz-aware datetimes); `np.issubdtype` raises TypeError on them.
    if not pd.api.types.is_datetime64_any_dtype(df[date_field]):
        # `infer_datetime_format` is deprecated since pandas 2.0; format
        # inference is the default behaviour now, so the argument is dropped.
        df[date_field] = pd.to_datetime(df[date_field])
        

def add_datepart(df, fldnames, drop=True, time=False, errors="raise"):
    """Expand datetime column(s) of `df` into calendar feature columns, in place.

    Args:
        df: DataFrame to modify in place.
        fldnames: A column name or a list of column names to expand.
        drop: If True, remove the source column after expanding it.
        time: If True, also add `Hour`, `Minute` and `Second` columns.
        errors: Forwarded to `pd.to_datetime` when a column is not yet datetime.

    For each column, new columns are named `<prefix><Part>`, where the prefix is
    the column name with a trailing "date"/"Date" stripped. An `<prefix>Elapsed`
    column holds whole seconds since 1970-01-01 UTC (NaN where the date is NaT).
    """
    if isinstance(fldnames, str):
        fldnames = [fldnames]
    for fldname in fldnames:
        fld = df[fldname]
        # Works for tz-aware and extension dtypes, where np.issubdtype would raise.
        if not pd.api.types.is_datetime64_any_dtype(fld):
            # `infer_datetime_format` is deprecated since pandas 2.0 (inference is the default).
            df[fldname] = fld = pd.to_datetime(fld, errors=errors)
        targ_pre = re.sub('[Dd]ate$', '', fldname)
        attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
        if time:
            attr = attr + ['Hour', 'Minute', 'Second']
        for n in attr:
            if n == 'Week':
                # `Series.dt.week` was removed in pandas 2.0; the ISO week number is
                # its replacement. Cast to the dtype of the other date parts so the
                # column is a plain numpy integer (or float when NaT is present).
                part = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
            else:
                part = getattr(fld.dt, n.lower())
            df[targ_pre + n] = part
        # Seconds since the Unix epoch, independent of the column's time resolution;
        # NaT rows become NaN instead of a bogus integer.
        epoch = pd.Timestamp('1970-01-01') if fld.dt.tz is None else pd.Timestamp('1970-01-01', tz='UTC')
        df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)
        if drop:
            df.drop(fldname, axis=1, inplace=True)
        
        
def ifnone (a,b): #(a:Any,b:Any)->Any:
    "Return `b` only when `a` is None; any other `a` (falsy values included) is returned unchanged."
    if a is not None:
        return a
    return b

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a 10-tree random forest (fixed seed 0) on the training split and
    return its mean absolute error on the validation split."""
    forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))


def train_cats(df):
    """Convert every string-typed column of `df` to an ordered categorical, in place."""
    string_cols = [name for name, values in df.items() if is_string_dtype(values)]
    for name in string_cols:
        df[name] = df[name].astype('category').cat.as_ordered()

def apply_cats(df, trn):
    """Re-encode columns of `df` with the category sets of the matching columns in `trn`.

    Args:
        df: DataFrame to modify in place.
        trn: Reference DataFrame; only its columns with `category` dtype are used.

    Every column of `df` whose name also exists in `trn` as a categorical
    becomes an ordered categorical with exactly `trn`'s categories. Values not
    present in `trn`'s categories become missing (code -1).
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            # `set_categories(..., inplace=True)` was removed in pandas 2.0, and it
            # acted on a temporary obtained from `df[n]`; assign the result back.
            df[n] = c.astype('category').cat.set_categories(trn[n].cat.categories, ordered=True)

def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column with integer category codes shifted by one.

    `df[name]` is overwritten with `pd.Categorical(col).codes + 1` (so missing
    values become 0) when `col` is not numeric and either `max_n_cat` is None or
    `col` has more than `max_n_cat` categories. When `max_n_cat` is given, `col`
    is accessed through `.cat`, so it must already be categorical.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is None or len(col.cat.categories) > max_n_cat:
        df[name] = pd.Categorical(col).codes + 1

def rf_feat_importance(m, df):
    """Pair each column name of `df` with the matching entry of
    `m.feature_importances_`, sorted from most to least important."""
    importance = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance.sort_values('imp', ascending=False)

#
# End fast.ai functions...
#

# This function I believe came from this guy: https://www.kaggle.com/siavrez

def reduce_mem_usage(df, verbose=True, lossless=True):
    """Downcast the numeric columns of `df` in place to smaller numpy dtypes.

    Args:
        df: DataFrame to modify in place.
        verbose: If True, print the resulting memory usage and the reduction.
        lossless: If True (default), a float column is only downcast when every
            stored value survives the round trip exactly, so no data is altered.
            Pass False for the purely range-based float downcast, which may
            round values (e.g. 2049.0 becomes 2048.0 in float16).

    Returns:
        None. The frame is changed in place.

    Integer columns get the smallest of int8/int16/int32 whose range contains
    the column's min and max (bounds inclusive). Float columns get float16 or
    float32 when the values fit. Columns are never upcast, and non-numpy or
    non-int/float dtypes are skipped.
    """
    def _in_range(lo, hi, info):
        # Inclusive bounds: a value equal to the dtype's limit still fits.
        # Comparisons with NaN (empty / all-NaN columns) are False, so such
        # columns are left as they are.
        return info.min <= lo and hi <= info.max

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            for candidate in (np.int8, np.int16, np.int32):
                if _in_range(c_min, c_max, np.iinfo(candidate)):
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    continue  # would not save memory (and must never upcast)
                if not _in_range(c_min, c_max, np.finfo(candidate)):
                    continue
                shrunk = df[col].astype(candidate)
                if lossless and not np.array_equal(
                        shrunk.to_numpy(), df[col].to_numpy(), equal_nan=True):
                    continue  # the narrower dtype would change stored values
                df[col] = shrunk
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

In [None]:
df_raw = pd.read_csv('../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv')
df_raw.shape

In [None]:
df_raw.tail()

In [None]:
# Parse the three date columns so they hold datetimes rather than raw text.
for date_col in ('cdc_report_dt', 'pos_spec_dt', 'onset_dt'):
    make_date(df_raw, date_col)

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# in-place changes made through `df_processed` are also visible via `df_raw`.
df_processed = df_raw

In [None]:
add_datepart(df_processed, 'cdc_report_dt', drop=False)
df_processed.head()

In [None]:
# Compute the null mask once, then list every column that has at least one null.
null_mask = df_processed.isnull()
cols_with_missing = null_mask.columns[null_mask.any()].tolist()

# For each such column, add a boolean `<col>_was_missing` flag recording where the nulls were.
for col in cols_with_missing:
    df_processed[col + '_was_missing'] = null_mask[col]
df_processed.head()

In [None]:
# Peek at the rows that do have a first-positive-specimen date.
df_processed.loc[df_processed['pos_spec_dt'].notnull(), 'pos_spec_dt']

In [None]:
# Lag in days between the CDC report and (a) the first positive specimen, (b) symptom onset,
# computed as report date minus event date.
# A missing event date is NaT, so the subtraction gives NaT and `.dt.days` gives NaN:
# rows without a date stay missing in the new column.
# These lag columns are the inputs for guessing the missing positive-specimen and onset dates.
df_processed['pos_difference'] = (df_processed['cdc_report_dt'] - df_processed['pos_spec_dt']).dt.days
df_processed['onset_difference'] = (df_processed['cdc_report_dt'] - df_processed['onset_dt']).dt.days
df_processed.head()

In [None]:
# One-hot encode the categorical fields; each listed column is replaced by
# one indicator column per distinct value.
cols_for_dummies = [
    'current_status',
    'sex',
    'age_group',
    'Race and ethnicity (combined)',
    'hosp_yn',
    'icu_yn',
    'death_yn',
    'medcond_yn',
]
df_processed = pd.get_dummies(df_processed, columns=cols_for_dummies)
df_processed.head()

In [None]:
pd.options.display.max_rows = 100
pd.DataFrame.from_records([(col, df_processed[col].nunique(), df_processed[col].dtype) for col in df_processed.columns],
                          columns=['Column_Name', 'Num_Unique', 'Dtype']).sort_values(by=['Num_Unique'])

In [None]:
%who DataFrame

In [None]:
del df_raw
reduce_mem_usage(df_processed)
gc.collect()

<img src ="https://snappygoat.com/b/e4dcb09d666964fb88c7349ba21417f5952a287c" width = 250 align = right>

**To impute the missing values in the date columns, I tried to use KNN, but I couldn't get it to work: the process just kept spinning and wouldn't finish. It works with 1,000 or so records, but when I tried it on 400k I left it alone for a few hours and it still wasn't done. In the next code blocks I do some EDA to figure out a simple imputation method that makes sense.**

If someone wants to mess around with KNN, here's the code I tried. I was just guessing at which features and hyperparameters to put in.
```
imputer = KNNImputer(n_neighbors=3)
imputer.fit_transform(
    df_processed[['cdc_report_dtWeek',
                  'cdc_report_dtDayofweek', 
                  'pos_difference','onset_difference',
                  'Race and ethnicity (combined)_White, Non-Hispanic', 
                  'hosp_yn_Yes'
    ]])
```

In [None]:
df_processed['pos_difference'].hist(bins=100, grid=False, xlabelsize=12, ylabelsize=12)
plt.xlabel("Days between Pos sample & CDC report", fontsize=15)
plt.ylabel("Frequency",fontsize=15)
#plt.xlim([-25,75])


In [None]:
df_processed['onset_difference'].hist(bins=100)
plt.xlabel("Days between Onset & CDC report", fontsize=15)
plt.ylabel("Frequency",fontsize=15)
#plt.xlim([-25,75])


### Most of the filled-in dates seem to follow a roughly normal/Poisson shape centered around zero. But there are lots of outliers...

In [None]:
#df_processed['onset_difference'].argmax()
df_processed.nlargest(10, ['onset_difference'])

In [None]:
df_processed.nlargest(10, ['pos_difference'])

In [None]:
# Days between symptom onset and the first positive specimen (onset_dt minus pos_spec_dt).
# If either date is missing the subtraction yields NaT, so the new column is NaN for that row.
onset_minus_pos = df_processed['onset_dt'] - df_processed['pos_spec_dt']
df_processed['onset_pos_difference'] = onset_minus_pos.dt.days

df_processed['onset_pos_difference'].hist(bins=100)
plt.xlabel("Days between Onset & Pos sample", fontsize=15)
plt.ylabel("Frequency",fontsize=15)

### To deal with the outliers let's take the median instead of the mean to calculate the dates for the missing values... As per below it's 2 days.

In [None]:
# Mean and median of each lag column, shown as a labelled table.
# (The previous tuple repeated the `pos_difference` median in its last slot, so the
# `onset_difference` median was never displayed.)
lag_cols = ['onset_pos_difference', 'pos_difference', 'onset_difference']
df_processed[lag_cols].agg(['mean', 'median'])

In [None]:
df_processed.head()

In [None]:
# Impute missing onset / first-positive-specimen dates from the CDC report date.
# The lag columns were computed as (cdc_report_dt - event date), so the event date
# is reconstructed as cdc_report_dt MINUS the median lag, not plus.
# Each date column uses the median of its own lag column instead of a shared constant.
# NOTE: `timedelta` raises ValueError if a lag column is entirely NaN (median is NaN).
for date_col, lag_col in (('onset_dt', 'onset_difference'), ('pos_spec_dt', 'pos_difference')):
    median_lag = timedelta(days=float(df_processed[lag_col].median()))
    # Assign the result back: `df[col].fillna(..., inplace=True)` operates on a
    # temporary and does not update the frame under pandas Copy-on-Write.
    df_processed[date_col] = df_processed[date_col].fillna(df_processed['cdc_report_dt'] - median_lag)
df_processed.tail()

In [None]:
add_datepart(df_processed, 'pos_spec_dt', drop=False)
add_datepart(df_processed, 'onset_dt', drop=False)

In [None]:
df_processed.head()

In [None]:
# Call the method: a bare `df_processed.info` only shows the bound-method repr
# rather than the column / dtype / non-null summary.
df_processed.info()