# CDC_Preprocessing

This notebook demonstrates the CDC preprocessing routines. The preprocessing is shown only for the `national` region; however, it is easily transferable to
other regions.

**TABLE OF CONTENTS**

1. [Reading Raw Data](#Reading-Raw-Data)
2. [Scaling and Merging WHO_NREVSS/ILINet data](#Merging)
3. [Saving preprocessed data](#Saving)
4. [Historical CDC data](#Historical-CDC)

In [1]:
from codes.ewdate import EwDateConverter

import pandas as pd
import numpy as np

In [2]:
pd.__version__

'0.16.2'

# Reading Raw Data

In [None]:
# Paths to the two raw CDC input files for the `national` region
# (both file names are stamped 2015-05-25).
ilinet = './data/cdc-ILINet-national-2015-05-25.csv'
whonrevss = './data/cdc-WHO_NREVSS-national-2015-05-25.csv'

In [None]:
# Converter applied row-wise below to the ('YEAR', 'WEEK') columns.
to_date = EwDateConverter.convertToDate

# *****************************
# ILINet data
# *****************************
# skiprows=1: the first line of the ILINet file is skipped before the
# header row is read.
cdc_net = pd.read_csv(ilinet, skiprows=1)
cdc_net['date'] = pd.to_datetime(cdc_net[['YEAR', 'WEEK']]
                                 .apply(to_date, axis=1))
cdc_net.set_index('date', inplace=True)

# *****************************
# WHO_NREVSS data
# *****************************
# 'X', 'N/A' and empty fields are read as missing values.
cdc_who = pd.read_csv(whonrevss,  na_values=['X', 'N/A', '', None],
                      na_fvalues=np.NAN)
cdc_who['date'] = pd.to_datetime(cdc_who[['YEAR', 'WEEK']]
                                 .apply(to_date, axis=1))
cdc_who.set_index('date', inplace=True)
# Columns that together make up the influenza A count.
FLUA = [u'A (H1)', u'A (Unable to Subtype)',
        u'A (H3)', u'A (Subtyping not Performed)']
# type conversions and calculation of flu A
# astype() returns a new frame (it has no in-place mode), so the converted
# columns must be assigned back for the conversion to take effect.
cdc_who[FLUA + ['B']] = cdc_who[FLUA + ['B']].astype(float)
cdc_who['FLUA'] = cdc_who[FLUA].sum(axis=1)

## Merging

Merges the `ILINet` and `WHO_NREVSS` data. It also computes the influenza A and B strain proportions from the `WHO_NREVSS` data and uses them to scale the `ILINet` totals.

In [None]:
# **************************************************************
#                   MANIPULATORS
# **************************************************************
# Get ratios
def get_ratios(X, col1='FLUA', col2='B', epsilon=1, suffix='_per'):
    """Return the smoothed shares of ``col1`` and ``col2`` as fractions.

    ``epsilon`` is added to each count before the shares are computed.
    The denominator is the sum of both smoothed counts, so the two shares
    add up to one whenever the denominator is non-zero.

    Parameters
    ----------
    X : DataFrame
        Frame holding the two count columns.
    col1, col2 : str
        Names of the count columns.
    epsilon : number
        Smoothing constant added to each count (0 disables smoothing).
    suffix : str
        Appended to each column name to form the output column names.

    Returns
    -------
    DataFrame
        Columns ``col1 + suffix`` and ``col2 + suffix``. Shares that are
        undefined (e.g. 0/0 when both counts and ``epsilon`` are zero)
        are returned as 0.
    """
    smoothed1 = X[col1] + epsilon
    smoothed2 = X[col2] + epsilon
    # Each numerator carries one epsilon, so the denominator needs both;
    # otherwise the shares would sum to more than one.
    denom = smoothed1 + smoothed2
    share1 = (smoothed1 / denom).fillna(0)
    share2 = (smoothed2 / denom).fillna(0)
    return pd.DataFrame({col1 + suffix: share1,
                         col2 + suffix: share2})


# Get ILINET values
def get_values(X):
    """Split the ILINet total into rounded influenza A and B parts.

    Reads 'FLUA_per', 'B_per' and 'ILITOTAL' from ``X`` and returns the
    pair (round(FLUA_per * ILITOTAL), round(B_per * ILITOTAL)).
    """
    ili_total = X['ILITOTAL']
    ili_flu_a = np.round(X['FLUA_per'] * ili_total)
    ili_flu_b = np.round(X['B_per'] * ili_total)
    return (ili_flu_a, ili_flu_b)

In [None]:
# ***************************************************************
#                   Scaling of WHO data to ILINet Scale
# ***************************************************************
# WHO_NREVSS columns carried over into the combined frame.
combined_who_columns = [u'PERCENT POSITIVE', u'B',
                        u'FLUA', u'FLUA_per', u'B_per']
# calculating ratios of strains (epsilon=0: plain, unsmoothed shares)
cdc_who[['FLUA_per', 'B_per']] = (get_ratios(cdc_who, epsilon=0)
                                  [['FLUA_per', 'B_per']])
# merging frames; keep rows from 2004 onwards. The explicit copy makes
# combined_df an independent frame, so adding columns below does not
# write into a slice of the joined result.
combined_df = (cdc_net.join(cdc_who[combined_who_columns])
               .loc['2004':]
               .copy())
# Scaling ILINet according to strain ratios. get_values only does
# column arithmetic, so it is applied to the whole frame at once
# instead of row by row.
combined_df['ILI_FLUA'], combined_df['ILI_FLUB'] = get_values(combined_df)

In [None]:
# First row of the combined frame, selected explicitly by position.
combined_df.iloc[0]

# Season weeks

Computing season weeks according to the CDC definition.

> A season starts at EW $40$ and ends at EW $39$ of the following year.

In [None]:
def find_season(row):
    """Return the season and the 1-based week within that season for a row.

    Weeks 40 and later belong to the season labelled with the following
    year; earlier weeks belong to the season labelled with their own year.

    Parameters
    ----------
    row : mapping
        Must provide 'YEAR' and 'WEEK' entries.

    Returns
    -------
    tuple
        ``(season, season_week)`` where EW 40 is season week 1.

    Raises
    ------
    Exception
        Any error raised during the computation is re-raised unchanged
        after the offending row has been printed.

    TODO: Expand to class
    """
    year = row['YEAR']
    ew = row['WEEK']
    try:
        if ew >= 40:
            season = year + 1
            season_week = ew - 40 + 1
        else:
            season = year
            # Whole weeks elapsed since EW 40 of the previous year, plus
            # one so that numbering stays 1-based like the branch above
            # (otherwise EW 1 would repeat the number of the last week
            # of the previous year).
            season_week = (to_date((year, ew))
                           - to_date((year - 1, 40))).days // 7 + 1
    except Exception:
        # Show the row that failed, then re-raise the original error
        # with its type and traceback intact.
        print(row)
        raise
    return season, season_week

In [None]:
# find_season returns a (season, season_week) pair per row; zip(*...)
# transposes those pairs into the two new columns.
combined_df['season'], combined_df['season_week'] = zip(*combined_df.apply(find_season, axis=1))

# Saving

In [None]:
# Persist the combined frame; to_csv writes the index by default, so the
# frame's index is saved as the first column.
combined_df.to_csv('./data/cdc-combined-national-2015-05-25.csv')

# Historical CDC

**CDC historical data:** Read the data and append `date` and `date_reported`.

In [10]:
def find_date(row, season_name='season', week_name='Week'):
    """Return the date and the 1-based season week for a (season, week) row.

    Weeks 40 and later fall in the calendar year before the season label;
    earlier weeks fall in the year equal to the season label.

    Parameters
    ----------
    row : mapping
        Must provide the entries named by ``season_name`` and ``week_name``.
    season_name : str
        Key of the season label in ``row``.
    week_name : str
        Key of the calendar week in ``row``.

    Returns
    -------
    tuple
        ``(dt, season_week)`` where ``dt`` is the value returned by
        ``EwDateConverter.convertToDate((year, week))`` and EW 40 is
        season week 1.

    Raises
    ------
    Exception
        Any error raised during the computation is re-raised unchanged
        after the offending row has been printed.

    TODO: Expand to class
    """
    # Builtin int replaces the np.int alias (identical behaviour).
    season = int(row[season_name])
    week = int(row[week_name])
    try:
        year = season - 1 if week >= 40 else season
        dt = EwDateConverter.convertToDate((year, week))
        if week >= 40:
            season_week = week - 40 + 1
        else:
            # Whole weeks elapsed since EW 40 of the previous year, plus
            # one so that numbering stays 1-based like the branch above.
            season_start = EwDateConverter.convertToDate((year - 1, 40))
            season_week = (dt - season_start).days // 7 + 1
    except Exception:
        # Show the row that failed, then re-raise the original error
        # with its type and traceback intact.
        print(row)
        raise
    return dt, season_week

In [11]:
# Load the historical table; 'x' entries are read as missing values and
# the season/week columns are forced to integers.
historical_cdc = pd.read_csv(
    './data/cdc-historical-2010-2015.csv',
    na_values=['x'],
    dtype={'season': np.int, 'Week': np.int, 'week_reported': np.int})

# Date and season week of the week each row refers to.
(historical_cdc['date'],
 historical_cdc['date_week']) = zip(*historical_cdc.apply(find_date, axis=1))


def get_reported_date(x):
    """Same lookup as find_date, keyed on the 'week_reported' column."""
    return find_date(x, week_name='week_reported')


# Date and season week of the week in which each row was reported.
(historical_cdc['date_reported'],
 historical_cdc['date_reported_week']) = zip(
    *historical_cdc.apply(get_reported_date, axis=1))

# NOTE(review): this writes to the same path that was read above, so the
# input file is overwritten with the sorted, date-annotated table --
# confirm this is intended.
(historical_cdc
 .sort(['date', 'date_reported'])
 .to_csv('./data/cdc-historical-2010-2015.csv', index=False))

In [12]:
historical_cdc.head()

Unnamed: 0,week_reported,season,Week,age0to4,age5to24,age25to49,age50to64,age64plus,TotalILI,TotalPatients,UnweightedILI,WeightedILI,date,date_week,date_reported,date_reported_week
0,40,2010,40,6654,19270,5715.0,1417,620,33676,587010,5.737,6.055,2009-10-04,1,2009-10-04,1
1,41,2010,40,7799,23062,,6991,1882,756,40490,699438.0,5.789,2009-10-04,1,2009-10-11,2
2,41,2010,41,8225,23938,,7673,2004,719,42559,627079.0,6.787,2009-10-11,2,2009-10-11,2
3,42,2010,40,8163,24065,,7368,1962,795,42353,736552.0,5.75,2009-10-04,1,2009-10-18,3
4,42,2010,41,9848,27678,,8730,2356,915,49527,725565.0,6.826,2009-10-11,2,2009-10-18,3


In [None]:
# NOTE(review): leftover post-mortem debugger magic, presumably from an
# interactive session; consider removing it before sharing the notebook.
%debug

In [None]:
!head -30 ./data/cdc-historical-2010-2015.csv