# Pre-processing Data

## Load Libs

In [1]:
import os

# data
import pandas as pd
import numpy as np 

# plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# stats
import scipy.stats as stats
from scipy.stats import expon, norm, kstest
from scipy.stats import chisquare

## Load data

In [2]:
# Input workbooks, one per pathogen group; paths are relative to the
# notebook's working directory.
flu_xlsx, rsv_xlsx, sars_xlsx = (
    "./data/Data_Flu.xlsx",
    "./data/Data_RSV.xlsx",
    "./data/Data_SARCoV.xlsx",
)

# Read each workbook into its own DataFrame, in the order listed above.
df_flu, df_rsv, df_sars = map(pd.read_excel, (flu_xlsx, rsv_xlsx, sars_xlsx))

# Processing

## FluA

In the following sections we export, for each target, both the original RNA-copy measurements and their moving averages, writing one CSV file per data column.

In [3]:
# Build a parseable "<year>_<week>-<weekday>" string by appending weekday 1
# (Monday) to the year/week key, then convert it to a timestamp.
# NOTE(review): %W counts weeks from the first Monday of the year, which is not
# ISO-8601 week numbering -- confirm this matches how the spreadsheet numbers weeks.
flu_week_monday = df_flu['yyyy-w (Flu)'] + '-1'
df_flu['date'] = pd.to_datetime(flu_week_monday, format='%Y_%W-%w')

In [4]:
# Export every FluA moving-average column (header starting with 'Mov-FluA')
# as its own CSV with the columns time / rna_copies / label.
mov_fluA_cols = [col for col in df_flu.columns if col.startswith('Mov-FluA')]

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('./data_preprocess', exist_ok=True)

for col in mov_fluA_cols:
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluA"
    })
    # Keep only the rows where this column actually has a value.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the full column header, 'Mov-' prefix included.
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [5]:
# Raw FluA series: every column whose header starts with 'FluA'.
fluA_cols = [name for name in df_flu.columns if name.startswith('FluA')]

# Write one CSV per series with the columns time / rna_copies / label,
# skipping rows where the series has no value.
for col in fluA_cols:
    local_df = (
        df_flu[['date', col]]
        .rename(columns={'date': 'time', col: 'rna_copies'})
        .assign(label="FluA")
        .dropna(subset=['rna_copies'])
    )
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

## FluB

In [6]:
# Export every FluB moving-average column (header starting with 'Mov-FluB')
# as its own CSV with the columns time / rna_copies / label.
mov_fluB_cols = [col for col in df_flu.columns if col.startswith('Mov-FluB')]

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('./data_preprocess', exist_ok=True)

for col in mov_fluB_cols:
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluB"
    })
    # Keep only the rows where this column actually has a value.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the full column header, 'Mov-' prefix included.
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [7]:
# Raw FluB series: every column whose header starts with 'FluB'.
fluB_cols = [name for name in df_flu.columns if name.startswith('FluB')]

# Write one CSV per series with the columns time / rna_copies / label,
# skipping rows where the series has no value.
for col in fluB_cols:
    local_df = (
        df_flu[['date', col]]
        .rename(columns={'date': 'time', col: 'rna_copies'})
        .assign(label="FluB")
        .dropna(subset=['rna_copies'])
    )
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

## RSV

In [8]:
# Build a parseable "<year>_<week>-<weekday>" string by appending weekday 1
# (Monday) to the year/week key, then convert it to a timestamp.
# NOTE(review): %W counts weeks from the first Monday of the year, which is not
# ISO-8601 week numbering -- confirm this matches how the spreadsheet numbers weeks.
rsv_week_monday = df_rsv['yyyy-w (RSV)'] + '-1'
df_rsv['date'] = pd.to_datetime(rsv_week_monday, format='%Y_%W-%w')

In [9]:
# Export every RSV moving-average column (header starting with 'Mov-RSV')
# as its own CSV with the columns time / rna_copies / label.
mov_rsv_cols = [col for col in df_rsv.columns if col.startswith('Mov-RSV')]

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('./data_preprocess', exist_ok=True)

for col in mov_rsv_cols:
    local_df = pd.DataFrame({
        'time': df_rsv['date'],
        'rna_copies': df_rsv[col],
        'label': "RSV"
    })
    # Keep only the rows where this column actually has a value.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the full column header, 'Mov-' prefix included.
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [10]:
# Raw RSV series: every column whose header starts with 'RSV'.
rsv_cols = [name for name in df_rsv.columns if name.startswith('RSV')]

# Write one CSV per series with the columns time / rna_copies / label,
# skipping rows where the series has no value.
for col in rsv_cols:
    local_df = (
        df_rsv[['date', col]]
        .rename(columns={'date': 'time', col: 'rna_copies'})
        .assign(label="RSV")
        .dropna(subset=['rna_copies'])
    )
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

## SARS-CoV-2

In [11]:
# Build a parseable "<year>_<week>-<weekday>" string by appending weekday 1
# (Monday) to the year/week key, then convert it to a timestamp.
# NOTE(review): %W counts weeks from the first Monday of the year, which is not
# ISO-8601 week numbering -- confirm this matches how the spreadsheet numbers weeks.
sars_week_monday = df_sars['yyyy-w (SARS-CoV)'] + '-1'
df_sars['date'] = pd.to_datetime(sars_week_monday, format='%Y_%W-%w')

In [None]:
# Export every SARS-CoV moving-average column (header starting with
# 'Mov-SARS-CoV') as its own CSV with the columns time / rna_copies / label.
mov_sars_cols = [col for col in df_sars.columns if col.startswith('Mov-SARS-CoV')]

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('./data_preprocess', exist_ok=True)

for col in mov_sars_cols:
    local_df = pd.DataFrame({
        'time': df_sars['date'],
        'rna_copies': df_sars[col],
        'label': "SARS-CoV-2"
    })
    # Keep only the rows where this column actually has a value.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the full column header, 'Mov-' prefix included.
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [12]:
# Raw SARS-CoV series: every column whose header starts with 'SARS-CoV'.
sars_cols = [name for name in df_sars.columns if name.startswith('SARS-CoV')]

# Write one CSV per series with the columns time / rna_copies / label,
# skipping rows where the series has no value.
for col in sars_cols:
    local_df = (
        df_sars[['date', col]]
        .rename(columns={'date': 'time', col: 'rna_copies'})
        .assign(label="SARS-CoV-2")
        .dropna(subset=['rna_copies'])
    )
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)