# Pre-processing Data

## Load Libs

In [1]:
import os

# data
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np 

# plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px

# stats
import scipy.stats as stats
from scipy.stats import expon, norm, kstest
from scipy.stats import chisquare

## Load data

In [2]:
# Read the three raw workbooks (influenza, RSV, SARS-CoV-2) in one pass.
df_flu, df_rsv, df_sars = map(
    pd.read_excel,
    (
        "./data/Data_Flu.xlsx",
        "./data/Data_RSV.xlsx",
        "./data/Data_SARCoV.xlsx",
    ),
)

# Feature Processing

## FluA

In the following sections we export both the original RNA-copy series and their moving-average versions.

In [3]:
# Turn the "year_week" label into a timestamp: appending "-1" pins every week
# to its Monday (%w == 1), with weeks counted Monday-first (%W).
# NOTE(review): %W is not the ISO-8601 week number (%G/%V); if the workbook uses
# ISO calendar weeks the two disagree in some years - confirm against the source.
flu_week_monday = df_flu['yyyy-w (Flu)'] + '-1'
df_flu['date'] = pd.to_datetime(flu_week_monday, format='%Y_%W-%w')

In [4]:
# Moving-average FluA series: every column whose name starts with "Mov-FluA".
mov_fluA_cols = [col for col in df_flu.columns if col.startswith('Mov-FluA')]

# This is the first cell that writes into ./data_preprocess; create the folder
# so the notebook also runs on a fresh checkout (to_csv does not create it).
os.makedirs('./data_preprocess', exist_ok=True)

for col in mov_fluA_cols:
    # One long-format frame per series: timestamp, value, pathogen label.
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluA"
    })
    # Weeks without a measurement are dropped rather than exported as empty cells.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the source column (including the "Mov-" prefix).
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [5]:
# Raw (un-smoothed) FluA series: every column whose name starts with "FluA".
fluA_cols = list(filter(lambda name: name.startswith('FluA'), df_flu.columns))

for col in fluA_cols:
    # Build the long-format frame and drop missing measurements in one expression.
    export_frame = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluA",
    }).dropna(subset=['rna_copies'])
    # One CSV per series, named after the source column.
    export_frame.to_csv(f'./data_preprocess/{col}.csv', index=False)

## FluB

In [6]:
# Moving-average FluB series: every column whose name starts with "Mov-FluB".
mov_fluB_cols = [col for col in df_flu.columns if col.startswith('Mov-FluB')]

for col in mov_fluB_cols:
    # One long-format frame per series: timestamp, value, pathogen label.
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluB"
    })
    # Weeks without a measurement are dropped rather than exported as empty cells.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the source column (including the "Mov-" prefix).
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [7]:
# Raw (un-smoothed) FluB series: every column whose name starts with "FluB".
fluB_cols = list(filter(lambda name: name.startswith('FluB'), df_flu.columns))

for col in fluB_cols:
    # Build the long-format frame and drop missing measurements in one expression.
    export_frame = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluB",
    }).dropna(subset=['rna_copies'])
    # One CSV per series, named after the source column.
    export_frame.to_csv(f'./data_preprocess/{col}.csv', index=False)

## RSV

In [8]:
# Turn the "year_week" label into a timestamp: appending "-1" pins every week
# to its Monday (%w == 1), with weeks counted Monday-first (%W).
# NOTE(review): %W is not the ISO-8601 week number (%G/%V); if the workbook uses
# ISO calendar weeks the two disagree in some years - confirm against the source.
rsv_week_monday = df_rsv['yyyy-w (RSV)'] + '-1'
df_rsv['date'] = pd.to_datetime(rsv_week_monday, format='%Y_%W-%w')

In [9]:
# Moving-average RSV series: every column whose name starts with "Mov-RSV".
mov_rsv_cols = [col for col in df_rsv.columns if col.startswith('Mov-RSV')]

for col in mov_rsv_cols:
    # One long-format frame per series: timestamp, value, pathogen label.
    local_df = pd.DataFrame({
        'time': df_rsv['date'],
        'rna_copies': df_rsv[col],
        'label': "RSV"
    })
    # Weeks without a measurement are dropped rather than exported as empty cells.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the source column (including the "Mov-" prefix).
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [10]:
# Raw (un-smoothed) RSV series: every column whose name starts with "RSV".
rsv_cols = list(filter(lambda name: name.startswith('RSV'), df_rsv.columns))

for col in rsv_cols:
    # Build the long-format frame and drop missing measurements in one expression.
    export_frame = pd.DataFrame({
        'time': df_rsv['date'],
        'rna_copies': df_rsv[col],
        'label': "RSV",
    }).dropna(subset=['rna_copies'])
    # One CSV per series, named after the source column.
    export_frame.to_csv(f'./data_preprocess/{col}.csv', index=False)

## SARS-CoV-2

In [11]:
# Turn the "year_week" label into a timestamp: appending "-1" pins every week
# to its Monday (%w == 1), with weeks counted Monday-first (%W).
# NOTE(review): %W is not the ISO-8601 week number (%G/%V); if the workbook uses
# ISO calendar weeks the two disagree in some years - confirm against the source.
sars_week_monday = df_sars['yyyy-w (SARS-CoV)'] + '-1'
df_sars['date'] = pd.to_datetime(sars_week_monday, format='%Y_%W-%w')

In [12]:
# Moving-average SARS-CoV-2 series: every column whose name starts with "Mov-SARS-CoV".
mov_sars_cols = [col for col in df_sars.columns if col.startswith('Mov-SARS-CoV')]

for col in mov_sars_cols:
    # One long-format frame per series: timestamp, value, pathogen label.
    local_df = pd.DataFrame({
        'time': df_sars['date'],
        'rna_copies': df_sars[col],
        'label': "SARS-CoV-2"
    })
    # Weeks without a measurement are dropped rather than exported as empty cells.
    local_df = local_df.dropna(subset=['rna_copies'])
    # The file is named after the source column (including the "Mov-" prefix).
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

In [13]:
# Raw (un-smoothed) SARS-CoV-2 series: every column whose name starts with "SARS-CoV".
sars_cols = list(filter(lambda name: name.startswith('SARS-CoV'), df_sars.columns))

for col in sars_cols:
    # Build the long-format frame and drop missing measurements in one expression.
    export_frame = pd.DataFrame({
        'time': df_sars['date'],
        'rna_copies': df_sars[col],
        'label': "SARS-CoV-2",
    }).dropna(subset=['rna_copies'])
    # One CSV per series, named after the source column.
    export_frame.to_csv(f'./data_preprocess/{col}.csv', index=False)

## Slope or gradient feature construction

In this section we fit a linear regression to the previous n data points and use its slope as a new feature. Several window lengths can be used to build multiple slope features, analogous to the features built for different lags.

In [14]:
def compute_slope_feature(series: pd.Series, window: int = 3) -> pd.Series:
    """Rolling least-squares slope of ``series``, centred and scaled into [-1, 1].

    For every position ``i >= window - 1`` the slope of the ordinary
    least-squares line through the last ``window`` values (regressed on
    ``x = 0 .. window - 1``) is computed. The resulting slope series is then
    centred on its mean and divided by its largest absolute deviation.

    Parameters
    ----------
    series : pd.Series
        Input values. Missing values are forward-filled, then back-filled
        (for leading gaps) before the slopes are computed.
    window : int, default 3
        Number of consecutive points used for each fit. Must be >= 1.

    Returns
    -------
    pd.Series
        Same index as ``series``. The first ``window - 1`` entries are NaN.
        The whole result is NaN when the series is shorter than ``window`` or
        contains no valid value at all. It is all zeros (where defined) when
        every slope equals the mean slope.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.

    Notes
    -----
    NOTE(review): mean and max deviation are taken over the *entire* series,
    so each normalized value depends on later observations as well - confirm
    this is acceptable where the feature feeds a forecasting model.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Fill forward, then backward so gaps at the very start are covered too.
    filled = series.ffill().bfill()
    values = filled.to_numpy(dtype=float)
    n_points = len(values)

    # Positions without a full look-back window stay NaN.
    raw_slopes = np.full(n_points, np.nan)

    # After ffill/bfill a remaining NaN means the series had no valid value.
    has_data = n_points >= window and not np.isnan(values).any()
    if has_data:
        # Closed-form OLS slope: sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x)^2)
        x_centered = np.arange(window, dtype=float) - (window - 1) / 2.0
        x_ss = float(np.dot(x_centered, x_centered))
        windows = np.lib.stride_tricks.sliding_window_view(values, window)
        y_centered = windows - windows.mean(axis=1, keepdims=True)
        if x_ss > 0:
            raw_slopes[window - 1:] = (y_centered @ x_centered) / x_ss
        else:
            # window == 1: a single point carries no trend information.
            raw_slopes[window - 1:] = 0.0

    slope_series = pd.Series(raw_slopes, index=series.index)

    # Centre around the mean slope ...
    centered = slope_series - slope_series.mean(skipna=True)

    # ... and scale by the largest absolute deviation so values fall in [-1, 1].
    max_dev = centered.abs().max()

    # Avoid division by zero (all slopes equal) or by NaN (no slopes at all).
    if max_dev == 0 or np.isnan(max_dev):
        return centered * 0
    return centered / max_dev


In [15]:
# Look-back window lengths (number of consecutive rows) for which a slope
# feature is built for every series.
# Alternative set, currently disabled:
# window_sizes = [2, 3, 4, 5, 7]
window_sizes = [4, 5, 7]

### flu A

In [16]:
# Add one normalized-slope column per (FluA series, window length) pair.
for col in fluA_cols:
    source_series = df_flu[col]  # looked up once, reused for every window
    for window in window_sizes:
        df_flu[f"{col}_slope_{window}"] = compute_slope_feature(source_series, window=window)

df_flu.head()

Unnamed: 0,yyyy-w (Flu),FluA-Nat,FluB-Nat,Mov-FluA-Nat,Mov-FluB-Nat,FluA-BEG,FluB-BEG,Mov-FluA-BEG,Mov-FluB-BEG,FluA-BET,...,FluA-GRE_slope_7,FluA-HES_slope_4,FluA-HES_slope_5,FluA-HES_slope_7,FluA-VIE_slope_4,FluA-VIE_slope_5,FluA-VIE_slope_7,FluA-WIL_slope_4,FluA-WIL_slope_5,FluA-WIL_slope_7
0,2021_40,0.0,0.0,,,0.0,0.0,,,0.0,...,,,,,,,,,,
1,2021_41,0.0,0.0,,,0.0,0.0,,,0.0,...,,,,,,,,,,
2,2021_42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,2021_43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,,0.0,,,0.0,,
4,2021_44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,,0.0,0.0,,0.0,0.0,


In [17]:
# Slope columns of the national FluA series, one per configured window size.
# Deriving them from window_sizes keeps the plot and its title in sync with
# the features that were actually computed.
slope_cols = [f'FluA-Nat_slope_{window}' for window in window_sizes]

plot_df = df_flu[['FluA-Nat', *slope_cols]].copy()
plot_df['index'] = plot_df.index  # Needed for x-axis

fig = go.Figure()

# Add FluA-Nat line on primary y-axis
fig.add_trace(go.Scatter(
    x=plot_df['index'],
    y=plot_df['FluA-Nat'],
    mode='lines+markers',
    name='FluA-Nat',
    marker=dict(size=6, color='black', symbol='circle-open'),
    yaxis='y1'
))

# Add slope lines on secondary y-axis
for col in slope_cols:
    fig.add_trace(go.Scatter(
        x=plot_df['index'],
        y=plot_df[col],
        mode='lines',
        name=col,
        yaxis='y2'
    ))

fig.update_layout(
    # List only the windows that are really plotted.
    title=f"FluA-Nat and Slope Features (Window {','.join(map(str, window_sizes))})",
    xaxis=dict(title='Time Index'),
    yaxis=dict(
        title='RNA Copies',
        side='left',
        showgrid=True
    ),
    # Slopes are normalized, so the secondary axis is fixed to [-1, 1].
    yaxis2=dict(
        title='Normalized Slope',
        overlaying='y',
        side='right',
        range=[-1, 1],
        showgrid=False,
        zeroline=True
    ),
    legend=dict(x=0.01, y=0.99),
    hovermode='x unified',
    width=900,
    height=500
)

fig.show()

#### Save the fluA slope features

In [19]:
# Only FluA slope columns: restricting on the "FluA" prefix keeps this cell
# correct even when it is re-run after FluB slope columns were added to df_flu.
fluA_slope_cols = [
    col for col in df_flu.columns
    if col.startswith('FluA') and 'slope' in col
]

for col in fluA_slope_cols:
    # Same long format as the raw exports; 'rna_copies' holds the normalized slope.
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluA"
    })
    # The first (window - 1) rows of every slope feature are NaN and are dropped.
    local_df = local_df.dropna(subset=['rna_copies'])
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

### flu B

In [20]:
# Add one normalized-slope column per (FluB series, window length) pair.
for col in fluB_cols:
    source_series = df_flu[col]  # looked up once, reused for every window
    for window in window_sizes:
        df_flu[f"{col}_slope_{window}"] = compute_slope_feature(source_series, window=window)

df_flu.head()

Unnamed: 0,yyyy-w (Flu),FluA-Nat,FluB-Nat,Mov-FluA-Nat,Mov-FluB-Nat,FluA-BEG,FluB-BEG,Mov-FluA-BEG,Mov-FluB-BEG,FluA-BET,...,FluB-GRE_slope_7,FluB-HES_slope_4,FluB-HES_slope_5,FluB-HES_slope_7,FluB-VIE_slope_4,FluB-VIE_slope_5,FluB-VIE_slope_7,FluB-WIL_slope_4,FluB-WIL_slope_5,FluB-WIL_slope_7
0,2021_40,0.0,0.0,,,0.0,0.0,,,0.0,...,,,,,,,,,,
1,2021_41,0.0,0.0,,,0.0,0.0,,,0.0,...,,,,,,,,,,
2,2021_42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,2021_43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,,0.0,,,0.0,,
4,2021_44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,,0.0,0.0,,0.0,0.0,


In [21]:
# Slope columns of the national FluB series, one per configured window size.
# Deriving them from window_sizes keeps the plot and its title in sync with
# the features that were actually computed.
slope_cols = [f'FluB-Nat_slope_{window}' for window in window_sizes]

plot_df = df_flu[['FluB-Nat', *slope_cols]].copy()
plot_df['index'] = plot_df.index  # Needed for x-axis

fig = go.Figure()

# Add FluB-Nat line on primary y-axis
fig.add_trace(go.Scatter(
    x=plot_df['index'],
    y=plot_df['FluB-Nat'],
    mode='lines+markers',
    name='FluB-Nat',
    marker=dict(size=6, color='black', symbol='circle-open'),
    yaxis='y1'
))

# Add slope lines on secondary y-axis
for col in slope_cols:
    fig.add_trace(go.Scatter(
        x=plot_df['index'],
        y=plot_df[col],
        mode='lines',
        name=col,
        yaxis='y2'
    ))

fig.update_layout(
    # List only the windows that are really plotted.
    title=f"FluB-Nat and Slope Features (Window {','.join(map(str, window_sizes))})",
    xaxis=dict(title='Time Index'),
    yaxis=dict(
        title='RNA Copies',
        side='left',
        showgrid=True
    ),
    # Slopes are normalized, so the secondary axis is fixed to [-1, 1].
    yaxis2=dict(
        title='Normalized Slope',
        overlaying='y',
        side='right',
        range=[-1, 1],
        showgrid=False,
        zeroline=True
    ),
    legend=dict(x=0.01, y=0.99),
    hovermode='x unified',
    width=900,
    height=500
)

fig.show()

#### Save the fluB slope features

In [22]:
# Only FluB slope columns. df_flu also holds the FluA slope features at this
# point, so a bare "contains 'slope'" filter would export those again here.
fluB_slope_cols = [
    col for col in df_flu.columns
    if col.startswith('FluB') and 'slope' in col
]

for col in fluB_slope_cols:
    # Same long format as the raw exports; 'rna_copies' holds the normalized slope.
    local_df = pd.DataFrame({
        'time': df_flu['date'],
        'rna_copies': df_flu[col],
        'label': "FluB"  # was "FluA": copy-paste slip from the FluA export cell
    })
    # The first (window - 1) rows of every slope feature are NaN and are dropped.
    local_df = local_df.dropna(subset=['rna_copies'])
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

### RSV

In [23]:
# Add one normalized-slope column per (RSV series, window length) pair.
for col in rsv_cols:
    source_series = df_rsv[col]  # looked up once, reused for every window
    for window in window_sizes:
        df_rsv[f"{col}_slope_{window}"] = compute_slope_feature(source_series, window=window)

df_rsv.head()

Unnamed: 0,yyyy-w (RSV),RSV-Nat,Mov-RSV-Nat,RSV-BEG,Mov-RSV-BEG,RSV-BET,Mov-RSV-BET,RSV-PET,Mov-RSV-PET,RSV-SCH,...,RSV-GRE_slope_7,RSV-HES_slope_4,RSV-HES_slope_5,RSV-HES_slope_7,RSV-VIE_slope_4,RSV-VIE_slope_5,RSV-VIE_slope_7,RSV-WIL_slope_4,RSV-WIL_slope_5,RSV-WIL_slope_7
0,2023_23,19302490000.0,,0.0,,37851620000.0,,70043740000.0,,0.0,...,,,,,,,,,,
1,2023_24,0.0,,0.0,,0.0,,0.0,,0.0,...,,,,,,,,,,
2,2023_25,17910460000.0,12404320000.0,0.0,0.0,,18925810000.0,69446340000.0,46496690000.0,70270640000.0,...,,,,,,,,,,
3,2023_26,10731610000.0,9547357000.0,0.0,0.0,0.0,0.0,0.0,23148780000.0,50547040000.0,...,,0.0,,,0.0,,,0.0,,
4,2023_27,9305754000.0,12649270000.0,0.0,0.0,55717350000.0,27858670000.0,0.0,23148780000.0,0.0,...,,0.0,0.0,,0.0,0.0,,0.0,0.0,


In [24]:
# Every slope feature that was added to the RSV frame.
rsv_slope_cols = [col for col in df_rsv.columns if 'slope' in col]

for col in rsv_slope_cols:
    # Same long format as the raw exports; 'rna_copies' holds the normalized slope.
    local_df = pd.DataFrame({
        'time': df_rsv['date'],
        'rna_copies': df_rsv[col],
        'label': "RSV"  # was "FluA": copy-paste slip from the FluA export cell
    })
    # The first (window - 1) rows of every slope feature are NaN and are dropped.
    local_df = local_df.dropna(subset=['rna_copies'])
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)

### SARS

In [None]:
# Add one normalized-slope column per (SARS-CoV-2 series, window length) pair.
for col in sars_cols:
    source_series = df_sars[col]  # looked up once, reused for every window
    for window in window_sizes:
        df_sars[f"{col}_slope_{window}"] = compute_slope_feature(source_series, window=window)

df_sars.head()

In [None]:
# Every slope feature that was added to the SARS-CoV-2 frame.
sars_slope_cols = [col for col in df_sars.columns if 'slope' in col]

for col in sars_slope_cols:
    # Same long format as the raw exports; 'rna_copies' holds the normalized slope.
    local_df = pd.DataFrame({
        'time': df_sars['date'],
        'rna_copies': df_sars[col],
        'label': "SARS-CoV-2"  # was "FluA": copy-paste slip from the FluA export cell
    })
    # The first (window - 1) rows of every slope feature are NaN and are dropped.
    local_df = local_df.dropna(subset=['rna_copies'])
    local_df.to_csv(f'./data_preprocess/{col}.csv', index=False)