# Outlier detection

This notebook performs outlier detection following the method described in https://www.frontiersin.org/journals/public-health/articles/10.3389/fpubh.2024.1497100/full

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import median_abs_deviation
import os
from glob import glob

import plotly.graph_objects as go
import plotly.express as px

## Load data

In [None]:
# Load the labelled and the original (pre-processed) CSV files into two frames:
#   df      - all labelled files concatenated
#   df_orig - all original files concatenated, with a 'filename' column that
#             records which file each row came from.

# Set the path where CSVs are located
labeled_data_path = 'data_labelled'
orig_data_path = 'data_preprocess'

# glob() returns files in arbitrary, OS-dependent order; sorting keeps the row
# order (and anything derived from it, e.g. plot colours) reproducible.
# NOTE: the misspelled name `labled_csv_files` is kept on purpose so that any
# other cell referring to it keeps working.
labled_csv_files = sorted(glob(os.path.join(labeled_data_path, 'Mov-SARS*.csv')))
orig_csv_files = sorted(glob(os.path.join(orig_data_path, 'SARS*.csv')))

# Fail early with a readable message instead of pandas' generic
# "No objects to concatenate" when a directory or pattern is wrong.
if not labled_csv_files:
    raise FileNotFoundError(
        f"No files matching 'Mov-SARS*.csv' found in '{labeled_data_path}'"
    )
if not orig_csv_files:
    raise FileNotFoundError(
        f"No files matching 'SARS*.csv' found in '{orig_data_path}'"
    )

# Load and concatenate all CSVs
dfs = [pd.read_csv(f) for f in labled_csv_files]
df = pd.concat(dfs, ignore_index=True)

dfs_orig = []
for file in orig_csv_files:
    local_df = pd.read_csv(file)
    # os.path.basename strips the directory with either path separator; the
    # previous str.replace('data_preprocess\\', '') only worked on Windows.
    filename = os.path.basename(file)
    local_df['filename'] = filename

    dfs_orig.append(local_df)

df_orig = pd.concat(dfs_orig, ignore_index=True)

# Preview
df.head()


Unnamed: 0,time,rna_copies,label,filename,virus
0,2021-10-18,0.0,none,Mov-FluA-BEG.csv,FLUA
1,2021-10-25,0.0,none,Mov-FluA-BEG.csv,FLUA
2,2021-11-01,0.0,none,Mov-FluA-BEG.csv,FLUA
3,2021-11-08,0.0,none,Mov-FluA-BEG.csv,FLUA
4,2021-11-15,0.0,none,Mov-FluA-BEG.csv,FLUA


In [3]:
# Preview the first rows of the combined frame built from the files under orig_data_path.
df_orig.head()

Unnamed: 0,time,rna_copies,label,filename
0,2021-10-04,0.0,FluA,FluA-BEG.csv
1,2021-10-11,0.0,FluA,FluA-BEG.csv
2,2021-10-18,0.0,FluA,FluA-BEG.csv
3,2021-10-25,0.0,FluA,FluA-BEG.csv
4,2021-11-01,0.0,FluA,FluA-BEG.csv


## Calculate Outliers Using the Interquartile Range (IQR) Method (from article)

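For each QCP (in the code below, each input file is processed separately):

- Compute the first quartile (Q1) and the third quartile (Q3)
- Calculate IQR = Q3 - Q1
- Define outliers as points below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR

Note: the implementation below (`identify_outliers`) ignores zero values, divides the remaining values by their median, and uses the 20th and 80th percentiles in place of Q1 and Q3.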

In [16]:
def detect_outliers_modified_z(df, value_col='value', window=5, threshold=3.5):
    """
    Flag outliers with a rolling modified z-score (median / MAD based).

    For every row, the median and the median absolute deviation (MAD) of a
    centered rolling window are computed and the row is flagged when

        |x - rolling_median| / scaled_MAD  >  threshold

    where ``scaled_MAD = raw_MAD / 0.6745`` (the normal-consistent MAD). This
    is the usual modified z-score ``0.6745 * (x - median) / raw_MAD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; a copy is returned.
    value_col : str, default 'value'
        Name of the numeric column to analyse.
    window : int, default 5
        Size of the centered rolling window, in rows.
    threshold : float, default 3.5
        Absolute modified z-score above which a row is flagged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four added columns: 'median' (rolling median),
        'mad' (rolling normal-scaled MAD), 'mod_z' (modified z-score) and
        'outlier' (bool).

    Notes
    -----
    - Rows without a full window (the edges, or windows containing NaN) get
      NaN statistics and are never flagged.
    - When the window MAD is 0, a value that differs from the window median
      yields an infinite score and is flagged; a value equal to the median
      yields NaN and is not flagged.
    - The window slides over the rows in the order given, so a frame holding
      several concatenated series should be passed one series at a time.
    """
    df = df.copy()
    rolling = df[value_col].rolling(window, center=True)

    df['median'] = rolling.median()
    # scale='normal' already divides the raw MAD by ~0.6745.
    df['mad'] = rolling.apply(
        lambda w: median_abs_deviation(w, scale='normal'), raw=True
    )

    # No extra 0.6745 factor here: the scaled MAD above already contains it.
    # Applying it twice shrank every score to ~0.455 of its intended value,
    # turning the nominal 3.5 threshold into an effective ~5.2.
    df['mod_z'] = (df[value_col] - df['median']) / df['mad']
    df['outlier'] = df['mod_z'].abs() > threshold

    return df

def identify_outliers(column, lower_q=0.2, upper_q=0.8, k=1.5):
    """
    Flag outliers among the non-zero values of a series with an IQR-style rule.

    Zero values are excluded from the statistics and are never flagged. The
    non-zero values are divided by their median (unless that median is 0),
    then a value is flagged when it lies below ``q_low - k * spread`` or above
    ``q_high + k * spread``, where ``q_low`` / ``q_high`` are the ``lower_q`` /
    ``upper_q`` quantiles of the normalized values and
    ``spread = q_high - q_low``.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to analyse.
    lower_q, upper_q : float, default 0.2 and 0.8
        Quantiles used as the lower and upper fence anchors. The defaults are
        the 20th/80th percentiles, which is wider than the classic quartiles
        (0.25 / 0.75); pass those explicitly for a textbook IQR rule.
    k : float, default 1.5
        Multiplier applied to the inter-quantile spread.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``column``; True marks an outlier.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q < upper_q <= 1``.
    """
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q < upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    # Only non-zero values take part in the statistics.
    is_non_zero = column != 0
    if not is_non_zero.any():
        # All values are 0 (or the series is empty): nothing can be flagged.
        return pd.Series(False, index=column.index)

    median = column[is_non_zero].median()
    scale = median if median != 0 else 1

    # Zeros become NaN: quantile() skips them and every comparison against
    # NaN is False, so they can never be flagged. Keeping the full-length
    # series (instead of writing a partial mask back by label) also makes the
    # function work when the index contains duplicate labels.
    normalized = column.where(is_non_zero) / scale

    q_low = normalized.quantile(lower_q)
    q_high = normalized.quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - k * spread
    upper_bound = q_high + k * spread

    flagged = (normalized < lower_bound) | (normalized > upper_bound)
    return pd.Series(flagged.to_numpy(dtype=bool), index=column.index)

In [17]:
# Work on a copy of the original data in which no row is flagged yet.
df_outliers = df_orig.assign(outlier=False)

# The outlier statistics are computed independently for every source file,
# so each file's rows are selected, scored and written back in turn.
for f in df_outliers['filename'].unique():
    mask = df_outliers['filename'].eq(f)
    df_outliers.loc[mask, 'outlier'] = identify_outliers(
        df_outliers.loc[mask, 'rna_copies']
    )

In [6]:
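# Alternative detector (currently disabled): rolling modified z-score instead of the IQR rule.
# NOTE(review): as written, the call below would slide its window over the
# concatenated `df_orig`, i.e. across file boundaries; apply it per `filename`
# before re-enabling.
# # Apply the outlier detection
# df_outliers = detect_outliers_modified_z(df_orig, value_col='rna_copies', window=5, threshold=3.5)

# # Preview results
# df_outliers[['time', 'rna_copies', 'median', 'mad', 'mod_z', 'outlier']].head(10)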

In [11]:
# Preview the first rows, including the boolean 'outlier' flag column.
df_outliers.head()

Unnamed: 0,time,rna_copies,label,filename,outlier
0,2021-10-04,0.0,FluA,FluA-BEG.csv,False
1,2021-10-11,0.0,FluA,FluA-BEG.csv,False
2,2021-10-18,0.0,FluA,FluA-BEG.csv,False
3,2021-10-25,0.0,FluA,FluA-BEG.csv,False
4,2021-11-01,0.0,FluA,FluA-BEG.csv,False


## Plot

In [19]:
# Interactive overview: one line per source file, with the rows flagged as
# outliers overlaid as red crosses.
fig = go.Figure()

# Files in order of first appearance; the order also fixes the line colours.
files = df_outliers['filename'].unique()

# Qualitative palette, reused cyclically when there are more files than colours.
colors = px.colors.qualitative.Set1

# Rows to draw. To restrict the plot to a subset of files (for example those
# whose filename contains 'Nat'), filter this frame here.
df_outliers_nat = df_outliers.copy()

for idx, fname in enumerate(files):
    per_file = df_outliers_nat.loc[df_outliers_nat['filename'] == fname]

    # Raw signal of this file.
    signal_trace = go.Scatter(
        x=per_file['time'],
        y=per_file['rna_copies'],
        mode='lines',
        name=f'{fname} - RNA',
        line={'color': colors[idx % len(colors)]},
        hoverinfo='name+y',
    )
    fig.add_trace(signal_trace)

    # Flagged points; no trace is added for files without any outlier so the
    # legend is not cluttered with empty entries.
    flagged = per_file.loc[per_file['outlier']]
    if flagged.empty:
        continue

    outlier_trace = go.Scatter(
        x=flagged['time'],
        y=flagged['rna_copies'],
        mode='markers',
        name=f'{fname} - Outlier',
        marker={'color': 'red', 'size': 8, 'symbol': 'x'},
        hoverinfo='name+y',
    )
    fig.add_trace(outlier_trace)

# Layout settings
layout_options = {
    'title': 'Outlier Detection',
    'xaxis_title': 'Time',
    'yaxis_title': 'RNA copies / 100,000',
    'legend_title': 'Data Source',
    'height': 600,
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

fig.show()