# Data Labeling

## Load libs

In [1]:
import os
import json
import glob
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

## Load data and labels

In [2]:
# Input locations
json_path = './data_labelled/project-3-at-2025-06-30-17-49-c0b2a0a1.json'      # exported JSON from Label Studio
csv_folder = './data_preprocess'                # Folder with original CSVs

# Load the label export. The encoding is passed explicitly because open()
# otherwise uses the platform's locale encoding, which is not UTF-8 on every
# system and would garble or reject non-ASCII characters in the export.
with open(json_path, 'r', encoding='utf-8') as f:
    label_data = json.load(f)

def extract_filename(task):
    """Return the task's CSV filename, starting at 'Mov-' when that marker is present.

    The directory part of ``task['csv']`` is removed first. If the remaining
    name contains 'Mov-', everything before its first occurrence is dropped
    (e.g. "454c4a18-Mov-FluA-BEG.csv" -> "Mov-FluA-BEG.csv"); otherwise the
    name is returned unchanged.
    """
    base = os.path.basename(task['csv'])
    _, marker, remainder = base.partition('Mov-')
    if not marker:
        # No 'Mov-' in the name: nothing to strip.
        return base
    return marker + remainder

def filter_tasks_by_virus(tasks, virus_keyword):
    """Return the tasks whose cleaned filename contains `virus_keyword`.

    The filename comes from extract_filename(); the substring test ignores
    case. Matching tasks are returned as a new list in their original order.
    """
    def _mentions_virus(entry):
        name = extract_filename(entry).lower()
        return virus_keyword.lower() in name

    return list(filter(_mentions_virus, tasks))

def assign_labels_to_df(df, labels):
    """Write interval labels into the ``label`` column of a time-series DataFrame.

    `df` is modified in place and also returned: its ``time`` column is
    converted with ``pd.to_datetime`` and a ``label`` column is (re)created,
    initialised to ``None``.

    Every entry of `labels` must provide ``start``, ``end`` and a non-empty
    ``timeserieslabels`` list, of which only the first name is used. Rows with
    ``start <= time < end`` receive that name. A zero-length entry
    (``start == end``) labels the rows whose time equals ``start``; with the
    half-open test alone such an entry would silently label nothing. Entries
    are applied in order, so a later entry overwrites an earlier one wherever
    they overlap.

    Args:
        df: DataFrame with a ``time`` column that ``pd.to_datetime`` can parse.
        labels: Iterable of label dicts; ``None`` is treated as "no labels".

    Returns:
        The same DataFrame object, with ``time`` converted and ``label`` set.
    """
    df['label'] = None
    df['time'] = pd.to_datetime(df['time'])

    if labels is None:
        labels = ()

    for lab in labels:
        start = pd.to_datetime(lab['start'])
        end = pd.to_datetime(lab['end'])
        label_name = lab['timeserieslabels'][0]
        if start == end:
            # Point annotation: match the exact timestamp.
            mask = df['time'] == start
        else:
            mask = (df['time'] >= start) & (df['time'] < end)
        df.loc[mask, 'label'] = label_name

    return df

def find_related_files(task_filename, virus_keyword, folder=None):
    """Return the sorted paths of all CSV files in the data folder for one virus.

    File names are matched against three glob patterns built from
    `virus_keyword`: '<virus>*.csv', '<virus>-*_slope*.csv' and
    'Mov-<virus>*.csv' (e.g. 'FluA*.csv', 'FluA-*_slope*.csv',
    'Mov-FluA*.csv'). The patterns overlap, so matches are collected in a set
    and each path is returned once. The result is sorted so that downstream
    row order and save order are the same on every run.

    NOTE(review): `task_filename` is accepted but not used, so the result is
    identical for every task of the same virus and is not limited to files
    matching the task's own name. Confirm whether the lookup should be
    narrowed to the task's file.

    Args:
        task_filename: Label Studio task filename such as 'Mov-FluA-BEG.csv'
            (currently unused, see note above).
        virus_keyword: Name fragment the patterns are built from, e.g. 'FluA'.
        folder: Directory to search. Defaults to the module-level `csv_folder`.

    Returns:
        Sorted list of matching file paths, without duplicates.
    """
    if folder is None:
        folder = csv_folder

    patterns = (
        virus_keyword + '*.csv',
        virus_keyword + '-*_slope*.csv',
        'Mov-' + virus_keyword + '*.csv',
    )

    related_files = set()
    for pattern in patterns:
        related_files.update(glob.glob(os.path.join(folder, pattern)))

    return sorted(related_files)


# Labelling per virus and its respective features

## FluA

In [3]:
virus_name = "FluA"
tasks = filter_tasks_by_virus(label_data, virus_name)

# NOTE(review): find_related_files() is called once per task and every file it
# returns is read and labelled again for that task. If it returns the same
# files for several tasks, those files end up in the combined frame several
# times, each copy carrying a different task's labels. Confirm this is intended.
dfs = []
for task in tasks:
    labels = task.get('label', [])
    task_filename = extract_filename(task)  # e.g. Mov-FluA-BEG.csv
    related_files = find_related_files(task_filename, virus_name)

    for file_path in related_files:
        # Tag each row with its source file and the upper-cased virus name.
        df = pd.read_csv(file_path).assign(
            filename=os.path.basename(file_path),
            virus=virus_name.upper(),
        )
        dfs.append(assign_labels_to_df(df, labels))

# Stack all labelled frames; rows outside every label interval become 'none'.
all_data_fluA = pd.concat(dfs, ignore_index=True)
all_data_fluA.loc[all_data_fluA['label'].isna(), 'label'] = 'none'


In [4]:
# Write one CSV per source file into ./feature_store. The folder is created on
# demand so the export does not fail when it does not exist yet.
os.makedirs('./feature_store', exist_ok=True)

# A single groupby pass replaces re-filtering the whole frame for every file;
# sort=False keeps the files in order of first appearance.
for filename, df_local in all_data_fluA.groupby('filename', sort=False):
    df_local.to_csv(f'./feature_store/{filename}', index=False)
    print(f'Saved {filename} with {len(df_local)} rows')


Saved FluA-BET_slope_5.csv with 1710 rows
Saved FluA-BEG_slope_7.csv with 1692 rows
Saved FluA-GRE_slope_5.csv with 1710 rows
Saved FluA-BEG_slope_5.csv with 1710 rows
Saved Mov-FluA-BOE.csv with 9 rows
Saved Mov-FluA-BET.csv with 1728 rows
Saved FluA-HES_slope_7.csv with 1692 rows
Saved FluA-PET_slope_4.csv with 1719 rows
Saved FluA-BLE_slope_7.csv with 1692 rows
Saved FluA-PET_slope_7.csv with 1692 rows
Saved FluA-BET_slope_7.csv with 1692 rows
Saved FluA-SCH_slope_7.csv with 1692 rows
Saved FluA-HES_slope_4.csv with 1719 rows
Saved FluA-SCH_slope_4.csv with 1719 rows
Saved FluA-MER.csv with 315 rows
Saved Mov-FluA-Nat.csv with 1728 rows
Saved FluA-ECH_slope_7.csv with 1692 rows
Saved FluA-PET.csv with 1674 rows
Saved FluA-GRE_slope_7.csv with 1692 rows
Saved FluA-WIL.csv with 9 rows
Saved Mov-FluA-MER.csv with 324 rows
Saved Mov-FluA-BEG.csv with 1728 rows
Saved FluA-BOE_slope_7.csv with 1692 rows
Saved FluA-HES_slope_5.csv with 1710 rows
Saved FluA-UEB_slope_5.csv with 1710 rows
Sa

Clean up temporary variables

In [None]:
# Remove the per-file frame and the virus name set by the preceding cells.
del df_local, virus_name

## FluB

In [None]:
virus_name = "FluB"
tasks = filter_tasks_by_virus(label_data, virus_name)

# NOTE(review): find_related_files() is called once per task and every file it
# returns is read and labelled again for that task. If it returns the same
# files for several tasks, those files end up in the combined frame several
# times, each copy carrying a different task's labels. Confirm this is intended.
dfs = []
for task in tasks:
    labels = task.get('label', [])
    task_filename = extract_filename(task)  # e.g. Mov-FluB-BEG.csv
    related_files = find_related_files(task_filename, virus_name)

    for file_path in related_files:
        # Tag each row with its source file and the upper-cased virus name.
        df = pd.read_csv(file_path).assign(
            filename=os.path.basename(file_path),
            virus=virus_name.upper(),
        )
        dfs.append(assign_labels_to_df(df, labels))

# Stack all labelled frames; rows outside every label interval become 'none'.
all_data_fluB = pd.concat(dfs, ignore_index=True)
all_data_fluB.loc[all_data_fluB['label'].isna(), 'label'] = 'none'


In [None]:
# Write one CSV per source file into ./feature_store. The folder is created on
# demand so the export does not fail when it does not exist yet.
os.makedirs('./feature_store', exist_ok=True)

# A single groupby pass replaces re-filtering the whole frame for every file;
# sort=False keeps the files in order of first appearance.
for filename, df_local in all_data_fluB.groupby('filename', sort=False):
    df_local.to_csv(f'./feature_store/{filename}', index=False)
    print(f'Saved {filename} with {len(df_local)} rows')


## RSV

In [None]:
virus_name = "RSV"
tasks = filter_tasks_by_virus(label_data, virus_name)

# NOTE(review): find_related_files() is called once per task and every file it
# returns is read and labelled again for that task. If it returns the same
# files for several tasks, those files end up in the combined frame several
# times, each copy carrying a different task's labels. Confirm this is intended.
dfs = []
for task in tasks:
    labels = task.get('label', [])
    task_filename = extract_filename(task)  # e.g. Mov-RSV-BEG.csv
    related_files = find_related_files(task_filename, virus_name)

    for file_path in related_files:
        # Tag each row with its source file and the upper-cased virus name.
        df = pd.read_csv(file_path).assign(
            filename=os.path.basename(file_path),
            virus=virus_name.upper(),
        )
        dfs.append(assign_labels_to_df(df, labels))

# Stack all labelled frames; rows outside every label interval become 'none'.
all_data_rsv = pd.concat(dfs, ignore_index=True)
all_data_rsv.loc[all_data_rsv['label'].isna(), 'label'] = 'none'


In [None]:
# Write one CSV per source file into ./feature_store. The folder is created on
# demand so the export does not fail when it does not exist yet.
os.makedirs('./feature_store', exist_ok=True)

# A single groupby pass replaces re-filtering the whole frame for every file;
# sort=False keeps the files in order of first appearance.
for filename, df_local in all_data_rsv.groupby('filename', sort=False):
    df_local.to_csv(f'./feature_store/{filename}', index=False)
    print(f'Saved {filename} with {len(df_local)} rows')

## SARS-CoV-2

In [None]:
virus_name = "SARS"
tasks = filter_tasks_by_virus(label_data, virus_name)

# NOTE(review): find_related_files() is called once per task and every file it
# returns is read and labelled again for that task. If it returns the same
# files for several tasks, those files end up in the combined frame several
# times, each copy carrying a different task's labels. Confirm this is intended.
dfs = []
for task in tasks:
    labels = task.get('label', [])
    task_filename = extract_filename(task)  # e.g. Mov-SARS-BEG.csv
    related_files = find_related_files(task_filename, virus_name)

    for file_path in related_files:
        # Tag each row with its source file and the upper-cased virus name.
        df = pd.read_csv(file_path).assign(
            filename=os.path.basename(file_path),
            virus=virus_name.upper(),
        )
        dfs.append(assign_labels_to_df(df, labels))

# Stack all labelled frames; rows outside every label interval become 'none'.
all_data_sars = pd.concat(dfs, ignore_index=True)
all_data_sars.loc[all_data_sars['label'].isna(), 'label'] = 'none'


In [None]:
# Write one CSV per source file into ./feature_store. The folder is created on
# demand so the export does not fail when it does not exist yet.
os.makedirs('./feature_store', exist_ok=True)

# A single groupby pass replaces re-filtering the whole frame for every file;
# sort=False keeps the files in order of first appearance.
for filename, df_local in all_data_sars.groupby('filename', sort=False):
    df_local.to_csv(f'./feature_store/{filename}', index=False)
    print(f'Saved {filename} with {len(df_local)} rows')