In [1]:
import os
import sys
import copy
import logging
import time
import pandas as pd
import numpy as np

# Put the parent of the current working directory on the import path
# (presumably the directory that holds the `src` package imported right below).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    # Append only once so re-running this cell does not grow sys.path.
    sys.path.append(module_path)
    
from src.data.datahandler import DataHandler

In [2]:
# Root directory of the nmf_py project. The default is the original development
# location; set the NMF_PY_ROOT environment variable to run this notebook from a
# different machine or directory without editing the cell.
project_root = os.environ.get("NMF_PY_ROOT", os.path.join("D:\\", "projects", "nmf_py"))

# Concentration and uncertainty CSVs for the Baton Rouge dataset, plus the
# directory passed to DataHandler as its output location.
input_file = os.path.join(project_root, "data", "Dataset-BatonRouge-con.csv")
uncertainty_file = os.path.join(project_root, "data", "Dataset-BatonRouge-unc.csv")
output_path = os.path.join(project_root, "output", "BatonRouge")

# Name of the column handed to DataHandler as `index_col`.
index_col = "Date"

In [3]:
# Read the concentration and uncertainty tables exactly as stored on disk.
conc_df = pd.read_csv(input_file)
unc_df = pd.read_csv(uncertainty_file)

# Working frames without the index column. `drop` already returns a new frame,
# so no explicit copy is needed, and using `index_col` keeps this cell in sync
# with the configuration cell instead of repeating the "Date" literal.
c_df = conc_df.drop(columns=index_col)
u_df = unc_df.drop(columns=index_col)

In [4]:
# Per-feature (per-column) summary statistics of the concentration data.
# `numeric_only=True` is passed to every reducer so all five Series share the
# same index when they are assembled into the table below.
min_con = c_df.min(numeric_only=True)
p25 = c_df.quantile(q=0.25, numeric_only=True)
median_con = c_df.median(numeric_only=True)
p75 = c_df.quantile(q=0.75, numeric_only=True)
max_con = c_df.max(numeric_only=True)

# Signal-to-noise per feature: for each sample the contribution is
# (concentration - uncertainty) / uncertainty, or 0 wherever the concentration
# does not exceed its uncertainty; the contributions are then summed and
# divided by the number of rows.
mask = c_df <= u_df
# Non-mutating `mask` (no inplace=True) keeps this cell idempotent on re-run.
d = (c_df - u_df).divide(u_df, axis=0).mask(mask, 0)
sn = (1/d.shape[0]) * d.sum(axis=0)

# Every feature starts out in the "Strong" category.
categories = ["Strong"] * d.shape[1]

input_metrics = pd.DataFrame(data={"Category": categories, "S/N": sn, "Min": min_con, "25th": p25, "50th": median_con, "75th": p75, "Max": max_con})
input_metrics

Unnamed: 0,Category,S/N,Min,25th,50th,75th,Max
124-Trimethylbenzene,Strong,5.445168,0.005,0.820001,1.290001,1.865001,5.470003
224-Trimethylpentane,Strong,5.666667,0.41,1.580001,2.490002,3.865002,13.560008
234-Trimethylpentane,Strong,5.537459,0.005,0.53,0.820001,1.300001,4.410003
23-Dimethylbutane,Strong,5.500543,0.005,0.64,1.110001,2.285001,10.500007
23-Dimethylpentane,Strong,5.463626,0.005,0.34,0.49,0.78,3.310002
2-Methylheptane,Strong,5.039088,0.005,0.215,0.33,0.535,2.480002
3-Methylhexane,Strong,5.648208,0.005,0.655,1.050001,1.510001,7.780005
3-Methylpentane,Strong,5.611292,0.54,1.720001,2.990002,5.945004,29.100018
Acetylene,Strong,5.666667,0.38,1.410001,1.990001,2.835002,8.070005
Benzene,Strong,5.666667,0.59,1.960001,2.770002,4.440003,9.330006


In [5]:
# Build the project's DataHandler from the paths and index column configured
# above, then display the metrics table it exposes (shown for comparison with
# the manually computed `input_metrics`).
dh = DataHandler(
    input_path=input_file,
    uncertainty_path=uncertainty_file,
    output_path=output_path,
    index_col=index_col,
)
dh.metrics

10-Feb-23 11:01:26 - Input and output configured successfully


Unnamed: 0,Category,S/N,Min,25th,50th,75th,Max
124-Trimethylbenzene,Strong,5.445168,0.005,0.820001,1.290001,1.865001,5.470003
224-Trimethylpentane,Strong,5.666667,0.41,1.580001,2.490002,3.865002,13.560008
234-Trimethylpentane,Strong,5.537459,0.005,0.53,0.820001,1.300001,4.410003
23-Dimethylbutane,Strong,5.500543,0.005,0.64,1.110001,2.285001,10.500007
23-Dimethylpentane,Strong,5.463626,0.005,0.34,0.49,0.78,3.310002
2-Methylheptane,Strong,5.039088,0.005,0.215,0.33,0.535,2.480002
3-Methylhexane,Strong,5.648208,0.005,0.655,1.050001,1.510001,7.780005
3-Methylpentane,Strong,5.611292,0.54,1.720001,2.990002,5.945004,29.100018
Acetylene,Strong,5.666667,0.38,1.410001,1.990001,2.835002,8.070005
Benzene,Strong,5.666667,0.59,1.960001,2.770002,4.440003,9.330006


In [8]:
# NOTE(review): the execution count jumps from In [5] to In [8]; re-run with a
# fresh kernel (Restart & Run All) to confirm this cell does not rely on
# state from cells that are no longer in the notebook.
# NOTE(review): presumably removes features judged too noisy from the handler's
# processed data — the behaviour lives in DataHandler; confirm there.
dh.remove_noisy()
# Display the shape of the handler's processed input data after the call above.
dh.input_data_processed.shape

(307, 39)