In [1]:
import pandas as pd
import numpy as np

RAW = "../data/raw/"
OUT = "../data/processed/merged_cleaned.csv"

transformer = pd.read_csv(RAW + "eaf_transformer.csv")
temp = pd.read_csv(RAW + "eaf_temp.csv")
gas = pd.read_csv(RAW + "eaf_gaslance_mat.csv")

print("Raw files loaded")
print("Transformer:", transformer.shape)
print("Temp:", temp.shape)
print("Gas:", gas.shape)


Raw files loaded
Transformer: (271712, 5)
Temp: (85104, 4)
Gas: (5748194, 6)


In [2]:
tr = transformer.copy()
tr["HEATID"] = tr["HEATID"].astype(str)
tr["MW"] = tr["MW"].astype(str).str.replace(",", ".", regex=False)
tr["MW"] = pd.to_numeric(tr["MW"], errors="coerce")
tr["DURATION"] = tr["DURATION"].astype(str).str.replace(" ", "", regex=False).str.replace("::", ":", regex=False)
tr["DURATION_SEC"] = pd.to_timedelta(tr["DURATION"] + ":00", errors="coerce").dt.total_seconds()
tr = tr.dropna(subset=["HEATID", "DURATION_SEC", "MW"])
tr = tr[tr["DURATION_SEC"] > 0]
tr_agg = tr.groupby("HEATID").agg(
    TAP_first=("TAP", "first"),
    STARTTIME_first=("STARTTIME", "first"),
    DURATION_SEC=("DURATION_SEC", "sum"),
    MW_mean=("MW", "mean")
).reset_index()
print("Transformer aggregated:", tr_agg.shape)
tr_agg.head()


Transformer aggregated: (20813, 5)


Unnamed: 0,HEATID,TAP_first,STARTTIME_first,DURATION_SEC,MW_mean
0,5F0002,14,2015-01-01 01:12:00,60600.0,6.078125
1,5F0003,11,2015-01-01 01:41:00,120600.0,6.078125
2,5F0004,11,2015-01-01 02:26:00,123600.0,3.644231
3,5F0005,11,2015-01-01 03:27:00,128460.0,4.477273
4,5F0006,11,2015-01-01 04:16:00,126000.0,4.534091


In [3]:
tp = temp.copy()
tp["HEATID"] = tp["HEATID"].astype(str)
tp["DATETIME"] = pd.to_datetime(tp["DATETIME"], errors="coerce")
tp_agg = tp.groupby("HEATID").agg(
    TEMP_mean=("TEMP", "mean"),
    TEMP_p95=("TEMP", lambda x: np.nanpercentile(x, 95)),
    VALO2_mean=("VALO2_PPM", "mean"),
    VALO2_p95=("VALO2_PPM", lambda x: np.nanpercentile(x, 95))
).reset_index()
print("Temp aggregated:", tp_agg.shape)
tp_agg.head()


Temp aggregated: (20827, 5)


Unnamed: 0,HEATID,TEMP_mean,TEMP_p95,VALO2_mean,VALO2_p95
0,5F0002,1627.0,1648.3,383.0,400.7
1,5F0003,1641.0,1651.8,683.0,696.5
2,5F0004,1636.0,1636.0,700.0,700.0
3,5F0005,1630.0,1639.7,625.333333,663.6
4,5F0006,1622.0,1644.8,654.6,756.0


In [4]:
gs = gas.copy()
gs["HEATID"] = gs["HEATID"].astype(str)
cols_to_fix = ["O2_AMOUNT", "GAS_AMOUNT", "O2_FLOW", "GAS_FLOW"]
for col in cols_to_fix:
    gs[col] = gs[col].astype(str).str.replace(",", ".", regex=False)
    gs[col] = pd.to_numeric(gs[col], errors="coerce")
gs_agg = gs.groupby("HEATID").agg(
    O2_AMOUNT_sum=("O2_AMOUNT", "sum"),
    GAS_AMOUNT_sum=("GAS_AMOUNT", "sum"),
    O2_FLOW_mean=("O2_FLOW", "mean"),
    GAS_FLOW_mean=("GAS_FLOW", "mean")
).reset_index()
print("Gas aggregated:", gs_agg.shape)
gs_agg.head()


Gas aggregated: (20827, 5)


Unnamed: 0,HEATID,O2_AMOUNT_sum,GAS_AMOUNT_sum,O2_FLOW_mean,GAS_FLOW_mean
0,5F0002,428122.0,175048.0,3963.962264,1527.245283
1,5F0003,382714.0,156025.0,3771.399194,1556.326613
2,5F0004,606453.0,251531.0,3139.359882,1206.572271
3,5F0005,453178.0,179407.0,4009.954887,1532.003759
4,5F0006,460713.0,178593.0,3881.255556,1504.07037


In [5]:
df = tr_agg.merge(tp_agg, on="HEATID", how="left").merge(gs_agg, on="HEATID", how="left")
df = df.sort_values("STARTTIME_first").reset_index(drop=True)
print("Final merged df shape:", df.shape)
df.head()


Final merged df shape: (20813, 13)


Unnamed: 0,HEATID,TAP_first,STARTTIME_first,DURATION_SEC,MW_mean,TEMP_mean,TEMP_p95,VALO2_mean,VALO2_p95,O2_AMOUNT_sum,GAS_AMOUNT_sum,O2_FLOW_mean,GAS_FLOW_mean
0,5F0002,14,2015-01-01 01:12:00,60600.0,6.078125,1627.0,1648.3,383.0,400.7,428122.0,175048.0,3963.962264,1527.245283
1,5F0003,11,2015-01-01 01:41:00,120600.0,6.078125,1641.0,1651.8,683.0,696.5,382714.0,156025.0,3771.399194,1556.326613
2,5F0004,11,2015-01-01 02:26:00,123600.0,3.644231,1636.0,1636.0,700.0,700.0,606453.0,251531.0,3139.359882,1206.572271
3,5F0005,11,2015-01-01 03:27:00,128460.0,4.477273,1630.0,1639.7,625.333333,663.6,453178.0,179407.0,4009.954887,1532.003759
4,5F0006,11,2015-01-01 04:16:00,126000.0,4.534091,1622.0,1644.8,654.6,756.0,460713.0,178593.0,3881.255556,1504.07037


In [6]:
df.to_csv(OUT, index=False)
print("Cleaned dataset saved to:", OUT)


Cleaned dataset saved to: ../data/processed/merged_cleaned.csv


In [7]:
import pandas as pd
df = pd.read_csv("../data/processed/merged_cleaned.csv")
print(df.shape)
df.head()


(20813, 13)


Unnamed: 0,HEATID,TAP_first,STARTTIME_first,DURATION_SEC,MW_mean,TEMP_mean,TEMP_p95,VALO2_mean,VALO2_p95,O2_AMOUNT_sum,GAS_AMOUNT_sum,O2_FLOW_mean,GAS_FLOW_mean
0,5F0002,14,2015-01-01 01:12:00,60600.0,6.078125,1627.0,1648.3,383.0,400.7,428122.0,175048.0,3963.962264,1527.245283
1,5F0003,11,2015-01-01 01:41:00,120600.0,6.078125,1641.0,1651.8,683.0,696.5,382714.0,156025.0,3771.399194,1556.326613
2,5F0004,11,2015-01-01 02:26:00,123600.0,3.644231,1636.0,1636.0,700.0,700.0,606453.0,251531.0,3139.359882,1206.572271
3,5F0005,11,2015-01-01 03:27:00,128460.0,4.477273,1630.0,1639.7,625.333333,663.6,453178.0,179407.0,4009.954887,1532.003759
4,5F0006,11,2015-01-01 04:16:00,126000.0,4.534091,1622.0,1644.8,654.6,756.0,460713.0,178593.0,3881.255556,1504.07037
