In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from pathlib import Path
from functools import reduce

In [2]:
path_folder='../../DATAFLOW_2026_UET.EPOCH_0_AUTOSCALING_ANALYSIS/features'
def read_pa(path_folder):
    """Print a two-row preview of every ``*.parquet`` file in ``path_folder``.

    Only files directly inside the folder are matched (no recursion).
    Nothing is returned; the function exists purely for eyeballing the
    schema of each feature file.
    """
    for parquet_path in Path(path_folder).glob('*.parquet'):
        preview = pd.read_parquet(parquet_path).head(2)
        print(preview)
read_pa(path_folder)

   index visitor_type  is_commercial  is_unknown  visitor_type_freq  \
0      0      unknown              0           1           0.253658   
1      1      network              0           0           0.086453   

  visitor_country  country_freq  is_top_country  
0   other_country      0.855195               0  
1   other_country      0.855195               0  
   index           timestamp  hour  day  weekday  month  is_weekend
0      0 1995-07-01 00:00:01     0    1        5      7           1
1      1 1995-07-01 00:00:06     0    1        5      7           1
   index reqmethod         reqdirectory reqresourcetype  reqpathdepth  \
0      0       GET     /history/apollo/       Directory             3   
1      1       GET  /shuttle/countdown/       Directory             3   

   is_dynamic  requrllength  
0           0            16  
1           0            19  
   index  response response_class
0      0       200     successful
1      1       200     successful
   index  bytes
0   

In [3]:
path_folder='../../DATAFLOW_2026_UET.EPOCH_0_AUTOSCALING_ANALYSIS/features'
def join_columns(path_folder,mode: str="merge",key: str="index",drop_index_col: bool = False,how: str="outer"):
    """Combine every ``*.parquet`` file in ``path_folder`` into one DataFrame.

    Parameters
    ----------
    path_folder : str or Path
        Folder whose ``*.parquet`` files (non-recursive) are combined.
    mode : {"merge", "concat"}
        ``"merge"`` joins the files pairwise on ``key``.
        ``"concat"`` places the files side by side (column-wise), aligning
        rows on the DataFrame index.
    key : str
        Join column for ``"merge"``; in ``"concat"`` mode it is the column
        removed when ``drop_index_col`` is true.
    drop_index_col : bool
        ``"concat"`` only: drop ``key`` from each file before concatenating.
    how : str
        Join type forwarded to ``pd.merge`` in ``"merge"`` mode.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If ``mode`` is unknown, or (merge mode) a file lacks the ``key`` column.
    FileNotFoundError
        If the folder contains no ``*.parquet`` file.
    """
    # Validate up front so a bad mode is reported even when the folder is empty.
    if mode not in ("concat", "merge"):
        raise ValueError("mode chỉ nhận 'concat' hoặc 'merge'")

    # glob() yields files in filesystem order; sort so the resulting column
    # order is reproducible across machines and runs.
    files = sorted(Path(path_folder).glob('*.parquet'))
    if not files:
        raise FileNotFoundError(f"No .parquet files found in {path_folder}")

    dfs = []
    for f in files:
        df = pd.read_parquet(f)
        if mode == "concat":
            # Drop the "index" column when it is only a leftover from saving the file.
            if drop_index_col and key in df.columns:
                df = df.drop(columns=[key])
        elif key not in df.columns:
            raise ValueError(f"File {f.name} không có cột key '{key}'")
        dfs.append(df)

    if mode == "concat":
        # Each file holds different columns, so they are joined side by side;
        # stacking them row-wise would produce a NaN-padded block-diagonal frame.
        return pd.concat(dfs, axis=1)
    return reduce(lambda a, b: pd.merge(a, b, on=key, how=how), dfs)
df=join_columns(path_folder)


In [4]:
df.head(5)

Unnamed: 0,index,visitor_type,is_commercial,is_unknown,visitor_type_freq,visitor_country,country_freq,is_top_country,timestamp,hour,...,is_weekend,reqmethod,reqdirectory,reqresourcetype,reqpathdepth,is_dynamic,requrllength,response,response_class,bytes
0,0,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:01,0,...,1,GET,/history/apollo/,Directory,3,0,16,200,successful,6.245
1,1,network,0,0,0.086453,other_country,0.855195,0,1995-07-01 00:00:06,0,...,1,GET,/shuttle/countdown/,Directory,3,0,19,200,successful,3.985
2,2,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:09,0,...,1,GET,/shuttle/missions/sts-73/,Page,4,0,44,200,successful,4.085
3,3,commercial,1,0,0.299574,other_country,0.855195,0,1995-07-01 00:00:11,0,...,1,GET,/shuttle/countdown/,Page,3,0,31,304,redirection,0.0
4,4,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:11,0,...,1,GET,/shuttle/missions/sts-73/,Image,4,0,47,200,successful,4.179


In [5]:
df.isnull().sum()

index                    0
visitor_type             0
is_commercial            0
is_unknown               0
visitor_type_freq        0
visitor_country          0
country_freq             0
is_top_country           0
timestamp                0
hour                     0
day                      0
weekday                  0
month                    0
is_weekend               0
reqmethod                0
reqdirectory             0
reqresourcetype          0
reqpathdepth             0
is_dynamic               0
requrllength             0
response                 0
response_class           0
bytes                28490
dtype: int64

In [6]:
df.dtypes

index                         int64
visitor_type                    str
is_commercial                 int64
is_unknown                    int64
visitor_type_freq           float64
visitor_country                 str
country_freq                float64
is_top_country                int64
timestamp            datetime64[us]
hour                          int32
day                           int32
weekday                       int32
month                         int32
is_weekend                    int64
reqmethod                       str
reqdirectory                    str
reqresourcetype                 str
reqpathdepth                  int64
is_dynamic                    int64
requrllength                  int64
response                      int16
response_class                  str
bytes                       float64
dtype: object

In [7]:
df["response_class"].value_counts()

response_class
successful      2642411
redirection      275274
client error      17150
server error         95
Name: count, dtype: int64

In [8]:
df[df['bytes'].isnull()]['response'].value_counts()

response
404    17028
302    11174
200      136
403      111
501       31
400       10
Name: count, dtype: int64

In [9]:
def add_bytes_imputed(df):
    """Return a copy of ``df`` with a partially imputed ``bytes_imputed`` column.

    ``bytes`` is coerced to numeric and copied into ``bytes_imputed``; then
    missing values are filled for two response classes only:

    * ``"redirection"`` rows get 0;
    * ``"server error"`` rows get the median ``bytes`` of their ``response``
      code, or 0 when that code has no observed ``bytes`` at all.

    Missing values in every other response class are left as NaN.
    """
    out = df.copy()
    out["bytes"] = pd.to_numeric(out["bytes"], errors="coerce")
    out["bytes_imputed"] = out["bytes"]

    # Redirections with no recorded size are filled with zero.
    redirect_gap = (out["response_class"] == "redirection") & out["bytes_imputed"].isna()
    out.loc[redirect_gap, "bytes_imputed"] = 0

    # Server errors fall back to the per-status-code median, then to zero.
    median_by_status = out.groupby("response")["bytes"].median()
    server_gap = (out["response_class"] == "server error") & out["bytes_imputed"].isna()
    server_fill = out.loc[server_gap, "response"].map(median_by_status).fillna(0)
    out.loc[server_gap, "bytes_imputed"] = server_fill
    return out
df=add_bytes_imputed(df)


In [10]:
def flag_missing_bytes(df):
    """Return a copy of ``df`` with an int8 ``bytes_missing`` indicator column.

    ``bytes_missing`` is 1 where ``bytes`` is NaN and 0 otherwise. The input
    frame is left untouched so re-running the cell (or calling the helper
    without reassigning) cannot silently change earlier state.
    """
    out = df.copy()
    out["bytes_missing"] = out["bytes"].isna().astype("int8")
    return out
df=flag_missing_bytes(df)
df.head(5)


Unnamed: 0,index,visitor_type,is_commercial,is_unknown,visitor_type_freq,visitor_country,country_freq,is_top_country,timestamp,hour,...,reqdirectory,reqresourcetype,reqpathdepth,is_dynamic,requrllength,response,response_class,bytes,bytes_imputed,bytes_missing
0,0,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:01,0,...,/history/apollo/,Directory,3,0,16,200,successful,6.245,6.245,0
1,1,network,0,0,0.086453,other_country,0.855195,0,1995-07-01 00:00:06,0,...,/shuttle/countdown/,Directory,3,0,19,200,successful,3.985,3.985,0
2,2,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:09,0,...,/shuttle/missions/sts-73/,Page,4,0,44,200,successful,4.085,4.085,0
3,3,commercial,1,0,0.299574,other_country,0.855195,0,1995-07-01 00:00:11,0,...,/shuttle/countdown/,Page,3,0,31,304,redirection,0.0,0.0,0
4,4,unknown,0,1,0.253658,other_country,0.855195,0,1995-07-01 00:00:11,0,...,/shuttle/missions/sts-73/,Image,4,0,47,200,successful,4.179,4.179,0


In [11]:
df.columns

Index(['index', 'visitor_type', 'is_commercial', 'is_unknown',
       'visitor_type_freq', 'visitor_country', 'country_freq',
       'is_top_country', 'timestamp', 'hour', 'day', 'weekday', 'month',
       'is_weekend', 'reqmethod', 'reqdirectory', 'reqresourcetype',
       'reqpathdepth', 'is_dynamic', 'requrllength', 'response',
       'response_class', 'bytes', 'bytes_imputed', 'bytes_missing'],
      dtype='str')

In [12]:
df['bytes_missing'].value_counts()

bytes_missing
0    2906440
1      28490
Name: count, dtype: int64

In [13]:
df['bytes_imputed'].isnull().sum()

np.int64(17285)

In [14]:
def _entropy(s: pd.Series) -> float:
    s = s.dropna()
    if len(s) == 0:
        return 0.0
    p = s.value_counts(normalize=True)
    return float(-(p * np.log2(p + 1e-12)).sum())

def _top_share(s: pd.Series,exclude=("other_country",)) -> float:
    s = s.dropna()
    s1=s[~s.isin(exclude)]
    if len(s) == 0:
        return 0.0
    if len(s1)==0:
        return 0.0
    return float(s1.value_counts().iloc[0] / len(s))

def make_train_table(df: pd.DataFrame, freq: str) -> pd.DataFrame:
    """Aggregate request-level rows into one row per ``freq`` time bin.

    Rows whose ``timestamp`` cannot be parsed are dropped. The result has a
    ``timestamp`` column (bin start) followed by, in order: request count and
    summed imputed bytes, missing-bytes quality indicators, response-mix
    rates, traffic-mix rates, URL/path averages, diversity counts,
    top-country share, directory entropy, and hour/weekday/is_weekend derived
    from the bin start. Rate columns are filled with 0.0 for bins without
    rows; ``avg_url_len`` and ``avg_path_depth`` are not filled.
    """
    events = df.copy()
    events["timestamp"] = pd.to_datetime(events["timestamp"], errors="coerce")
    events = events.dropna(subset=["timestamp"])
    events = events.sort_values("timestamp").set_index("timestamp")

    # Make sure the columns used numerically really are numeric.
    for numeric_col in ("bytes", "bytes_imputed", "response"):
        events[numeric_col] = pd.to_numeric(events[numeric_col], errors="coerce")

    bins = events.resample(freq)
    status = events["response"]

    def _bin_rate(mask: pd.Series) -> pd.Series:
        # Fraction of rows per bin where the mask holds; 0.0 for bins without rows.
        return mask.resample(freq).mean().fillna(0.0)

    out = pd.DataFrame(index=bins.size().index)

    # Labels
    out["y_req"] = bins["bytes"].size().astype("int64")
    out["y_bytes_imp"] = bins["bytes_imputed"].sum()

    # Bytes quality
    out["bytes_missing_rate"] = bins["bytes_missing"].mean().fillna(0.0)
    out["bytes_all_missing"] = out["bytes_missing_rate"].eq(1.0).astype("int8")

    # Health / response mix
    out["error_rate"] = _bin_rate(status >= 400)
    out["server_error_rate"] = _bin_rate(status >= 500)
    out["redirection_rate"] = _bin_rate((status >= 300) & (status < 400))

    # Mix rates
    mix_sources = (
        ("dynamic_rate", "is_dynamic"),
        ("commercial_rate", "is_commercial"),
        ("unknown_rate", "is_unknown"),
    )
    for rate_name, flag_col in mix_sources:
        out[rate_name] = bins[flag_col].mean().fillna(0.0)

    # URL / path
    out["avg_url_len"] = bins["requrllength"].mean()
    out["avg_path_depth"] = bins["reqpathdepth"].mean()

    # Diversity / concentration
    diversity_sources = (
        ("country_nunique", "visitor_country"),
        ("dir_nunique", "reqdirectory"),
        ("method_nunique", "reqmethod"),
    )
    for count_name, source_col in diversity_sources:
        out[count_name] = bins[source_col].nunique()

    out["top_country_share"] = bins["visitor_country"].apply(_top_share).fillna(0.0)
    out["endpoint_entropy"] = bins["reqdirectory"].apply(_entropy).fillna(0.0)

    # Time features derived from the bin start
    out = out.reset_index()
    bin_start = out["timestamp"].dt
    out["hour"] = bin_start.hour
    out["weekday"] = bin_start.weekday
    out["is_weekend"] = out["weekday"].ge(5).astype("int8")

    return out

def make_1m_5m_15m_train(df: pd.DataFrame):
    """Build the aggregated training table at 1-, 5- and 15-minute resolution.

    Returns a dict keyed ``"1m"``, ``"5m"`` and ``"15m"`` whose values are the
    corresponding ``make_train_table`` results.
    """
    resample_rules = (("1m", "1min"), ("5m", "5min"), ("15m", "15min"))
    return {label: make_train_table(df, rule) for label, rule in resample_rules}


In [15]:
tables = make_1m_5m_15m_train(df)
df_1m, df_5m, df_15m = tables["1m"], tables["5m"], tables["15m"]

In [16]:
display(df_1m)
display(df_5m)
display(df_15m)

Unnamed: 0,timestamp,y_req,y_bytes_imp,bytes_missing_rate,bytes_all_missing,error_rate,server_error_rate,redirection_rate,dynamic_rate,commercial_rate,...,avg_url_len,avg_path_depth,country_nunique,dir_nunique,method_nunique,top_country_share,endpoint_entropy,hour,weekday,is_weekend
0,1995-07-01 00:00:00,42,608.453,0.000000,0,0.000000,0.0,0.071429,0.000000,0.261905,...,29.523810,2.904762,3,13,1,0.047619,2.884403,0,5,1
1,1995-07-01 00:01:00,61,910.128,0.016393,0,0.016393,0.0,0.049180,0.016393,0.442623,...,31.901639,3.032787,2,18,1,0.065574,3.411756,0,5,1
2,1995-07-01 00:02:00,57,628.556,0.035088,0,0.000000,0.0,0.192982,0.000000,0.385965,...,28.736842,3.035088,3,19,1,0.035088,3.715645,0,5,1
3,1995-07-01 00:03:00,71,1747.389,0.014085,0,0.000000,0.0,0.169014,0.000000,0.704225,...,32.070423,3.000000,2,21,1,0.014085,3.601396,0,5,1
4,1995-07-01 00:04:00,70,1383.149,0.000000,0,0.000000,0.0,0.057143,0.000000,0.542857,...,32.400000,2.885714,2,17,1,0.014286,3.079143,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76315,1995-08-22 23:55:00,13,319.726,0.000000,0,0.000000,0.0,0.000000,0.000000,0.615385,...,31.153846,2.615385,3,4,1,0.230769,1.546594,23,1,0
76316,1995-08-22 23:56:00,21,436.101,0.000000,0,0.000000,0.0,0.000000,0.000000,0.285714,...,31.190476,2.809524,3,10,1,0.142857,2.944395,23,1,0
76317,1995-08-22 23:57:00,29,334.910,0.000000,0,0.000000,0.0,0.137931,0.000000,0.551724,...,30.551724,3.000000,2,12,1,0.068966,3.147304,23,1,0
76318,1995-08-22 23:58:00,29,283.098,0.448276,0,0.448276,0.0,0.000000,0.000000,0.620690,...,45.448276,3.655172,3,12,1,0.137931,2.891790,23,1,0


Unnamed: 0,timestamp,y_req,y_bytes_imp,bytes_missing_rate,bytes_all_missing,error_rate,server_error_rate,redirection_rate,dynamic_rate,commercial_rate,...,avg_url_len,avg_path_depth,country_nunique,dir_nunique,method_nunique,top_country_share,endpoint_entropy,hour,weekday,is_weekend
0,1995-07-01 00:00:00,301,5277.675,0.013289,0,0.003322,0.0,0.109635,0.003322,0.491694,...,31.126246,2.973422,4,35,1,0.026578,3.866601,0,5,1
1,1995-07-01 00:05:00,267,5041.043,0.011236,0,0.000000,0.0,0.101124,0.007491,0.490637,...,31.501873,3.022472,4,38,1,0.044944,3.900038,0,5,1
2,1995-07-01 00:10:00,242,6111.846,0.033058,0,0.016529,0.0,0.165289,0.008264,0.388430,...,30.235537,3.107438,5,38,1,0.061983,3.999979,0,5,1
3,1995-07-01 00:15:00,282,4559.748,0.024823,0,0.014184,0.0,0.109929,0.017730,0.411348,...,31.063830,3.177305,4,49,2,0.049645,4.426225,0,5,1
4,1995-07-01 00:20:00,319,7262.385,0.006270,0,0.000000,0.0,0.075235,0.003135,0.592476,...,31.451411,3.131661,4,38,1,0.028213,3.977305,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15259,1995-08-22 23:35:00,127,2410.781,0.007874,0,0.000000,0.0,0.157480,0.015748,0.299213,...,30.488189,2.858268,4,35,1,0.228346,4.095580,23,1,0
15260,1995-08-22 23:40:00,119,3383.983,0.008403,0,0.000000,0.0,0.067227,0.016807,0.268908,...,27.352941,2.638655,3,25,1,0.210084,3.575257,23,1,0
15261,1995-08-22 23:45:00,128,1618.155,0.000000,0,0.000000,0.0,0.031250,0.031250,0.367188,...,32.617188,3.031250,4,33,1,0.101562,4.084003,23,1,0
15262,1995-08-22 23:50:00,137,1169.948,0.014599,0,0.014599,0.0,0.043796,0.014599,0.540146,...,29.532847,2.890511,2,33,1,0.087591,3.945751,23,1,0


Unnamed: 0,timestamp,y_req,y_bytes_imp,bytes_missing_rate,bytes_all_missing,error_rate,server_error_rate,redirection_rate,dynamic_rate,commercial_rate,...,avg_url_len,avg_path_depth,country_nunique,dir_nunique,method_nunique,top_country_share,endpoint_entropy,hour,weekday,is_weekend
0,1995-07-01 00:00:00,810,16430.564,0.018519,0,0.006173,0.0,0.123457,0.006173,0.460494,...,30.983951,3.029630,5,60,1,0.024691,4.102642,0,5,1
1,1995-07-01 00:15:00,945,18046.060,0.013757,0,0.005291,0.0,0.097354,0.007407,0.522751,...,31.714286,3.156614,7,69,2,0.017989,4.317121,0,5,1
2,1995-07-01 00:30:00,1006,22420.824,0.006958,0,0.003976,0.0,0.094433,0.006958,0.545726,...,33.388668,3.260437,7,63,1,0.050696,3.963939,0,5,1
3,1995-07-01 00:45:00,804,24088.038,0.013682,0,0.012438,0.0,0.077114,0.012438,0.534826,...,33.274876,3.259950,7,61,2,0.054726,4.222621,0,5,1
4,1995-07-01 01:00:00,750,17178.646,0.008000,0,0.002667,0.0,0.124000,0.020000,0.489333,...,32.257333,3.109333,6,64,1,0.048000,4.290682,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,1995-08-22 22:45:00,435,7308.498,0.002299,0,0.002299,0.0,0.110345,0.000000,0.340230,...,31.494253,2.981609,6,56,2,0.055172,4.057854,22,1,0
5084,1995-08-22 23:00:00,510,5480.335,0.009804,0,0.007843,0.0,0.125490,0.005882,0.282353,...,30.854902,3.009804,4,73,1,0.058824,4.665178,23,1,0
5085,1995-08-22 23:15:00,393,6420.996,0.000000,0,0.000000,0.0,0.058524,0.010178,0.422392,...,30.381679,2.944020,6,49,2,0.078880,4.155377,23,1,0
5086,1995-08-22 23:30:00,336,6792.788,0.011905,0,0.005952,0.0,0.107143,0.011905,0.324405,...,29.404762,2.770833,5,52,1,0.163690,4.090367,23,1,0


In [17]:
def create_label(df):
    """Attach next-row targets and drop the final row, which has no successor.

    ``y_req_t1`` and ``y_bytes_imp_t1`` hold the following row's ``y_req`` and
    ``y_bytes_imp``. A new frame is returned; the input is not modified.
    """
    labelled = df.assign(
        y_req_t1=df["y_req"].shift(-1),
        y_bytes_imp_t1=df["y_bytes_imp"].shift(-1),
    )
    return labelled.iloc[:-1].copy()

In [18]:
df_1m=create_label(df_1m)
df_5m=create_label(df_5m)
df_15m=create_label(df_15m)

In [19]:
df_1m.isnull().sum()

timestamp                0
y_req                    0
y_bytes_imp              0
bytes_missing_rate       0
bytes_all_missing        0
error_rate               0
server_error_rate        0
redirection_rate         0
dynamic_rate             0
commercial_rate          0
unknown_rate             0
avg_url_len           7852
avg_path_depth        7852
country_nunique          0
dir_nunique              0
method_nunique           0
top_country_share        0
endpoint_entropy         0
hour                     0
weekday                  0
is_weekend               0
y_req_t1                 0
y_bytes_imp_t1           0
dtype: int64

In [20]:
def flag_data_gap(df):
    """Return a copy of ``df`` with an int8 ``flag_data_gap`` indicator column.

    ``flag_data_gap`` is 1 where ``avg_url_len`` is NaN and 0 otherwise. The
    input frame is not modified, so the helper is safe to re-run and to call
    without reassigning the result.
    """
    out = df.copy()
    out["flag_data_gap"] = out["avg_url_len"].isna().astype("int8")
    return out
df_1m=flag_data_gap(df_1m)
df_5m=flag_data_gap(df_5m)
df_15m=flag_data_gap(df_15m)
df_1m.columns

Index(['timestamp', 'y_req', 'y_bytes_imp', 'bytes_missing_rate',
       'bytes_all_missing', 'error_rate', 'server_error_rate',
       'redirection_rate', 'dynamic_rate', 'commercial_rate', 'unknown_rate',
       'avg_url_len', 'avg_path_depth', 'country_nunique', 'dir_nunique',
       'method_nunique', 'top_country_share', 'endpoint_entropy', 'hour',
       'weekday', 'is_weekend', 'y_req_t1', 'y_bytes_imp_t1', 'flag_data_gap'],
      dtype='str')

In [21]:
def fill_missing(df):
    """Return a new frame with NaNs in ``avg_url_len`` and ``avg_path_depth`` set to 0.

    All other columns are passed through unchanged; the input is not modified.
    """
    return df.assign(
        avg_url_len=df["avg_url_len"].fillna(0),
        avg_path_depth=df["avg_path_depth"].fillna(0),
    )
df_1m_no_missing=fill_missing(df_1m)
df_5m_no_missing=fill_missing(df_5m)
df_15m_no_missing=fill_missing(df_15m)
print(df_1m_no_missing.isnull().sum())
df_1m_no_missing["flag_data_gap"].value_counts()


timestamp             0
y_req                 0
y_bytes_imp           0
bytes_missing_rate    0
bytes_all_missing     0
error_rate            0
server_error_rate     0
redirection_rate      0
dynamic_rate          0
commercial_rate       0
unknown_rate          0
avg_url_len           0
avg_path_depth        0
country_nunique       0
dir_nunique           0
method_nunique        0
top_country_share     0
endpoint_entropy      0
hour                  0
weekday               0
is_weekend            0
y_req_t1              0
y_bytes_imp_t1        0
flag_data_gap         0
dtype: int64


flag_data_gap
0    68467
1     7852
Name: count, dtype: int64

In [22]:
df_1m_no_missing.dtypes

timestamp             datetime64[us]
y_req                          int64
y_bytes_imp                  float64
bytes_missing_rate           float64
bytes_all_missing               int8
error_rate                   float64
server_error_rate            float64
redirection_rate             float64
dynamic_rate                 float64
commercial_rate              float64
unknown_rate                 float64
avg_url_len                  float64
avg_path_depth               float64
country_nunique                int64
dir_nunique                    int64
method_nunique                 int64
top_country_share            float64
endpoint_entropy             float64
hour                           int32
weekday                        int32
is_weekend                      int8
y_req_t1                     float64
y_bytes_imp_t1               float64
flag_data_gap                   int8
dtype: object

In [23]:
from typing import List
def create_time_features(df: pd.DataFrame)-> pd.DataFrame:
    """Return a copy of ``df`` with cyclical sin/cos encodings of hour and weekday.

    Adds ``sin_hour``/``cos_hour`` (period 24) and ``sin_weekday``/
    ``cos_weekday`` (period 7), computed from the existing ``hour`` and
    ``weekday`` columns. The input frame is not modified.
    """
    out = df.copy()
    for col, period in (("hour", 24.0), ("weekday", 7.0)):
        angle = 2 * np.pi * out[col] / period
        out[f"sin_{col}"] = np.sin(angle)
        out[f"cos_{col}"] = np.cos(angle)
    return out
def select_features(df:pd.DataFrame,target_cols :List[str] =["y_req_t1","y_bytes_imp_t1"],gap_flag_col :str="flag_data_gap",)-> pd.DataFrame:
    """Sort by ``timestamp`` and keep only the modelling columns, in a fixed order.

    The output columns are: ``timestamp``, the target columns, the
    current-window labels, quality indicators, rates, URL/path averages,
    diversity measures, time features, and finally the gap flag. The number
    of selected columns is printed.
    """
    ordered = df.sort_values("timestamp")

    current_window = ["y_req", "y_bytes_imp"]
    quality = ["bytes_missing_rate", "bytes_all_missing"]
    rates = [
        "error_rate", "server_error_rate", "redirection_rate",
        "dynamic_rate", "commercial_rate", "unknown_rate",
    ]
    url_path = ["avg_url_len", "avg_path_depth"]
    diversity = [
        "country_nunique", "dir_nunique", "method_nunique",
        "top_country_share", "endpoint_entropy",
    ]
    time_cols = [
        "hour", "weekday", "is_weekend",
        "sin_hour", "cos_hour", "sin_weekday", "cos_weekday",
    ]

    selected = [
        "timestamp",
        *target_cols,
        *current_window,
        *quality,
        *rates,
        *url_path,
        *diversity,
        *time_cols,
        gap_flag_col,
    ]
    print(f"select {len(selected)} columns")
    return ordered[selected]
def preprocess(df:pd.DataFrame,):
    """Copy ``df``, add the cyclical time features, then select the model columns."""
    return df.copy().pipe(create_time_features).pipe(select_features)
df_1m_preprocessed=preprocess(df_1m_no_missing)
df_5m_preprocessed=preprocess(df_5m_no_missing)
df_15m_preprocessed=preprocess(df_15m_no_missing)    
df_1m_preprocessed.head(5) 

select 28 columns
select 28 columns
select 28 columns


Unnamed: 0,timestamp,y_req_t1,y_bytes_imp_t1,y_req,y_bytes_imp,bytes_missing_rate,bytes_all_missing,error_rate,server_error_rate,redirection_rate,...,top_country_share,endpoint_entropy,hour,weekday,is_weekend,sin_hour,cos_hour,sin_weekday,cos_weekday,flag_data_gap
0,1995-07-01 00:00:00,61.0,910.128,42,608.453,0.0,0,0.0,0.0,0.071429,...,0.047619,2.884403,0,5,1,0.0,1.0,-0.974928,-0.222521,0
1,1995-07-01 00:01:00,57.0,628.556,61,910.128,0.016393,0,0.016393,0.0,0.04918,...,0.065574,3.411756,0,5,1,0.0,1.0,-0.974928,-0.222521,0
2,1995-07-01 00:02:00,71.0,1747.389,57,628.556,0.035088,0,0.0,0.0,0.192982,...,0.035088,3.715645,0,5,1,0.0,1.0,-0.974928,-0.222521,0
3,1995-07-01 00:03:00,70.0,1383.149,71,1747.389,0.014085,0,0.0,0.0,0.169014,...,0.014085,3.601396,0,5,1,0.0,1.0,-0.974928,-0.222521,0
4,1995-07-01 00:04:00,54.0,933.859,70,1383.149,0.0,0,0.0,0.0,0.057143,...,0.014286,3.079143,0,5,1,0.0,1.0,-0.974928,-0.222521,0


In [24]:
df_1m_preprocessed.columns

Index(['timestamp', 'y_req_t1', 'y_bytes_imp_t1', 'y_req', 'y_bytes_imp',
       'bytes_missing_rate', 'bytes_all_missing', 'error_rate',
       'server_error_rate', 'redirection_rate', 'dynamic_rate',
       'commercial_rate', 'unknown_rate', 'avg_url_len', 'avg_path_depth',
       'country_nunique', 'dir_nunique', 'method_nunique', 'top_country_share',
       'endpoint_entropy', 'hour', 'weekday', 'is_weekend', 'sin_hour',
       'cos_hour', 'sin_weekday', 'cos_weekday', 'flag_data_gap'],
      dtype='str')

In [25]:
def ml_data(df ,freq:str):
    """Add lag and rolling-window features, then drop unusable rows.

    ``freq`` must be ``"1m"``, ``"5m"`` or ``"15m"`` and selects the lag
    offsets and rolling window sizes (both counted in rows). For each base
    signal present in ``df``:

    * ``<col>_lag<k>`` is the value ``k`` rows earlier;
    * ``<col>_rmean<w>`` / ``<col>_rstd<w>`` are the rolling mean and standard
      deviation over the previous ``w`` rows, excluding the current row.

    Afterwards rows flagged as data gaps, rows with ``bytes_missing_rate``
    above 0.2, and rows containing any NaN are removed. The input frame is
    not modified.
    """
    cfg={
        "1m":{"lags":(1,2,3,5,15),"windows":(5,15,30)},
        "5m":{"lags":(1,2,3,6),"windows":(6,12,24)},
        "15m":{"lags":(1,2,3),"windows":(3,6,12)}
    }
    settings = cfg[freq]
    out = df.copy()

    signal_names = (
        "y_req", "y_bytes_imp", "bytes_missing_rate",
        "error_rate", "top_country_share", "endpoint_entropy",
    )
    # Signals absent from the frame are skipped silently.
    base_cols = [name for name in signal_names if name in out.columns]

    # Lag features
    for name in base_cols:
        for offset in settings["lags"]:
            out[f"{name}_lag{offset}"] = out[name].shift(offset)

    # Rolling mean/std over past rows only (shifted by one row first)
    for name in base_cols:
        past = out[name].shift(1)
        for width in settings["windows"]:
            out[f"{name}_rmean{width}"] = past.rolling(width, min_periods=1).mean()
            out[f"{name}_rstd{width}"] = past.rolling(width, min_periods=2).std()

    # Drop gap rows and rows with too many missing byte counts, then incomplete rows
    usable = out["flag_data_gap"].eq(0) & out["bytes_missing_rate"].le(0.2)
    return out.loc[usable].dropna()


In [26]:
# Each model table must be built from the frame aggregated at the matching
# resolution; feeding the 1-minute frame to the "5m"/"15m" configurations would
# write 1-minute data into the 5- and 15-minute training files.
df_1m_ml=ml_data(df_1m_preprocessed,"1m")
df_5m_ml=ml_data(df_5m_preprocessed,"5m")
df_15m_ml=ml_data(df_15m_preprocessed,"15m")

In [27]:
df_1m_ml

Unnamed: 0,timestamp,y_req_t1,y_bytes_imp_t1,y_req,y_bytes_imp,bytes_missing_rate,bytes_all_missing,error_rate,server_error_rate,redirection_rate,...,top_country_share_rmean15,top_country_share_rstd15,top_country_share_rmean30,top_country_share_rstd30,endpoint_entropy_rmean5,endpoint_entropy_rstd5,endpoint_entropy_rmean15,endpoint_entropy_rstd15,endpoint_entropy_rmean30,endpoint_entropy_rstd30
15,1995-07-01 00:15:00,82.0,1172.487,5,44.218,0.000000,0,0.000000,0.0,0.000000,...,0.065682,0.052327,0.065682,0.052327,3.444925,0.262215,3.352250,0.410140,3.352250,0.410140
16,1995-07-01 00:16:00,52.0,1602.189,82,1172.487,0.012195,0,0.000000,0.0,0.158537,...,0.075841,0.062393,0.074077,0.060689,3.139077,0.729177,3.288085,0.542494,3.262855,0.533727
17,1995-07-01 00:17:00,63.0,777.872,52,1602.189,0.019231,0,0.000000,0.0,0.096154,...,0.072282,0.064507,0.070437,0.060648,3.059825,0.661914,3.285540,0.541961,3.269368,0.517476
18,1995-07-01 00:18:00,80.0,962.982,63,777.872,0.063492,0,0.063492,0.0,0.063492,...,0.072507,0.064373,0.068660,0.059318,3.230637,0.766875,3.299393,0.556210,3.305706,0.525164
19,1995-07-01 00:19:00,80.0,1377.856,80,962.982,0.012500,0,0.000000,0.0,0.112500,...,0.075801,0.062404,0.068388,0.057659,3.375511,0.858073,3.330928,0.587106,3.346164,0.539978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76313,1995-08-22 23:53:00,39.0,259.912,19,25.754,0.052632,0,0.052632,0.0,0.052632,...,0.170214,0.154167,0.186233,0.137018,3.074939,0.319537,2.965468,0.350851,2.845725,0.478232
76314,1995-08-22 23:54:00,13.0,319.726,39,259.912,0.000000,0,0.000000,0.0,0.025641,...,0.178446,0.158662,0.193426,0.137995,3.041129,0.334451,2.915376,0.302663,2.826203,0.464740
76315,1995-08-22 23:55:00,21.0,436.101,13,319.726,0.000000,0,0.000000,0.0,0.000000,...,0.152235,0.143313,0.192105,0.139195,2.984300,0.282020,2.940220,0.301253,2.838462,0.466970
76316,1995-08-22 23:56:00,29.0,334.910,21,436.101,0.000000,0,0.000000,0.0,0.000000,...,0.161823,0.143445,0.193915,0.139338,2.783714,0.706211,2.863898,0.467782,2.808668,0.518872


In [28]:
df_1m_ml.columns

Index(['timestamp', 'y_req_t1', 'y_bytes_imp_t1', 'y_req', 'y_bytes_imp',
       'bytes_missing_rate', 'bytes_all_missing', 'error_rate',
       'server_error_rate', 'redirection_rate', 'dynamic_rate',
       'commercial_rate', 'unknown_rate', 'avg_url_len', 'avg_path_depth',
       'country_nunique', 'dir_nunique', 'method_nunique', 'top_country_share',
       'endpoint_entropy', 'hour', 'weekday', 'is_weekend', 'sin_hour',
       'cos_hour', 'sin_weekday', 'cos_weekday', 'flag_data_gap', 'y_req_lag1',
       'y_req_lag2', 'y_req_lag3', 'y_req_lag5', 'y_req_lag15',
       'y_bytes_imp_lag1', 'y_bytes_imp_lag2', 'y_bytes_imp_lag3',
       'y_bytes_imp_lag5', 'y_bytes_imp_lag15', 'bytes_missing_rate_lag1',
       'bytes_missing_rate_lag2', 'bytes_missing_rate_lag3',
       'bytes_missing_rate_lag5', 'bytes_missing_rate_lag15',
       'error_rate_lag1', 'error_rate_lag2', 'error_rate_lag3',
       'error_rate_lag5', 'error_rate_lag15', 'top_country_share_lag1',
       'top_country_share_

In [29]:
# Persist the model-ready tables. The output folder is created first so the
# export also works on a fresh checkout where it does not exist yet.
model_dir = Path("../../DATAFLOW_2026_UET.EPOCH_0_AUTOSCALING_ANALYSIS/data/model_ml")
model_dir.mkdir(parents=True, exist_ok=True)
for freq_label, table in (("1m", df_1m_ml), ("5m", df_5m_ml), ("15m", df_15m_ml)):
    table.to_parquet(model_dir / f"train_{freq_label}.parquet")