In [2]:
import pandas as pd
import numpy as np
import os
import zipfile

# Locations, relative to the notebook's working directory:
#   zip_path          - archive holding the processed CSV files
#   folder_extract    - scratch directory the archive is unpacked into
#   folder_clean_data - directory that receives the cleaned output
zip_path = '../data/processed/data_processed.zip'
folder_extract = '../data/temp_extracted'
folder_clean_data = '../data/cleaned'

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(folder_clean_data, exist_ok=True)

# Unpack every member of the archive into the scratch directory.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(folder_extract)

In [3]:
# Target dtype for every column of the raw log table, in column order.
dtype_cols_raw_log = {
    'host': str,
    'timestamp': 'datetime64[ns]',
    'request': str,
    'response': str,
    'bytes': 'int64',
}

# Column names, in the same order as the dtype mapping above.
cols_raw_log = list(dtype_cols_raw_log)

In [4]:
# Core "dd/Mon/yyyy:HH:MM:SS" part of an access-log timestamp; anything around
# it (brackets, a trailing "-0400" offset, stray characters) is ignored.
_LOG_TIMESTAMP_PATTERN = r'(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2})'
_LOG_TIMESTAMP_FORMAT = '%d/%b/%Y:%H:%M:%S'
# Layout of already-normalised timestamps such as "1995-07-01 00:00:01".
_ISO_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'


def _parse_timestamps(raw):
    """Return `raw` as a naive datetime Series; unparseable entries become NaT."""
    # A column that is already datetime64 must be passed through untouched:
    # stringifying it would no longer match the log pattern and wipe every value.
    if pd.api.types.is_datetime64_dtype(raw):
        return raw

    text = raw.astype(str).str.strip()

    # First try the raw access-log layout, extracting only the date/time core.
    log_part = text.str.extract(_LOG_TIMESTAMP_PATTERN, expand=False)
    parsed = pd.to_datetime(log_part, format=_LOG_TIMESTAMP_FORMAT, errors='coerce')

    # Whatever did not match may already be in ISO form (e.g. a re-loaded
    # cleaned CSV); parse those with an explicit format, leaving the rest NaT.
    unparsed = parsed.isna()
    if unparsed.any():
        iso = pd.to_datetime(
            text.where(unparsed), format=_ISO_TIMESTAMP_FORMAT, errors='coerce'
        )
        parsed = parsed.fillna(iso)
    return parsed


def apply_data_type(df_logs, dtypes=None):
    """Return a typed copy of a raw log frame.

    The input frame is left unmodified, and applying the function to its own
    output yields the same result.

    Parameters
    ----------
    df_logs : pandas.DataFrame
        Frame with at least the 'timestamp' and 'bytes' columns, plus every
        column named in `dtypes`.
    dtypes : dict, optional
        Column -> dtype mapping passed to `DataFrame.astype`. Defaults to the
        notebook-level `dtype_cols_raw_log`.

    Returns
    -------
    pandas.DataFrame
        New frame in which 'timestamp' is parsed to datetimes (NaT where it
        cannot be parsed), 'bytes' is numeric with -1 standing in for
        missing/non-numeric values, and all columns are cast per `dtypes`.

    Raises
    ------
    KeyError
        If 'timestamp', 'bytes', or a column named in `dtypes` is absent.
    """
    if dtypes is None:
        dtypes = dtype_cols_raw_log

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    typed = df_logs.assign(
        timestamp=_parse_timestamps(df_logs['timestamp']),
        # Non-numeric byte counts (e.g. "-") are coerced to NaN, then to -1.
        bytes=pd.to_numeric(df_logs['bytes'], errors='coerce').fillna(-1),
    )
    return typed.astype(dtypes)

In [5]:
def load_single_csv(file_path):
    """Read one CSV file and return it as a typed DataFrame.

    The first CSV column is consumed as the index on read and discarded
    afterwards, so the returned frame carries a fresh 0..n-1 index. Column
    typing is delegated to `apply_data_type`.

    Returns None (after printing an error message) when `file_path` does not
    exist.
    """
    if os.path.exists(file_path):
        typed_frame = apply_data_type(
            pd.read_csv(file_path, index_col=0)
        ).reset_index(drop=True)

        print(f"Đã xử lý xong: {file_path} | Số dòng: {len(typed_frame)}")
        return typed_frame

    # Missing file: report it and signal the failure with None.
    print(f"Lỗi: Không tìm thấy file tại {file_path}")
    return None

In [24]:
# Build the paths of the two extracted CSV files.
path_train, path_test = (
    os.path.join(folder_extract, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Load and type each split (train first, then test).
df_train, df_test = (
    load_single_csv(csv_path) for csv_path in (path_train, path_test)
)


Đã xử lý xong: ../data/temp_extracted\train.csv | Số dòng: 2934930
Đã xử lý xong: ../data/temp_extracted\test.csv | Số dòng: 526644


In [25]:
# Materialise the row number as an explicit 'index' column.
# Guarded so the cell is idempotent: an unguarded second run would have
# reset_index() insert an extra 'level_0' column (because 'index' is already
# taken), and that column would then be written to the cleaned CSV.
if 'index' not in df_train.columns:
    df_train = df_train.reset_index()
if 'index' not in df_test.columns:
    df_test = df_test.reset_index()

In [26]:
# Display the typed training frame (the notebook repr elides the middle rows).
df_train

Unnamed: 0,index,host,timestamp,request,response,bytes
0,0,199.72.81.55,1995-07-01 00:00:01,GET /history/apollo/ HTTP/1.0,200,6245
1,1,unicomp6.unicomp.net,1995-07-01 00:00:06,GET /shuttle/countdown/ HTTP/1.0,200,3985
2,2,199.120.110.21,1995-07-01 00:00:09,GET /shuttle/missions/sts-73/mission-sts-73.ht...,200,4085
3,3,burger.letters.com,1995-07-01 00:00:11,GET /shuttle/countdown/liftoff.html HTTP/1.0,304,0
4,4,199.120.110.21,1995-07-01 00:00:11,GET /shuttle/missions/sts-73/sts-73-patch-smal...,200,4179
...,...,...,...,...,...,...
2934925,2934925,sfsp129.slip.net,1995-08-22 23:59:55,GET /images/ksclogo-medium.gif HTTP/1.0,200,5866
2934926,2934926,sfsp129.slip.net,1995-08-22 23:59:57,GET /images/NASA-logosmall.gif HTTP/1.0,200,786
2934927,2934927,sfsp129.slip.net,1995-08-22 23:59:57,GET /images/MOSAIC-logosmall.gif HTTP/1.0,200,363
2934928,2934928,sfsp129.slip.net,1995-08-22 23:59:58,GET /images/USA-logosmall.gif HTTP/1.0,200,234


In [27]:
# Preview the first rows of the typed test frame.
df_test.head()

Unnamed: 0,index,host,timestamp,request,response,bytes
0,0,ix-mia1-02.ix.netcom.com,1995-08-23 00:00:00,GET /ksc.html HTTP/1.0,200,7087
1,1,internet-gw.watson.ibm.com,1995-08-23 00:00:05,GET /history/apollo/pad-abort-test-2/pad-abort...,200,1292
2,2,ix-mia1-02.ix.netcom.com,1995-08-23 00:00:06,GET /images/ksclogo-medium.gif HTTP/1.0,200,5866
3,3,internet-gw.watson.ibm.com,1995-08-23 00:00:08,GET /history/apollo/pad-abort-test-2/pad-abort...,200,1625
4,4,internet-gw.watson.ibm.com,1995-08-23 00:00:10,GET /history/apollo/pad-abort-test-2/pad-abort...,404,0


In [29]:
# Write both splits into the cleaned-data directory. The path is derived from
# folder_clean_data (the directory created in the setup cell) rather than
# repeated as a literal, so the two cannot drift apart.
# index=False: the row number is already stored in the explicit 'index' column.
df_train.to_csv(os.path.join(folder_clean_data, 'train.csv'), index=False)
df_test.to_csv(os.path.join(folder_clean_data, 'test.csv'), index=False)

In [30]:
# Bundle the cleaned CSV files into one deflate-compressed archive.
# Paths are built from folder_clean_data instead of a literal that embeds the
# repository folder name, so the cell works under any checkout name and always
# archives the files written to the cleaned-data directory.
cleaned_zip_path = os.path.join(folder_clean_data, 'data_cleaned.zip')

with zipfile.ZipFile(cleaned_zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    for csv_name in ("train.csv", "test.csv"):
        # arcname stores the file at the archive root, without directories.
        z.write(os.path.join(folder_clean_data, csv_name), arcname=csv_name)