In [1]:
import json

# Read the exported traffic-feature dump (UTF-8 encoded JSON) fully into memory.
with open("traffic_features_v1.json", encoding="utf-8") as fh:
    data = json.loads(fh.read())

# Show which container type the JSON root was parsed into.
type(data)

dict

In [2]:
# The JSON root was parsed into a dict, so this counts its top-level keys.
len(data)

3

In [3]:
# Preview the export without rendering every record: keep all top-level
# entries as they are, but cut the "data" list down to its first record.
# The slice (instead of [0]) keeps this safe for an export with no records.
{**data, "data": data["data"][:1]}

{'meta': {'exportedAt': '2026-01-21T08:37:51.697Z',
  'featureDim': 14,
  'labeling': 'weak_label_rules_v1 (0=benign, 1=suspicious)'},
 'stats': {'total': 4046,
  'kept': 506,
  'suspicious': 67,
  'benign': 439,
  'skipped': 3540},
 'data': [{'ts': '2026-01-21T07:17:55.124Z',
   'url': 'https://www.naver.com/nvhaproxy/v2/pc/lazy?blockCodeList=PC-MEDIA-WRAPPER&target=PC-MEDIA-NEWS',
   'mime': 'text/html',
   'status': 200,
   'features': [178576,
    1,
    178576,
    0.15913112624316816,
    0.5289792581309919,
    0.2917413314219156,
    5.890366837148915,
    0,
    0,
    0,
    0,
    0,
    0,
    0],
   'label': 1,
   'ruleHits': {'hit_keywords': False,
    'hit_entropy_len': False,
    'hit_symbol_obf': True,
    'hit_iframe_script': False,
    'kwSum': 0},
   'bodySample': '{"PC-MEDIA-WRAPPER":{"blocks":[{"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":null,"materials":[{"@type":"MATERIAL-PC-NEWS-ONELINE","gdid":null,"title":"국힘 \\"李대통령 회견, 노골적인 허위사실 유포…국민에 거짓말\\"","ur'}

# 데이터의 핵심 포인트 요약

* 샘플 단위 : HTTP 트래픽 1건
* feature : 숫자 벡터(길이 14)
* label : 있음 (0 = benign, 1 = suspicious)
* 샘플 수 : 506
* 클래스 : 불균형 (label 1 ≈ 13%, 67/506)

-> 이진 분류 문제로 확정


* features -> X
* label -> y

In [4]:
import pandas as pd

import numpy as np

records = data["data"]

# Assemble the design matrix X (one feature vector per record) and the target
# vector y (one weak label per record) directly as NumPy arrays.
X = np.array([rec["features"] for rec in records])
y = np.array([rec["label"] for rec in records])

# Fail fast on a malformed export instead of letting a bad array reach the
# train/test split and the models further down.
assert X.ndim == 2, f"expected a 2-D feature matrix, got shape {X.shape}"

# The export declares its own feature dimension under meta.featureDim; when
# that entry is present, the matrix width has to agree with it.
declared_dim = data.get("meta", {}).get("featureDim")
if declared_dim is not None:
    assert X.shape[1] == declared_dim, (
        f"feature width {X.shape[1]} != declared featureDim {declared_dim}"
    )

# The labeling scheme is binary, so any other value means a corrupted record.
assert np.isin(y, (0, 1)).all(), "labels must be binary (0 or 1)"

X.shape, y.shape


((506, 14), (506,))

In [5]:
# Sanity-check the feature columns: look for features whose value range is
# very large, and for columns that contain nothing but zeros.
# Observation from this run: some value ranges are rather large, so bringing
# the features onto a common scale looks worthwhile.
pd.DataFrame(data=X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,68894.43,50.274704,14746.420451,0.064632,0.39746,0.462545,4.970667,0.081028,0.019763,0.369565,0.051383,0.142292,3.252964,0.045455
std,296477.2,199.022695,54939.948452,0.08538,0.221456,0.199621,0.482472,0.482727,0.187756,3.076545,0.276699,1.173769,14.944493,0.332868
min,55.0,1.0,9.625,0.004966,0.084592,0.06857,4.047908,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,182.5,2.0,26.855918,0.021799,0.177807,0.284964,4.60124,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,304.0,8.0,40.375,0.039576,0.318182,0.535527,4.83545,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17458.0,8.0,3643.9,0.065382,0.615913,0.683544,5.343938,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4157050.0,2154.0,639555.0,0.524438,0.774006,0.743295,6.288308,9.0,3.0,47.0,2.0,15.0,90.0,4.0


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. The split is stratified on the
# label and uses a fixed seed so that it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
# Standardization

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline trained on the standardized features, with
# balanced class weights and a fixed seed.
model = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)

model.fit(X=X_train_scaled, y=y_train)


In [9]:
# Performance evaluation

from sklearn.metrics import classification_report, confusion_matrix

# Predict on the standardized test split, then print the confusion matrix
# followed by the per-class report.
y_pred = model.predict(X_test_scaled)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


[[82  6]
 [ 1 13]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96        88
           1       0.68      0.93      0.79        14

    accuracy                           0.93       102
   macro avg       0.84      0.93      0.87       102
weighted avg       0.95      0.93      0.94       102



# RandomForest 학습 코드
* 비선형 패턴까지 포함하면 성능이 더 좋아지는지
* Logistic Regression 대비 recall이 유지 또는 개선되는지
* feature importance로 해석력 확보 

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 300 trees, unlimited depth, at least 2 samples per leaf,
# balanced class weights and a fixed seed, using all available cores.
rf = RandomForestClassifier(
    n_estimators=300, max_depth=None, min_samples_leaf=2,
    class_weight="balanced",
    n_jobs=-1, random_state=42,
)

# Note: the forest is fitted on the unscaled training features.
rf.fit(X=X_train, y=y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [24]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the unscaled test split, matching the unscaled features the
# forest was fitted on.
y_pred_rf = rf.predict(X_test)

print(
    confusion_matrix(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf, digits=3),
    sep="\n",
)

# Reading of the confusion matrix from the recorded run (actual -> predicted):
#   benign    -> benign    : 88
#   benign    -> malicious : 0
#   malicious -> benign    : 2
#   malicious -> malicious : 12

[[88  0]
 [ 2 12]]
              precision    recall  f1-score   support

           0      0.978     1.000     0.989        88
           1      1.000     0.857     0.923        14

    accuracy                          0.980       102
   macro avg      0.989     0.929     0.956       102
weighted avg      0.981     0.980     0.980       102



In [22]:
import pandas as pd

# Rank the forest's feature importances from highest to lowest, labelling
# each feature by its column position in X (f0, f1, ...).
feature_importance = (
    pd.Series(data=rf.feature_importances_, index=[f"f{i}" for i in range(X.shape[1])])
    .sort_values(ascending=False)
)

feature_importance


f0     0.213687
f9     0.158267
f6     0.146026
f2     0.141517
f4     0.109628
f5     0.095822
f3     0.047226
f12    0.023664
f1     0.022343
f10    0.018113
f11    0.013407
f7     0.009723
f8     0.000348
f13    0.000229
dtype: float64

- 본 연구에서는 웹 트래픽에서 추출된 사전 정의 feature를 활용하였으며,
개별 feature의 계산식보다는 트래픽 행동을 요약하는 지표로서의 역할에
초점을 맞추어 해석을 수행하였다.
특히 RandomForest의 feature importance 분석을 통해
실제 분류에 기여한 feature들을 중심으로
사후적인 의미 해석(post-hoc interpretation)을 진행하였다.