In [28]:
from fastai.tabular import *
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV,KFold, cross_val_score
import torch
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from scipy.sparse import hstack
from tqdm import tqdm
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
benign_data_path = '/content/drive/MyDrive/[졸프]데이터_csv/benign_data_1030.csv'
malicious_data_path = '/content/drive/MyDrive/[졸프]데이터_csv/malicious_data_1030.csv'

benign_data = pd.read_csv(benign_data_path)
malicious_data = pd.read_csv(malicious_data_path)

In [4]:
data = pd.concat([benign_data, malicious_data],axis=0, ignore_index=True)

In [5]:
data_fillna = data.fillna(-1)

In [6]:
X = data_fillna[['url_length','character_count','https','ttl', 'ns_domain_match', 'IP/email', 'short_url','file_extension']]
y = data_fillna['label']

In [7]:
additional_features = X[['ttl']].values
scaler = StandardScaler()
additional_features = scaler.fit_transform(additional_features)
# additional_features를 DataFrame으로 변환 (열 이름 지정)
additional_features_df = pd.DataFrame(additional_features, columns=['ttl_scaled'])

# X와 additional_features_df를 수평으로 결합
X_combined = pd.concat([X, additional_features_df], axis=1)

# 결과 확인
print(X_combined.head())

   url_length  character_count  https      ttl  ns_domain_match  IP/email  \
0          22                3      1    600.0              0.0         0   
1          18                3      1    300.0              0.0         0   
2          19                3      1  86400.0              0.0         0   
3          17                3      1    300.0              0.0         0   
4          17                3      1    300.0              0.0         0   

   short_url  file_extension  ttl_scaled  
0          0               0   -0.194590  
1          0               0   -0.209532  
2          0               0    4.078992  
3          0               0   -0.209532  
4          0               0   -0.209532  


In [8]:
X_combined.drop(columns=['ttl'], inplace=True)

In [15]:
X_combined

Unnamed: 0,url_length,character_count,https,ns_domain_match,IP/email,short_url,file_extension,ttl_scaled
0,22,3,1,0.0,0,0,0,-0.194590
1,18,3,1,0.0,0,0,0,-0.209532
2,19,3,1,0.0,0,0,0,4.078992
3,17,3,1,0.0,0,0,0,-0.209532
4,17,3,1,0.0,0,0,0,-0.209532
...,...,...,...,...,...,...,...,...
243769,44,10,0,-1.0,0,0,1,-0.224525
243770,38,8,0,-1.0,1,0,1,-0.224525
243771,31,5,0,-1.0,0,0,1,-0.224525
243772,92,13,1,-1.0,0,1,1,-0.164854


In [17]:
y

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
243769,1
243770,1
243771,1
243772,1


#### 기울기, 편향 계산
  - Gradient -> 모델이 얼마나 정확하게 예측했는지를 보여주는 지표, 기울기가 작을수록 모델이 더 정확
  - Bias -> 편향은 모델이 예측한 값과 실제 값의 차이의 평균값. 편향이 0에 가까운 값일수록, 모델이 실제값을 잘 예측

In [69]:
# 기울기와 편향 계산 함수
def calculate_bias_and_gradient(y_true, y_pred):
    # 잔차 계산 (Residuals)
    residuals = y_true - y_pred

    # 기울기: 평균 절대 오차 (MAE, Mean Absolute Error)
    mae = np.mean(np.abs(residuals))

    # 편향: 예측값과 실제값의 차이의 평균
    bias = np.mean(residuals)

    return mae, bias

### 교차검증


StratifiedKFold

In [24]:
# Stratified K-Fold
skf = StratifiedKFold(n_splits=5)


X_combined_numpy = np.array(X_combined)  # NumPy 배열로 변환
y_numpy = np.array(y)

#### 모델정의

In [25]:
#로지스틱
model = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')
#결정트리
model_dt = DecisionTreeClassifier(random_state=42)
#랜덤포레스트
model_rf = RandomForestClassifier(random_state=42)


#### 교차검증

In [29]:
scores_logi = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_dt = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_rf = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}

In [70]:
for train_idx, test_idx in skf.split(X_combined_numpy, y_numpy):
    X_train, X_test = X_combined_numpy[train_idx], X_combined_numpy[test_idx]
    y_train, y_test = y_numpy[train_idx], y_numpy[test_idx]

    # 로지스틱 회귀 모델
    model.fit(X_train, y_train)
    y_pred_logi = model.predict(X_test)
    y_pred_proba_logi = model.predict_proba(X_test)[:, 1]  # 확률 예측 (ROC-AUC용)

    scores_logi["accuracy"].append(accuracy_score(y_test, y_pred_logi))
    scores_logi["precision"].append(precision_score(y_test, y_pred_logi))
    scores_logi["recall"].append(recall_score(y_test, y_pred_logi))
    scores_logi["f1"].append(f1_score(y_test, y_pred_logi))
    scores_logi["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_logi))

    # 기울기와 편향 계산
    mae_logi, bias_logi = calculate_bias_and_gradient(y_test, y_pred_logi)
    print(f"로지스틱 회귀 모델 - 기울기: {mae_logi:.4f}, 편향: {bias_logi:.4f}")






    # 결정 트리 모델
    model_dt.fit(X_train, y_train)
    y_pred_dt = model_dt.predict(X_test)
    y_pred_proba_dt = model_dt.predict_proba(X_test)[:, 1]

    scores_dt["accuracy"].append(accuracy_score(y_test, y_pred_dt))
    scores_dt["precision"].append(precision_score(y_test, y_pred_dt))
    scores_dt["recall"].append(recall_score(y_test, y_pred_dt))
    scores_dt["f1"].append(f1_score(y_test, y_pred_dt))
    scores_dt["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_dt))

    # 기울기와 편향 계산
    mae_dt, bias_dt = calculate_bias_and_gradient(y_test, y_pred_dt)
    print(f"결정 트리 모델 - 기울기: {mae_dt:.4f}, 편향: {bias_dt:.4f}")







    # 랜덤 포레스트 모델
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    y_pred_proba_rf = model_rf.predict_proba(X_test)[:, 1]

    scores_rf["accuracy"].append(accuracy_score(y_test, y_pred_rf))
    scores_rf["precision"].append(precision_score(y_test, y_pred_rf))
    scores_rf["recall"].append(recall_score(y_test, y_pred_rf))
    scores_rf["f1"].append(f1_score(y_test, y_pred_rf))
    scores_rf["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_rf))

    # 기울기와 편향 계산
    mae_rf, bias_rf = calculate_bias_and_gradient(y_test, y_pred_rf)
    print(f"랜덤 포레스트 모델 - 기울기: {mae_rf:.4f}, 편향: {bias_rf:.4f}")



# 평균 점수 계산 및 출력
def print_scores(model_name, scores):
    print(f"\n{model_name} 교차 검증 점수:")
    for metric, values in scores.items():
        print(f"{metric.capitalize()} 평균: {sum(values) / len(values):.4f}")

print_scores("로지스틱 회귀", scores_logi)
print_scores("결정 트리", scores_dt)
print_scores("랜덤 포레스트", scores_rf)

로지스틱 회귀 모델 - 기울기: 0.0108, 편향: -0.0039
결정 트리 모델 - 기울기: 0.0082, 편향: 0.0035
랜덤 포레스트 모델 - 기울기: 0.0070, 편향: 0.0022
로지스틱 회귀 모델 - 기울기: 0.0379, 편향: -0.0256
결정 트리 모델 - 기울기: 0.0193, 편향: -0.0065
랜덤 포레스트 모델 - 기울기: 0.0198, 편향: -0.0081
로지스틱 회귀 모델 - 기울기: 0.0249, 편향: -0.0022
결정 트리 모델 - 기울기: 0.0129, 편향: 0.0056
랜덤 포레스트 모델 - 기울기: 0.0125, 편향: 0.0050
로지스틱 회귀 모델 - 기울기: 0.0229, 편향: -0.0088
결정 트리 모델 - 기울기: 0.0126, 편향: 0.0038
랜덤 포레스트 모델 - 기울기: 0.0121, 편향: 0.0031
로지스틱 회귀 모델 - 기울기: 0.1284, 편향: 0.1091
결정 트리 모델 - 기울기: 0.1012, 편향: 0.0935
랜덤 포레스트 모델 - 기울기: 0.0923, 편향: 0.0845

로지스틱 회귀 교차 검증 점수:
Accuracy 평균: 0.9550
Precision 평균: 0.9588
Recall 평균: 0.9239
F1 평균: 0.9367
Roc_auc 평균: 0.9908

결정 트리 교차 검증 점수:
Accuracy 평균: 0.9692
Precision 평균: 0.9854
Recall 평균: 0.9342
F1 평균: 0.9564
Roc_auc 평균: 0.9628

랜덤 포레스트 교차 검증 점수:
Accuracy 평균: 0.9713
Precision 평균: 0.9847
Recall 평균: 0.9403
F1 평균: 0.9598
Roc_auc 평균: 0.9871


In [37]:

# 참고)
# Accuracy: 전체 예측 중에서 맞춘 비율, 데이터 불균형 상황에서는 부적합. (다수 클래스의 비율이 높을수록 왜곡됨)
# Precision: 양성 클래스로 예측한 것 중 실제로 양성 클래스인 비율. FP를 최소화하는 데 초점.
# Recall: 실제 양성인 데이터 중에서 모델이 양성으로 정확히 예측한 비율. FN를 최소화하는 데 초점.
# F1 score: Precision과 Recall의 조화 평균. 데이터 불균형 상황에서 적합.
# ROC-AUC: TPR (True Positive Rate, Recall)과 FPR (False Positive Rate) 간의 관계를 나타내는 곡선 아래 면적.
#           이진 분류에서 모델의 판별 능력을 측정. 모델의 분류 능력을 평가하는 종합적인 지표.



### 다운샘플링

##### 결측값 -1로 처리

In [39]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [40]:
undersampler = RandomUnderSampler(random_state=42)

In [41]:
X_resampled, y_resampled = undersampler.fit_resample(X_combined, y)

In [42]:
print(f"샘플링 전 클래스 분포: {Counter(y)}")
print(f"샘플링 후 클래스 분포: {Counter(y_resampled)}")

샘플링 전 클래스 분포: Counter({0: 149707, 1: 94067})
샘플링 후 클래스 분포: Counter({0: 94067, 1: 94067})


In [46]:
X_resampled_numpy = np.array(X_resampled)  # NumPy 배열로 변환
y_resampled_numpy = np.array(y_resampled)

In [47]:
scores_logi_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_dt_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_rf_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}

In [48]:
for train_idx, test_idx in skf.split(X_resampled_numpy, y_resampled_numpy):
    X_train, X_test = X_resampled_numpy[train_idx], X_resampled_numpy[test_idx]
    y_train, y_test = y_resampled_numpy[train_idx], y_resampled_numpy[test_idx]

    # 로지스틱 회귀 모델
    model.fit(X_train, y_train)
    y_pred_logi = model.predict(X_test)
    y_pred_proba_logi = model.predict_proba(X_test)[:, 1]  # 확률 예측 (ROC-AUC용)

    scores_logi_down["accuracy"].append(accuracy_score(y_test, y_pred_logi))
    scores_logi_down["precision"].append(precision_score(y_test, y_pred_logi))
    scores_logi_down["recall"].append(recall_score(y_test, y_pred_logi))
    scores_logi_down["f1"].append(f1_score(y_test, y_pred_logi))
    scores_logi_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_logi))

    # 결정 트리 모델
    model_dt.fit(X_train, y_train)
    y_pred_dt = model_dt.predict(X_test)
    y_pred_proba_dt = model_dt.predict_proba(X_test)[:, 1]

    scores_dt_down["accuracy"].append(accuracy_score(y_test, y_pred_dt))
    scores_dt_down["precision"].append(precision_score(y_test, y_pred_dt))
    scores_dt_down["recall"].append(recall_score(y_test, y_pred_dt))
    scores_dt_down["f1"].append(f1_score(y_test, y_pred_dt))
    scores_dt_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_dt))

    # 랜덤 포레스트 모델
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    y_pred_proba_rf = model_rf.predict_proba(X_test)[:, 1]

    scores_rf_down["accuracy"].append(accuracy_score(y_test, y_pred_rf))
    scores_rf_down["precision"].append(precision_score(y_test, y_pred_rf))
    scores_rf_down["recall"].append(recall_score(y_test, y_pred_rf))
    scores_rf_down["f1"].append(f1_score(y_test, y_pred_rf))
    scores_rf_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_rf))


# 평균 점수 계산 및 출력
def print_scores(model_name, scores):
    print(f"\n{model_name} 교차 검증 점수:")
    for metric, values in scores.items():
        print(f"{metric.capitalize()} 평균: {sum(values) / len(values):.4f}")

print_scores("로지스틱 회귀", scores_logi_down)
print_scores("결정 트리", scores_dt_down)
print_scores("랜덤 포레스트", scores_rf_down)


로지스틱 회귀 교차 검증 점수:
Accuracy 평균: 0.9505
Precision 평균: 0.9761
Recall 평균: 0.9237
F1 평균: 0.9450
Roc_auc 평균: 0.9914

결정 트리 교차 검증 점수:
Accuracy 평균: 0.9647
Precision 평균: 0.9911
Recall 평균: 0.9379
F1 평균: 0.9613
Roc_auc 평균: 0.9651

랜덤 포레스트 교차 검증 점수:
Accuracy 평균: 0.9670
Precision 평균: 0.9907
Recall 평균: 0.9429
F1 평균: 0.9642
Roc_auc 평균: 0.9888


##### 결측값 drop

In [52]:
data_drop=data.dropna()
data_drop_index = data_drop.reset_index(drop=True)

In [53]:
data_drop_index

Unnamed: 0,url,label,url_length,character_count,https,ttl,name_server,IP/email,short_url,file_extension,ns_domain_match
0,https://montpelier.org,0,22,3,1,600.0,"['ns01.domaincontrol.com', 'ns02.domaincontrol...",0,0,0,0.0
1,https://notion.com,0,18,3,1,300.0,"['woz.ns.cloudflare.com', 'dana.ns.cloudflare....",0,0,0,0.0
2,https://icanvas.com,0,19,3,1,86400.0,"['pdns01.domaincontrol.com', 'pdns02.domaincon...",0,0,0,0.0
3,https://alloy.com,0,17,3,1,300.0,"['jack.ns.cloudflare.com', 'nola.ns.cloudflare...",0,0,0,0.0
4,https://ynhhs.org,0,17,3,1,300.0,"['ns21.constellix.com', 'ns11.constellix.com',...",0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
170503,https://bitbucket.org/foxxlrep/repo/downloads/...,1,53,8,1,60.0,"['ns-1746.awsdns-26.co.uk', 'ns-1305.awsdns-35...",0,0,1,0.0
170504,https://github.com/delta-io/delta/files/150161...,1,58,10,1,56.0,"['dns1.p08.nsone.net', 'dns2.p08.nsone.net', '...",0,0,1,0.0
170505,https://uploaddeimagens.com.br/images/004/801/...,1,72,11,1,175.0,"['phil.ns.cloudflare.com', 'iris.ns.cloudflare...",0,0,1,0.0
170506,https://mussangroup.com/wp-content/images/pic6...,1,50,8,1,14400.0,"['ns1.veridyen.com', 'ns2.veridyen.com']",0,0,1,0.0


In [54]:
count_1 = data_drop_index['label'].value_counts()[1]
print("label이 1인 값의 개수:", count_1)

label이 1인 값의 개수: 29692


In [57]:
X_drop = data_drop_index[['url_length','character_count','https','ttl', 'ns_domain_match', 'IP/email', 'short_url','file_extension']]
y_drop = data_drop_index['label']

In [59]:
additional_features_drop = X_drop[['ttl']].values
scaler = StandardScaler()
additional_features_drop = scaler.fit_transform(additional_features_drop)
# additional_features를 DataFrame으로 변환 (열 이름 지정)
additional_features_df = pd.DataFrame(additional_features_drop, columns=['ttl_scaled'])

# X와 additional_features_df를 수평으로 결합
X_combined_drop = pd.concat([X_drop, additional_features_df], axis=1)

# 결과 확인
print(X_combined_drop)

        url_length  character_count  https      ttl  ns_domain_match  \
0               22                3      1    600.0              0.0   
1               18                3      1    300.0              0.0   
2               19                3      1  86400.0              0.0   
3               17                3      1    300.0              0.0   
4               17                3      1    300.0              0.0   
...            ...              ...    ...      ...              ...   
170503          53                8      1     60.0              0.0   
170504          58               10      1     56.0              0.0   
170505          72               11      1    175.0              0.0   
170506          50                8      1  14400.0              0.0   
170507          49                8      0   7199.0              0.0   

        IP/email  short_url  file_extension  ttl_scaled  
0              0          0               0   -0.196007  
1              0   

In [60]:
X_combined_drop.drop(columns=['ttl'], inplace=True)

In [61]:
X_combined_drop

Unnamed: 0,url_length,character_count,https,ns_domain_match,IP/email,short_url,file_extension,ttl_scaled
0,22,3,1,0.0,0,0,0,-0.196007
1,18,3,1,0.0,0,0,0,-0.209846
2,19,3,1,0.0,0,0,0,3.761863
3,17,3,1,0.0,0,0,0,-0.209846
4,17,3,1,0.0,0,0,0,-0.209846
...,...,...,...,...,...,...,...,...
170503,53,8,1,0.0,0,0,1,-0.220917
170504,58,10,1,0.0,0,0,1,-0.221101
170505,72,11,1,0.0,0,0,1,-0.215612
170506,50,8,1,0.0,0,0,1,0.440573


In [63]:
y_drop

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
170503,1
170504,1
170505,1
170506,1


In [64]:
X_resampled_drop, y_resampled_drop = undersampler.fit_resample(X_combined_drop, y_drop)

In [65]:
print(f"샘플링 전 클래스 분포: {Counter(y_drop)}")
print(f"샘플링 후 클래스 분포: {Counter(y_resampled_drop)}")

샘플링 전 클래스 분포: Counter({0: 140816, 1: 29692})
샘플링 후 클래스 분포: Counter({0: 29692, 1: 29692})


In [66]:
X_drop_numpy = np.array(X_resampled_drop)  # NumPy 배열로 변환
y_drop_numpy = np.array(y_resampled_drop)

In [67]:
scores_logi_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_dt_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
scores_rf_down = {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}

In [68]:
for train_idx, test_idx in skf.split(X_drop_numpy, y_drop_numpy):
    X_train, X_test = X_drop_numpy[train_idx], X_drop_numpy[test_idx]
    y_train, y_test = y_drop_numpy[train_idx], y_drop_numpy[test_idx]

    # 로지스틱 회귀 모델
    model.fit(X_train, y_train)
    y_pred_logi = model.predict(X_test)
    y_pred_proba_logi = model.predict_proba(X_test)[:, 1]  # 확률 예측 (ROC-AUC용)

    scores_logi_down["accuracy"].append(accuracy_score(y_test, y_pred_logi))
    scores_logi_down["precision"].append(precision_score(y_test, y_pred_logi))
    scores_logi_down["recall"].append(recall_score(y_test, y_pred_logi))
    scores_logi_down["f1"].append(f1_score(y_test, y_pred_logi))
    scores_logi_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_logi))

    # 결정 트리 모델
    model_dt.fit(X_train, y_train)
    y_pred_dt = model_dt.predict(X_test)
    y_pred_proba_dt = model_dt.predict_proba(X_test)[:, 1]

    scores_dt_down["accuracy"].append(accuracy_score(y_test, y_pred_dt))
    scores_dt_down["precision"].append(precision_score(y_test, y_pred_dt))
    scores_dt_down["recall"].append(recall_score(y_test, y_pred_dt))
    scores_dt_down["f1"].append(f1_score(y_test, y_pred_dt))
    scores_dt_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_dt))

    # 랜덤 포레스트 모델
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    y_pred_proba_rf = model_rf.predict_proba(X_test)[:, 1]

    scores_rf_down["accuracy"].append(accuracy_score(y_test, y_pred_rf))
    scores_rf_down["precision"].append(precision_score(y_test, y_pred_rf))
    scores_rf_down["recall"].append(recall_score(y_test, y_pred_rf))
    scores_rf_down["f1"].append(f1_score(y_test, y_pred_rf))
    scores_rf_down["roc_auc"].append(roc_auc_score(y_test, y_pred_proba_rf))


# 평균 점수 계산 및 출력
def print_scores(model_name, scores):
    print(f"\n{model_name} 교차 검증 점수:")
    for metric, values in scores.items():
        print(f"{metric.capitalize()} 평균: {sum(values) / len(values):.4f}")

print_scores("로지스틱 회귀", scores_logi_down)
print_scores("결정 트리", scores_dt_down)
print_scores("랜덤 포레스트", scores_rf_down)


로지스틱 회귀 교차 검증 점수:
Accuracy 평균: 0.9094
Precision 평균: 0.9034
Recall 평균: 0.9180
F1 평균: 0.9087
Roc_auc 평균: 0.9793

결정 트리 교차 검증 점수:
Accuracy 평균: 0.9647
Precision 평균: 0.9853
Recall 평균: 0.9435
F1 평균: 0.9638
Roc_auc 평균: 0.9732

랜덤 포레스트 교차 검증 점수:
Accuracy 평균: 0.9677
Precision 평균: 0.9848
Recall 평균: 0.9501
F1 평균: 0.9670
Roc_auc 평균: 0.9925
