# Валидация моделей

Рассмотри этапы валидации модели на примере задачи обнаружения вредоносных ссылок.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import FunctionTransformer
# from sklearn.compose import make_column_transformer
from sklearn import set_config
set_config(display="diagram")

from scipy.stats  import norm, ttest_ind
from scipy import stats

# from xgboost import XGBClassifier

import shap

import warnings
warnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))

import matplotlib.pyplot as plt
import seaborn as sns

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# malicious_urls_dataset_url = "https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/raw/master/data/data.csv"

# ds = pd.read_csv(malicious_urls_dataset_url)
ds = pd.read_csv('data.csv')
ds.head(10)

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad
5,toddscarwash.com,bad
6,tubemoviez.com,bad
7,ipl.hk,bad
8,crackspider.us/toolbar/install.php?pack=exe,bad
9,pos-kupang.com/,bad


In [3]:

ds['label'].value_counts()

label
good    344821
bad      75643
Name: count, dtype: int64

In [4]:
len(ds)

420464

In [5]:
ds_sample = ds.sample(frac=0.1, random_state=1)

In [6]:
ds_sample["label_enc"] = ds_sample.label.map({"bad": 1, "good": 0})

In [7]:
ds_sample.head(4)

Unnamed: 0,url,label,label_enc
155439,twitter.com/aQuariusrecOrds,good,0
152415,tong464.org/,good,0
276521,legacy.com/obituaries/orlandosentinel/obituary...,good,0
305692,natcath.org/ncr_onli.htm,good,0


In [8]:
feature_cols = "url"


target_col = "label_enc"

X = ds_sample[feature_cols]
y = ds_sample[target_col].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
url_vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(1, 1),
    # max_df=3,
    # min_df=1,
    # max_features=None,
)

rf_clf = RandomForestClassifier(
    n_estimators=35,
    max_depth=9,
    random_state=42,
    n_jobs=1,
)

pipe = Pipeline(steps=[
    ("vectorizer", url_vectorizer),
    # ("todense", FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ("classifier", rf_clf),
])
pipe

In [10]:
pipe.fit(X_train, y_train);
y_pred = pipe.predict(X_test)

P, R, F1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
roc_auc = roc_auc_score(y_test, y_pred)
print(f"P: {P:.2f}, R: {R:.2f}, F1: {F1:.5f}, AUC: {roc_auc:.5f}")

P: 0.95, R: 0.21, F1: 0.34645, AUC: 0.60471


## Упражнение 1

С помощью bootstrap постройте гистограмму распределения оценки $F_1$ на контрольной выборке.

Постройте теоретическое нормальное распределение на этой гистограмме.

Покажите среднее значение и стандартное отклонение.

In [None]:
np.random.seed(42)

bootstrap_iterations = 100

df = pd.DataFrame(data={
    "y_test": y_test,
    "y_pred": y_pred,
})

scores = pd.DataFrame(data={
    "F1": 0.0, "P": 0.0, "R": 0.0, "AUC": 0.0
}, index=range(bootstrap_iterations))

# Your code here

In [None]:
f1_mean = scores["F1"].mean()
f1_std = scores["F1"].std()


f1_low = f1_mean - 3 * f1_std
f1_upp = f1_mean + 3 * f1_std


# Your code here

## Подбор гипрепараметров

In [None]:
params_grid = {
    "classifier__n_estimators": range(50, 70, 10),
    "classifier__max_depth": range(15, 20),
}

random_search_res = RandomizedSearchCV(
    pipe,
    params_grid,
    n_iter=100,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1,
    refit=True,
    random_state=42,
).fit(X_train, y_train)

In [None]:
random_search_res.best_params_

In [None]:
y_pred_candidate = random_search_res.predict(X_test)
P_candidate, R_candidate, F1_candidate, _ = precision_recall_fscore_support(y_test, y_pred_candidate, average="binary")
print(f"P_candidate: {P_candidate:.2f}, R_candidate: {R_candidate:.2f}, F1_candidate: {F1_candidate:.5f}")

## Упражнение 2

1. Постройте с помощью booststrap выборку оценки $F_1$ для лучшей модели после подбора гиперпарамтеров.
2. Сравните с помощью t-теста базовую модель и кандидата. Примите $\alpha = 0.01$
3. Попробуйте расширить пространство гипрепараметров, чтобы поиск выдал лучший результат


In [None]:
df_cand = pd.DataFrame(data={
    "y_test": y_test,
    "y_pred": y_pred_candidate,
})

scores_cand = pd.DataFrame(data={
    "F1": 0.0, "P": 0.0, "R": 0.0, "AUC": 0.0
}, index=range(bootstrap_iterations))

# Your code here

In [None]:
# Your code here


## Интерпретация предсказаний

Рассмотрим результат анализа с помощью SHAP

In [None]:
df[(df["y_test"] == 1) & (df["y_pred"] == 0)].head()

In [None]:
explainer = shap.TreeExplainer(
    pipe.named_steps["classifier"],
    feature_names=url_vectorizer.get_feature_names_out(),
)

test_object = X_test.iloc[10]
print(test_object)
test_object_vect = url_vectorizer.transform([test_object]).toarray()
shap_values = explainer.shap_values(test_object_vect)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], test_object_vect, feature_names=url_vectorizer.get_feature_names_out())

## Упражнение 3.

Проведите с помощью SHAP анализ ошибок модели обоих типов и дайте предложения по улучшению модели.

In [None]:
# Your code here