# Импорт библиотек и загрузка данных

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/rushawx/stepik-eda-and-dev/main/online_shoppers_intention.csv')

In [3]:
median_informational_duration = df['Informational_Duration'].median()
df['Informational_Duration'].fillna(median_informational_duration, inplace=True)

median_product_related_duration = df['ProductRelated_Duration'].median()
df['ProductRelated_Duration'].fillna(median_product_related_duration, inplace=True)

median_exit_rates = df['ExitRates'].median()
df['ExitRates'].fillna(median_exit_rates, inplace=True)

In [4]:
df.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')

In [5]:
categorical = [
    'Month', 'OperatingSystems', 'Browser', 'Region', 
    'TrafficType', 'VisitorType'
]
numerical = [
    'Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 
    'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 
    'PageValues', 'SpecialDay', 'Weekend'
]

In [6]:
for var in categorical:
    df[var] = df[var].astype('category')

In [7]:
df['Revenue'] = df['Revenue'].astype('int64')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Administrative           12330 non-null  int64   
 1   Administrative_Duration  12330 non-null  float64 
 2   Informational            12330 non-null  int64   
 3   Informational_Duration   12330 non-null  float64 
 4   ProductRelated           12330 non-null  int64   
 5   ProductRelated_Duration  12330 non-null  float64 
 6   BounceRates              12330 non-null  float64 
 7   ExitRates                12330 non-null  float64 
 8   PageValues               12330 non-null  float64 
 9   SpecialDay               12330 non-null  float64 
 10  Month                    12330 non-null  category
 11  OperatingSystems         12330 non-null  category
 12  Browser                  12330 non-null  category
 13  Region                   12330 non-null  category
 14  Traffi

In [9]:
df.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [10]:
df['Revenue'].value_counts(normalize=True)

0    0.845255
1    0.154745
Name: Revenue, dtype: float64

# 1. Построение моделей на числовых признаках с параметрами по умолчанию

In [11]:
y = df['Revenue']
X_num = df[numerical]

In [12]:
X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size=0.25, random_state=42)

In [13]:
mm_scaler = MinMaxScaler()

mm_scaler.fit(X_num_train)

X_num_train = mm_scaler.transform(X_num_train)
X_num_test = mm_scaler.transform(X_num_test)

In [14]:
gnb_num = GaussianNB()

cross_validate(gnb_num, X_num_train, y_train, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=3, n_jobs=-1)

{'fit_time': array([0.00400066, 0.00400972, 0.00399184]),
 'score_time': array([0.00899982, 0.0109911 , 0.0089972 ]),
 'test_accuracy': array([0.81122283, 0.83290071, 0.80726801]),
 'test_precision': array([0.42625169, 0.46956522, 0.41791045]),
 'test_recall': array([0.66596195, 0.68498943, 0.65116279]),
 'test_f1': array([0.51980198, 0.55717971, 0.50909091])}

In [15]:
knn_num = KNeighborsClassifier()

cross_validate(knn_num, X_num_train, y_train, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=3, n_jobs=-1)

{'fit_time': array([0.02399707, 0.01899958, 0.02400255]),
 'score_time': array([0.21899986, 0.22299886, 0.23599863]),
 'test_accuracy': array([0.87966267, 0.8835172 , 0.87735237]),
 'test_precision': array([0.6722973 , 0.69127517, 0.64705882]),
 'test_recall': array([0.42071882, 0.43551797, 0.44186047]),
 'test_f1': array([0.51755527, 0.53437095, 0.52512563])}

In [16]:
print(f"NB: {np.round(cross_validate(gnb_num, X_num_train, y_train, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=3, n_jobs=-1)['test_f1'].mean(), 4)}; KNN: {np.round(cross_validate(knn_num, X_num_train, y_train, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=3, n_jobs=-1)['test_f1'].mean(), 4)}")

NB: 0.5287; KNN: 0.5257


Модель демонстрирует высокую точность. Однако это частично обусловлено дисбалансом классов в целевой переменной. Так как нет понимания, какой класс приоритетнее, то далее в качестве основной метрики для сравненния будет использоваться f1_score.

По выбранной метрике, f1_score, наилучшие результаты демонстрирует модель - GNB.

# 2. Подбор гиперпараметров у каждой из моделей (только на числовых признаках) при помощи GridSearchCV

In [17]:
knn_num = KNeighborsClassifier()

params_knn = {'n_neighbors' : np.arange(2, 20, 2),
          'weights' : ['uniform', 'distance'],
          'p' : [1, 2]}

gs = GridSearchCV(knn_num, params_knn, scoring='f1', cv=3, n_jobs=-1, verbose=2)
gs.fit(X_num_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [18]:
gs.best_score_, gs.best_params_

(0.5295105939240873, {'n_neighbors': 4, 'p': 2, 'weights': 'distance'})

In [19]:
pred = gs.best_estimator_.predict(X_num_test)

np.round(f1_score(y_test, pred), 4)

0.5257

У наивного байесовского классификатора отсутствуют параметры, поэтому подбор гиперпараметров для этой модели не проводился.

Модель KNN не улучшила свое значение метрики благодаря подбору гиперпараметров.

# 3. Добавление категориальных признаков в лучшую модель, обучение модели и заново подбор ее гиперпараметров

По результатам построения моделей на числовых признаках далее используется модель KNN

In [20]:
X_full = df.drop('Revenue', axis=1)

In [21]:
X_full_train, X_full_test, y_train, y_test = train_test_split(X_full, y, test_size=0.25, random_state=42)

In [22]:
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical),
    ('scaling', MinMaxScaler(), numerical)
])

ct.fit(X_full_train)

X_train_transformed = ct.transform(X_full_train)
X_test_transformed = ct.transform(X_full_test)

In [23]:
new_features = list(ct.named_transformers_['ohe'].get_feature_names_out())
new_features.extend(numerical)

In [24]:
X_train_transformed = pd.DataFrame(X_train_transformed.toarray(), columns=new_features)
X_test_transformed = pd.DataFrame(X_test_transformed.toarray(), columns=new_features)

In [25]:
X_train_transformed.head()

Unnamed: 0,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,...,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Weekend
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045311,0.125,0.097344,0.048227,0.013857,0.0,0.025641,0.215062,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.005674,0.000838,0.0,0.25,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002837,0.001891,0.0,0.25,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.083333,0.008237,0.024113,0.002981,0.111111,0.196296,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.015603,0.00229,0.090909,0.090909,0.0,0.0,0.0


In [26]:
knn_full = KNeighborsClassifier()

params_knn = {'n_neighbors' : np.arange(2, 20, 2),
          'weights' : ['uniform', 'distance'],
         'p' : [1, 2]}

gs = GridSearchCV(knn_full, params_knn, scoring='f1', cv=3, n_jobs=-1, verbose=5)
gs.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


        nan 0.26077644        nan 0.21830001        nan 0.23612011
        nan 0.19661907        nan 0.19656702        nan 0.17814027
        nan 0.18843626        nan 0.14914025        nan 0.1648784
        nan 0.12787297        nan 0.14864776        nan 0.10882665
        nan 0.13194542        nan 0.09792783        nan 0.12067528]


In [27]:
gs.best_score_, gs.best_params_

(0.2899011834150553, {'n_neighbors': 2, 'p': 2, 'weights': 'distance'})

In [28]:
pred = gs.best_estimator_.predict(X_test_transformed)

f1_score(y_test, pred)

0.30133928571428575

Включение в модель категориальных признаков негативно сказалось на метрике качества, f1_score.

# 4. Построение Explainer Dashboard

In [29]:
!pip install explainerdashboard -q

In [30]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [31]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [32]:
explainer = ClassifierExplainer(gs.best_estimator_, X_test_transformed.iloc[:50], y_test.iloc[:50])

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')


In [33]:
db = ExplainerDashboard(explainer)

Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


  0%|          | 0/50 [00:00<?, ?it/s]

Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_percentiles...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...


In [34]:
db.run()

Starting ExplainerDashboard on http://10.102.37.150:8050


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8050): Max retries exceeded with url: /_alive_d0824d09-a9ad-4f93-8737-172ecf6565e4 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000018A4F720D00>: Failed to establish a new connection: [WinError 10049] The requested address is not valid in its context'))

Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []Empty DataFrame
Columns: [col, contribution, value]
Index: []

Empty DataFrame
Columns: [col, contribution, value]
Index: []Empty DataFrame
Columns: [col, contribution, value]
Index: []

Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []Empty DataFrame
Columns: [col, contribution, value]
Index: []

Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty DataFrame
Columns: [col, contribution, value]
Index: []
Empty Da

# 5. Анализ модели в Explainer Dashboard

Какие факторы наиболее важны в среднем для получения прогноза?
<li> PageValues
<li> Administrative
<li> Month_Nov

Какие значения метрик получились и что это значит?
<li> accuracy	0.86
<li> precision	0.40
<li> recall	0.33
<li> f1	0.36
<li> roc_auc_score	0.69
<li> pr_auc_score	0.48
<li> log_loss	2.32

Анализ индивидуальных прогнозов с комментарием:
<li> Индекс 1. Не конвертируется в покупку. В основном так как это не выходной день, месяц - март, тип трафика - 2, недостаточное количество посещений страниц продукта, регион - 3;
<li> Индекс 22. Покупка с вероятностью 63%. В основном так как это выходной день, месяц - ноябрь, умеренное количество посещенных страниц Administrative, тип трафика - 2, низкий Exit Rate, регион - 4.