# Source Code Analysis

## Initialization

In [1]:
import os
import sys

def add_path(path):
    if path not in sys.path:
        sys.path.insert(0, path)
        sys.path.append(path)
add_path('/home/jjian03/anaconda3/lib/python3.7/site-packages')
add_path('/home/jjian03/iconference_followup_study')


### Load Data

In [2]:
import time
import datetime
import pandas as pd

seed = 77

data_file = '../data/untrunc_data_cleaned_url.csv'

raw_data = pd.read_table(data_file, sep=',', index_col=0)
raw_data = raw_data.dropna()

raw_data.info()

print(f'raw_data: {raw_data.shape}')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58871 entries, 0 to 58909
Data columns (total 42 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   protocol_type                           58871 non-null  float64
 1   has_www                                 58871 non-null  float64
 2   has_iframe                              58871 non-null  float64
 3   int                                     58871 non-null  float64
 4   org                                     58871 non-null  float64
 5   gov                                     58871 non-null  float64
 6   in                                      58871 non-null  float64
 7   eu                                      58871 non-null  float64
 8   cn                                      58871 non-null  float64
 9   kr                                      58871 non-null  float64
 10  url_depth                               58871 non-null  fl

### Train Test Split

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


x = raw_data.drop(['label', 'first_appear', 'url'], axis=1)
y = raw_data.label
y = preprocessing.StandardScaler().fit_transform(y.values.reshape(-1, 1))
y = pd.DataFrame(y).iloc[:,0]


X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.33, random_state=seed)

In [4]:
import gc
import multiprocessing

import warnings
warnings.filterwarnings("ignore")


cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt/2
print(f"Allocated {allocated_cpu} CPUs")
gc.collect()

Allocated 8.0 CPUs


20

In [5]:
import numpy as np

from sklearn import preprocessing, clone
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils.validation import _check_fit_params


class VerboseGridSearchCV(GridSearchCV):

    def __init__(self, estimator, param_grid, *, scoring=None,
                 n_jobs=None, iid='deprecated', refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs',
                 error_score=np.nan, return_train_score=False):
        super().__init__(
            estimator=estimator, param_grid=param_grid, scoring=scoring,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)
        self.estimators_ = list()
        self.params_ = list()

    def fit(self, X, y=None, *, groups=None, **fit_params):
        super(VerboseGridSearchCV, self).fit(X, y=y, groups=groups, **fit_params)
        if not self.refit:
            return self
        results = self.cv_results_

        fit_params = _check_fit_params(X, fit_params)
        for idx in range(len(results["params"])):
            params = results["params"][idx]
            estimator = clone(clone(clone(self.estimator)).set_params(**params))
            if y is not None:
                estimator.fit(X, y, **fit_params)
            else:
                estimator.fit(X, **fit_params)

            self.estimators_.append(estimator)
            self.params_.append(params)

        grid.params_ = pd.DataFrame.from_dict(grid.params_)
        return self

#### L-BFGS-B

In [6]:
param_rf = {
    'min_samples_leaf': [232],
    'min_samples_split': [490],
    'n_estimators': [30],
    'max_depth': [11],

}

rf = RandomForestRegressor(
    criterion="mse",
    verbose=False,
    bootstrap=True,
    random_state=seed,
    warm_start=True,
    oob_score = True,
    min_weight_fraction_leaf = 0.,
    max_features='sqrt',

    max_leaf_nodes=None,
    min_impurity_decrease=0.,
    max_samples=None,
    ccp_alpha=0.0,
)

grid = VerboseGridSearchCV(
    n_jobs=allocated_cpu,
    cv=5,
    estimator=rf,
    param_grid=param_rf,
)
grid.fit(X_train, y_train)


ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.

In [None]:
def get_pseudo_r2(y_true, y_hat):
    correlation_matrix = np.corrcoef(
        y_true.astype(float), 
        y_hat.astype(float)
    )
    correlation_xy = correlation_matrix[0,1]
    return correlation_xy**2


y_hat_train = grid.best_estimator_.predict(X_train)
print(f'R Square on Training set: {get_pseudo_r2(y_train, y_hat_train)}')
y_hat_test = grid.best_estimator_.predict(X_test)
print(f'R Square on Testing set:  {get_pseudo_r2(y_test, y_hat_test)}')


In [None]:
feature_names = [*X_train.columns]

# Get the model without any penalty term
coef = grid.best_estimator_.feature_importances_
plot_feature_importance(
    coef, feature_names, 
    "Coefficients of the Random Forest")

In [None]:
import shap


shap.initjs()

explainer = shap.TreeExplainer(grid.best_estimator_)
shap_values = explainer.shap_values(X_test)

# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [None]:
# shap.force_plot(explainer.expected_value, shap_values, X_test)

In [None]:
shap.summary_plot(shap_values, X_test)

In [None]:
shap.summary_plot(shap_values, X_test, plot_type="bar")