# Source Code Analysis

## Initialization

In [1]:
import sys
sys.path.insert(0, '/home/jjian03/anaconda3/lib/python3.7/site-packages')

### Load Data

In [2]:
import time
import datetime
import sys
import base64
import pandas as pd
from singleton_decorator import singleton

import findspark
findspark.init('/opt/cloudera/parcels/SPARK2/lib/spark2/')
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as fn
from pyspark.sql.types import *

from sklearn.utils import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit


# Module-level random seed. NOTE(review): the DataSource() calls in this notebook pass no
# arguments, so they use DataSource's own default seed (also 77) rather than this name.
seed=77

def shape(df):
    """Return ``(row_count, column_count)`` for a DataFrame-like object.

    Works with anything that supports ``len()`` and exposes a ``columns``
    attribute that itself supports ``len()``.
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    return n_rows, n_cols


@singleton
class DataSource:
    """Singleton that samples the labelled web-resource data through Spark.

    On construction it creates (or reuses) a SparkSession, registers the
    parquet inputs as temporary views, runs the join query, pulls a random
    sample to the driver with ``toPandas()`` and splits it into train and test
    partitions. The resulting pandas objects are exposed through read-only
    properties.
    """

    def __init__(self, fract=.003, training_rate=.7, seed=77):
        """Start Spark and load the sample.

        Args:
            fract: Fraction of the joined rows passed to ``DataFrame.sample``.
            training_rate: Share of the sample kept for training; the test
                share is ``1 - training_rate``.
            seed: Seed used for the Spark shuffle and sample and for the
                train/test split.
        """
        self._spark = (
            SparkSession.builder
            .config('spark.app.name', 'logistic_regression')
            .config('spark.driver.memory', '20g')
            .config('spark.network.timeout', '600s')
            .config('spark.driver.maxResultSize', '60g')
            .config('spark.executor.memory', '60g')
            .config('spark.kryoserializer.buffer.max', '1536m')
            .config('spark.cores.max', '50')
            .getOrCreate()
        )
        self._sc = self._spark.sparkContext
        self._spark_sql = SQLContext(self._sc)
        print(self._spark.version)
        self._label_name = 'label'
        self.re_initialize(fract, training_rate, seed)
        print('Initialized')

    def load_dataset(self, path, name):
        """Read the parquet data at ``path`` and register it as temp view ``name``.

        ``createOrReplaceTempView`` replaces ``registerTempTable``, which has
        been deprecated since Spark 2.0. The loaded Spark DataFrame is returned
        (the previous implementation always returned ``None``).
        """
        dataset = self._spark.read.parquet(path)
        dataset.createOrReplaceTempView(name)
        return dataset

    @property
    def sparkContext(self):
        """The SparkContext of the underlying session."""
        return self._sc

    @property
    def sparkSQL(self):
        """The SQLContext used to run the sampling query."""
        return self._spark_sql

    def re_initialize(self, fract, training_rate, seed):
        """Reload the inputs, draw a new sample and rebuild every cached split.

        Args:
            fract: Fraction of rows to sample (without replacement).
            training_rate: Share of the sample assigned to the training split.
            seed: Seed for the shuffle, the sample and the train/test split.
        """

        self.load_dataset('/user/jjian03/WebResourceQuality.parquet', 'web_resource_quality')
        self.load_dataset('/user/jjian03/WebResourceQuality_pmid.parquet', 'web_resource_quality_pmid')
        self.load_dataset('/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Papers.parquet', 'Paper')
        self.load_dataset('/user/lliang06/icon/MAG_publication_features.parquet', 'mag')

        # The shuffle is seeded as well: an unseeded rand() reorders the rows
        # differently on every run, which defeats the seeded sample() below.
        # NOTE(review): full repeatability still assumes the join yields a
        # stable row order per partition -- confirm on the cluster.
        self._raw_data = self._spark_sql.sql('''
            SELECT wr.id
                , wr.url
                , wr.actual_scrape_url
                , wr.first_appear
                , wr.first_available_timestamp
                , wr.last_available_timestamp
                , wr.header
                , wr.html_text
                , wr.comment
                , wr.from_waybackmachine
                , wr.http_status_code
                , wr.original_check_failure
                , wr.original_check_error_log
                , wr.terminate_reason
                , wr.terminate_reason_error_log
                , wr.label

                , m.paperId
                , m.total_num_of_paper_citing
                , m.total_num_of_author_citing
                , m.total_num_of_affiliation_citing
                , m.total_num_of_journal_citing
                , m.total_num_of_author_self_citation
                , m.total_num_of_affiliation_self_citation
                , m.total_num_of_journal_self_citation
                , m.avg_year
                , m.min_year
                , m.max_year
                , m.median
                , m.num_of_author
                , m.num_of_author_citing
                , m.num_of_affiliation_citing
                , m.num_of_journal_citing
                , m.avg_hindex
                , m.first_author_hindex
                , m.last_author_hindex
                , m.avg_mid_author_hindex
                , m.paper_unique_affiliation

            FROM web_resource_quality wr
            JOIN web_resource_quality_pmid wr_doi ON wr.id = wr_doi.id
            JOIN Paper p ON wr_doi.doi = p.doi
            JOIN mag m ON p.paperId = m.paperId
            WHERE wr.label IS NOT NULL
            AND wr.label IN ("0", "1")
            AND wr.first_appear IS NOT NULL
        ''') \
        .orderBy(fn.rand(seed)) \
        .sample(False, fract, seed) \
        .toPandas()

        print(f'Sample Size - raw_data: {len(self._raw_data)}')

        self._train_data, self._test_data = train_test_split(self._raw_data, test_size=1-training_rate, random_state=seed)

        # Feature/label views of the full sample and of each split.
        self._X_raw = self._raw_data.drop(self._label_name, axis=1)
        self._y_raw = self._raw_data[self._label_name]

        self._X_train = self._train_data.drop(self._label_name, axis=1)
        self._y_train = self._train_data[self._label_name]

        self._X_test = self._test_data.drop(self._label_name, axis=1)
        self._y_test = self._test_data[self._label_name]

    @property
    def raw_data(self):
        """The full sampled pandas DataFrame, label column included."""
        return self._raw_data

    @property
    def X_raw(self):
        """All sampled rows without the label column."""
        return self._X_raw

    @property
    def y_raw(self):
        """Label column of all sampled rows."""
        return self._y_raw

    @property
    def X_train(self):
        """Training rows without the label column."""
        return self._X_train

    @property
    def y_train(self):
        """Label column of the training rows."""
        return self._y_train

    @property
    def X_test(self):
        """Test rows without the label column."""
        return self._X_test

    @property
    def y_test(self):
        """Label column of the test rows."""
        return self._y_test

In [3]:
# Time the first DataSource() call: constructing the singleton runs the Spark query and sampling.
start_time = time.time()
raw_data = DataSource().raw_data

raw_data.info()

print(f'raw_data: {shape(raw_data)}')

# str(timedelta) is split on ':'; only the minutes (t[1]) and seconds (t[2]) fields are printed.
t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))

2.4.0.cloudera2


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1152, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/hzhuang/anaconda3/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred w

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41797)

### Train Test Split

In [None]:
# Fetch the cached splits from the DataSource singleton and report their sizes.
start_time = time.time()
X_raw = DataSource().X_raw
y_raw = DataSource().y_raw

X_train = DataSource().X_train
y_train = DataSource().y_train

X_test = DataSource().X_test
y_test = DataSource().y_test

print('Shape of the dataframe:')
print(f'X_raw: {shape(X_raw)}')
print(f'y_raw: {len(y_raw)}')
print()
print(f'X_train: {shape(X_train)}')
print(f'y_train: {len(y_train)}')
print()
print(f'X_test: {shape(X_test)}')
print(f'y_test: {len(y_test)}')

# str(timedelta) is split on ':'; only the minutes (t[1]) and seconds (t[2]) fields are printed.
t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))

## Feature Engineering - First Round

In [None]:
# Work on a copy: `result = X_train` would alias the frame cached inside DataSource,
# so adding the label (and every later engineered column) would also modify X_train
# and leak the target into the training features.
result = X_train.copy()
result.loc[:,'label'] = y_train

### Features in URL

#### Length of the URL

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
pd.options.mode.chained_assignment = None


class URLLengthCounter(BaseEstimator, TransformerMixin):
    """Adds a ``url_length`` column holding the character count of ``url``."""

    def __init__(self):
        """Stateless transformer; nothing to configure."""

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``url_length`` into ``x`` (modified in place) and return it."""
        frame = x
        lengths = frame['url'].apply(self._get_length)
        frame.loc[:, 'url_length'] = lengths
        return frame

    def _get_length(self, url):
        """Number of characters in ``url``."""
        size = len(url)
        return size


# Single-step pipeline that adds the `url_length` column to the working frame.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
])

result = pipe.transform(result)

# Preview the new feature next to its source column.
display(result[['url', 'url_length']].head(5))

#### Parse URL

In [None]:
from urllib.parse import urlparse


class URLParser(BaseEstimator, TransformerMixin):
    """Splits ``url`` into ``scheme``, ``netloc``, ``path`` and ``params`` columns."""

    def __init__(self):
        # Column name -> callable registry filled by register_new_column();
        # transform() does not consult it.
        self._lambdas = dict()

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Parse every URL with ``urlparse`` and store its components.

        ``params`` holds the query string, or ``None`` when the query is blank.
        The component columns are written into ``x`` itself; the returned frame
        is ``x`` without the temporary ``url_parse_obj`` column.
        """
        frame = x
        frame.loc[:, 'url_parse_obj'] = frame['url'].apply(urlparse)

        for component in ('scheme', 'netloc', 'path'):
            frame.loc[:, component] = frame.url_parse_obj.apply(
                lambda parts, attr=component: getattr(parts, attr))

        queries = frame.url_parse_obj.apply(lambda parts: parts.query)
        frame.loc[:, 'params'] = queries.apply(lambda query: query if '' != query.strip() else None)

        return frame.drop(['url_parse_obj'], axis=1)

    def register_new_column(self, col_name, lbd):
        """Store ``lbd`` under ``col_name`` in the registry; returns ``self`` for chaining."""
        self._lambdas[col_name] = lbd
        return self

# Single-step pipeline that adds the parsed URL components to the working frame.
pipe = Pipeline([
    ('url_parser', URLParser()),
])

result = pipe.transform(result)

# Preview rows that carry a query string. 'url' replaces the first of two identical
# 'path' entries in the original column list, so the source URL is shown next to its parts.
display(result[['url', 'scheme', 'netloc', 'path', 'params']][result.params.notna()].head(5))

#### Depth of the url hierarchy

In [None]:
class URLDepthCounter(BaseEstimator, TransformerMixin):
    """Adds a ``url_depth`` column: the number of ``/`` separators in ``path``."""

    def __init__(self):
        pass

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``url_depth`` into ``x`` (modified in place) and return it."""
        result = x
        result.loc[:,'url_depth'] = result['path'].apply(self._get_depth)
        return result

    def _get_depth(self, path):
        """Count the slashes in ``path``, ignoring a single trailing slash.

        ``'/a/b'`` and ``'/a/b/'`` both give 2 and ``'/'`` gives 0. A path
        without any slash (for example the empty path of ``http://host``)
        gives 0; the previous ``rindex('/')`` raised ``ValueError`` there.
        """
        if not path:
            return 0
        trimmed = path[:-1] if path.endswith('/') else path
        return trimmed.count('/')

# Single-step pipeline that adds the `url_depth` column to the working frame.
pipe = Pipeline([
    ('url_depth_counter', URLDepthCounter()),
])

result = pipe.transform(result)

# Preview the new feature next to its source column.
display(result[['path', 'url_depth']].head(5))

#### Has WWW subdomain 

In [None]:
class HasWWWConverter(BaseEstimator, TransformerMixin):
    """Adds a 0/1 ``has_www`` column telling whether ``netloc`` starts with ``www.``."""

    def __init__(self):
        """Stateless transformer; nothing to configure."""

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``has_www`` into ``x`` (modified in place) and return it."""
        frame = x
        flags = frame['netloc'].apply(self._has_www)
        frame.loc[:, 'has_www'] = flags
        return frame

    def _has_www(self, domain):
        """Return 1 when ``domain`` begins with ``'www.'``, otherwise 0."""
        if domain.startswith('www.'):
            return 1
        return 0


# Single-step pipeline that adds the `has_www` flag to the working frame.
pipe = Pipeline([
    ('has_www_converter', HasWWWConverter()),
])

result = pipe.transform(result)

# Preview the new feature next to its source column.
display(result[['netloc', 'has_www']].head(5))

#### Level of the Subdomain

In [None]:
class SubdomainLevelCounter(BaseEstimator, TransformerMixin):
    """Adds a ``subdomain_level`` column: the number of dots in ``netloc``."""

    def __init__(self):
        """Stateless transformer; nothing to configure."""

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``subdomain_level`` into ``x`` (modified in place) and return it."""
        frame = x
        levels = frame['netloc'].apply(self._get_level)
        frame.loc[:, 'subdomain_level'] = levels
        return frame

    def _get_level(self, domain):
        """Count the ``.`` characters in ``domain``."""
        dots = domain.count('.')
        return dots


# Single-step pipeline that adds the `subdomain_level` column to the working frame.
pipe = Pipeline([
    ('subdomain_level_counter', SubdomainLevelCounter()),
])

result = pipe.transform(result)

# Preview the new feature next to its source column.
display(result[['netloc', 'subdomain_level']].head(5))

#### Number of HTTP-Get parameters

In [None]:
import numpy as np


class RequestParameterCounter(BaseEstimator, TransformerMixin):
    """Adds a ``param_cnt`` column: the number of ``&``-separated query parameters."""

    def __init__(self):
        pass

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Normalise missing ``params`` to ``''`` and write ``param_cnt``.

        ``x`` is modified in place and returned.
        """
        result = x
        # fillna() states the intent directly (missing query -> empty string).
        result['params'] = result['params'].fillna('')
        result.loc[:, 'param_cnt'] = result['params'].apply(self._count_param)
        return result

    def _count_param(self, params):
        """Return 0 for an empty query, otherwise the number of ``&`` separators plus one."""
        # Test for emptiness by value; the previous `params is ''` compared object
        # identity with a literal, which only works when the string is interned.
        if not params:
            return 0
        return params.count('&') + 1

# Single-step pipeline that adds the `param_cnt` column to the working frame.
pipe = Pipeline([
    ('request_parameter_counter', RequestParameterCounter()),
])

result = pipe.transform(result)

# Preview the new feature next to its source column.
display(result[['params', 'param_cnt']].head(5))

#### Domain Suffix

In [None]:
!/home/jjian03/anaconda3/bin/pip install feature_engine

In [None]:
from feature_engine import categorical_encoders


class DomainSuffixBuilder(BaseEstimator, TransformerMixin):
    """Derives domain-suffix features from ``netloc``.

    Produces three columns:

    * ``suffix`` -- the text after the last dot of ``netloc`` (port included),
      replaced in ``transform`` by its relative frequency learned in ``fit``;
    * ``suffix_idx`` -- the same text with anything from the first ``:`` removed;
    * ``is_port_access`` -- 1 when the raw suffix has a non-blank part after a ``:``.
    """

    def __init__(self):
        # Raw suffix -> relative frequency, populated by fit().
        self._suffix_dict = None

    def fit(self,x,y=None):
        """Learn the relative frequency of every raw suffix found in ``x``."""
        features = self.build_suffix_port_feature(x).dropna()

        frequency_encoder = categorical_encoders.CountFrequencyCategoricalEncoder(
            encoding_method='frequency',
            variables=['suffix'])
        frequency_encoder.fit(features)
        self._suffix_dict = frequency_encoder.encoder_dict_['suffix']
        return self

    def transform(self,x,y=None):
        """Attach the suffix features, frequency-encode ``suffix`` and drop incomplete rows.

        The new columns are written into ``x`` itself; the returned frame
        excludes rows where any of the three features is missing.
        """
        frame = x
        features = self.build_suffix_port_feature(x)
        for feature_name in features.columns:
            frame.loc[:, feature_name] = features[feature_name]
        # Suffixes not seen during fit() are encoded as 0.
        frame.loc[:, 'suffix'] = frame.suffix.apply(lambda raw: self._suffix_dict.get(raw, 0))

        return frame.dropna(subset=['is_port_access', 'suffix', 'suffix_idx'])

    def build_suffix_port_feature(self,x):
        """Return a new frame with ``suffix``, ``suffix_idx`` and ``is_port_access``."""
        raw_suffix = x.netloc.apply(DomainSuffixBuilder._get_url_suffix)
        port_flag = raw_suffix.apply(DomainSuffixBuilder._is_port_access)
        clean_suffix = raw_suffix.apply(DomainSuffixBuilder._clean_url_suffix)

        return pd.DataFrame({'suffix': raw_suffix, 'suffix_idx': clean_suffix, 'is_port_access': port_flag, })

    @property
    def suffix_dict(self):
        """Mapping of raw suffix to relative frequency (``None`` before ``fit``)."""
        return self._suffix_dict

    @staticmethod
    def _get_url_suffix(url):
        """Text after the last ``.`` of ``url``, or ``None`` when there is no dot."""
        if '.' not in url:
            return None
        return url.rpartition('.')[2]

    @staticmethod
    def _clean_url_suffix(url):
        """Strip everything from the first ``:`` onward; ``None`` stays ``None``."""
        if url is None:
            return None
        return url.partition(':')[0]

    @staticmethod
    def _is_port_access(suffix):
        """1 when ``suffix`` has more than one non-blank ``:``-separated part, else 0."""
        if suffix is None:
            return None
        non_blank_parts = sum(1 for token in suffix.split(':') if token.strip() != '')
        return int(non_blank_parts > 1)


# Single-step pipeline; fit_transform is required here because DomainSuffixBuilder
# learns the suffix frequency table in fit().
pipe = Pipeline([
    ('domain_suffix_builder', DomainSuffixBuilder()),
])

result = pipe.fit_transform(result)


display(result[['netloc', 'is_port_access', 'suffix', 'suffix_idx']].head(5))
# Show the first five entries of the learned suffix -> frequency table, sorted descending.
display(pd.Series(list(pipe.steps[-1][1].suffix_dict.values()), index = pipe.steps[-1][1].suffix_dict.keys()) \
        .head().sort_values(ascending=False))


In [None]:
# Sanity check: number of rows still missing a cleaned suffix after the builder's dropna.
result.suffix_idx.isna().sum()

#### Remove the Incorrect Domains

- A valid top-level domain (TLD) is between 2 and 63 letters long

Ref: https://en.wikipedia.org/wiki/Domain_Name_System#cite_ref-rfc1034_1-2

In [None]:
import re


class IncorrectDomainUrlCleaner(BaseEstimator, TransformerMixin):
    """Keeps only rows whose ``suffix_idx`` looks like a valid top-level domain."""

    def __init__(self):
        # TLD ranges from 2 to 63 ASCII letters. re.I is deliberately not used:
        # the class already lists both cases, and with IGNORECASE a Unicode
        # pattern's [a-z]/[A-Z] also matches a few non-ASCII letters.
        self._regex = re.compile(r'^[a-zA-Z]{2,63}$')

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Return the rows of ``x`` whose ``suffix_idx`` passes the TLD check.

        The mask is kept in a local Series, so no temporary ``is_correct``
        column is written into the caller's frame.
        """
        keep = x.suffix_idx.apply(self._is_correct).astype(bool)
        return x[keep]

    def _is_correct(self, domain_suffix):
        """True when ``domain_suffix`` is a string of 2 to 63 ASCII letters."""
        # Non-string values (None/NaN) are rejected instead of raising TypeError.
        if not isinstance(domain_suffix, str):
            return False
        # fullmatch also rejects a trailing newline, which '$' alone would accept.
        return self._regex.fullmatch(domain_suffix) is not None


# Single-step pipeline that filters out rows with an invalid top-level domain.
pipe = Pipeline([
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
])

result = pipe.transform(result)

# NOTE(review): "before" is the size of X_train, so the difference also includes rows
# already dropped by DomainSuffixBuilder's dropna -- confirm this is the intended baseline.
print(f'Before changes: {len(X_train)}')
print(f'After changes: {len(result)}')


#### Protocol Type Conversion

In [None]:
from feature_engine import categorical_encoders


class ColumnRenamer(BaseEstimator, TransformerMixin):
    """Renames columns according to an ``{old_name: new_name}`` mapping."""

    def __init__(self, mapping):
        self._mapping = mapping

    @property
    def mapping(self):
        """The rename mapping supplied at construction."""
        return self._mapping

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Return ``x`` with every mapped column that is present renamed.

        The applicable subset is computed locally. The previous version
        overwrote ``self._mapping`` with it, so one call on a frame lacking a
        column permanently removed that entry for all later calls.
        """
        applicable = {old: new for old, new in self._mapping.items() if old in x.columns}
        return x.rename(columns=applicable)

# Single-step pipeline that renames `scheme` to `protocol_type`.
pipe = Pipeline([
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])

result = pipe.transform(result)

# Preview the renamed column next to the source URL.
display(result[['url', 'protocol_type']].head(5))

#### Content-type

In [None]:
import json


class Formatter:
    """Small formatting helpers (timestamps and case-normalised dictionaries)."""

    @staticmethod
    def get_timestamp(format="%Y%m%d_%H%M%S"):
        """Return the current local time rendered with ``format``.

        The module is imported locally and addressed as ``datetime.datetime``:
        this notebook does ``import datetime``, so the previous
        ``datetime.now()`` raised ``AttributeError`` (a module has no ``now``).
        """
        import datetime as _datetime
        return _datetime.datetime.now().strftime(format)

    @staticmethod
    def to_lower_case_dict(pair):
        """Return a copy of ``pair`` with string keys and string values lower-cased.

        ``None`` yields an empty dict. Non-string keys or values are kept as
        they are instead of raising ``AttributeError`` on ``.lower()``.
        """
        if pair is None:
            return dict()
        return {
            (k.lower() if isinstance(k, str) else k): (v.lower() if isinstance(v, str) else v)
            for k, v in pair.items()
        }


class ContentTypeExtractor(BaseEstimator, TransformerMixin):
    """Adds a ``content_type`` column taken from the stored response headers."""

    def __init__(self):
        """Stateless transformer; nothing to configure."""

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``content_type`` into ``x`` (modified in place) and return it.

        Each ``header`` value is evaluated into a Python object, its keys and
        values are lower-cased, and the ``content-type`` entry is extracted
        (``None`` when absent).
        """
        frame = x
        # SECURITY NOTE(review): eval() executes whatever text is stored in `header`.
        # If that text comes from scraped HTTP responses it is untrusted input;
        # consider ast.literal_eval once the stored format is confirmed.
        headers = frame.header.apply(eval)
        normalised = headers.apply(Formatter.to_lower_case_dict)
        frame.loc[:,'content_type'] = normalised.apply(
            lambda fields: fields['content-type'] if 'content-type' in fields else None)

        return frame

    
# Single-step pipeline that adds the `content_type` column to the working frame.
pipe = Pipeline([
    ('content_type_extractor', ContentTypeExtractor()),
])

result = pipe.transform(result)

# Preview the extracted content types.
display(result[['content_type']].head(5))

In [None]:
class BinaryNAEncoder(BaseEstimator, TransformerMixin):
    """Adds a 0/1 ``has_<column>`` indicator for each configured column."""

    def __init__(self, columns):
        self._columns = columns

    @property
    def columns(self):
        """The column names supplied at construction."""
        return self._columns

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``has_<column>`` (1 = value present, 0 = missing) into ``x`` and return it.

        Configured columns that are absent from ``x`` are skipped; the
        configured list itself is left untouched (it used to be overwritten
        with the filtered list on every call).
        """
        result = x
        for col_name in self._columns:
            if col_name not in result.columns:
                continue
            # notna() recognises every missing marker. The previous
            # `x not in [np.nan, None]` test relied on identity/equality and so
            # treated NaN floats other than the np.nan object as present.
            result.loc[:,f'has_{col_name}'] = result[col_name].notna().astype(int)

        return result

# Single-step pipeline that adds the `has_content_type` indicator.
pipe = Pipeline([
    ('binary_na_encoder', BinaryNAEncoder(['content_type'])),
])

result = pipe.transform(result)

# Preview the new indicator column.
display(result[['has_content_type']].head(5))

## EDA

In [None]:
def print_unique_count(df):
    """Return a Series named ``'unique count'`` with the distinct-value count per column.

    The index holds the column names followed by ``'total'``, whose value is
    the number of rows in ``df``.
    """
    counts = {col_name: len(df[col_name].unique()) for col_name in df.columns}
    counts['total'] = len(df)
    return pd.Series(counts, name='unique count')

def print_na_count(df):
    """Return a Series named ``'na count'`` with the number of missing values per column.

    The index holds the column names followed by ``'total'``, whose value is
    the number of rows in ``df``.
    """
    missing = {col_name: df[col_name].isna().sum() for col_name in df.columns}
    missing['total'] = len(df)
    return pd.Series(missing, name='na count')

# Per-column summary table: distinct-value count and missing-value count side by side.
display(pd.DataFrame({
    'unique count': print_unique_count(result),
    'na count': print_na_count(result)
}, index=result.columns))
result.info()

Inspect empty html records

In [None]:
# Rows without any HTML text, and how many of them declare a PDF content type.
no_html_records = result[result.html_text.isna()]
# .sum() skips rows whose content_type is missing (startswith yields NaN there).
pdf_records = no_html_records.content_type.str.startswith('application/pdf').sum()
print(f'total empty html records: {len(no_html_records)}, pdf out of those records: {pdf_records}')

In [None]:
# Records with no HTML body that are not PDFs. `na=False` makes a missing content type
# count as "not a PDF"; previously `not NaN` evaluated to False, so rows without a
# content type were silently left out of the questionable set.
questionable_records = result[result.html_text.isna() & ~result.content_type.str.startswith('application/pdf', na=False)]
questionable_records.shape

In [None]:
# How many of the questionable records declare a JSON content type.
questionable_records.content_type.str.startswith('application/json').sum()

In [None]:
# Installs plotly through a user-specific pip binary. NOTE(review): the version is not pinned.
!/home/jjian03/anaconda3/bin/pip install plotly

In [None]:
import plotly

import plotly.graph_objects as go
from plotly.subplots import make_subplots


# pipe = Pipeline([
#     ('url_length_counter', URLLengthCounter()),
#     ('url_depth_counter', URLDepthCounter()),
#     ('has_www_converter', HasWWWConverter()),
#     ('subdomain_level_counter', SubdomainLevelCounter()),
#     ('request_parameter_counter', RequestParameterCounter()),
#     ('domain_suffix_builder', DomainSuffixBuilder()),
#     ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
#     ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ])

# result = pipe.transform(X_train)

# Subset of the engineered features selected for the non-binary histograms.
non_binary_result = result[['protocol_type', 'url_length', 'url_depth', 'subdomain_level', 'param_cnt', 'suffix_idx']]

def plot_distribution(data, title, height=1200, width=800):
    """Build a plotly figure with one histogram per column of ``data``.

    The histograms are stacked vertically (one subplot row per column, titled
    with the column name). ``title``, ``height`` and ``width`` are applied to
    the figure layout. Returns the figure.
    """
    columns = data.columns
    fig = make_subplots(rows=len(columns), cols=1, subplot_titles=columns)

    for row_number, col_name in enumerate(columns, start=1):
        histogram = go.Histogram(x=data[col_name], name=col_name)
        fig.add_trace(histogram, row=row_number, col=1)

    fig.update_layout(height=height, width=width, title_text=title)
    return fig

# The returned figure is the cell's last expression, so the notebook renders it.
plot_distribution(non_binary_result, "Non Binary Features Distribution")

In [None]:
# Label and 0/1 indicator columns selected for the binary histograms.
binary_result = result[['label', 'has_www', 'is_port_access', 'has_content_type']]


plot_distribution(binary_result, "Binary Features Distribution")

Most of the non-binary features are right-skewed, so a standard scaler needs to be applied at a later stage.

## Modeling

### Data Cleaning

#### Age of the URL

In [None]:
import math
import time
import datetime
from dateutil import relativedelta
from bson.objectid import ObjectId


class ChronologyBuilder(BaseEstimator, TransformerMixin):
    """Adds ``timestamp_coef``: the year in which the resource was last available.

    The year is parsed from ``last_available_timestamp`` (``YYYYmmddHHMMSS``).
    Rows without a usable timestamp fall back to the generation year encoded
    in the row's ObjectId (``id`` column).
    """

    def __init__(self):
        # Fixed reference time of the scrape; not read by transform().
        self._scraped_dt = datetime.datetime.strptime('20200525132015', "%Y%m%d%H%M%S")

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Write ``timestamp_coef`` (int year) into ``x`` (modified in place) and return it."""
        result = x
        result.loc[:, 'timestamp_coef'] = result.last_available_timestamp \
            .apply(self._convert_timestamp_to_coef) \
            .fillna(self._extract_year(result.id.apply(ObjectId))) \
            .astype(int)
        return result

    def _extract_year(self, ids):
        """Map a Series of ObjectIds to their generation year."""
        return ids.apply(lambda x: x.generation_time.year)

    def _convert_timestamp_to_coef(self, ts):
        """Return the year of a ``YYYYmmddHHMMSS`` timestamp, or NaN when it is missing.

        ``ts`` may be a number or a string. ``pd.isna`` replaces ``math.isnan``,
        which raised ``TypeError`` for string input. Blank strings now also map
        to NaN so the ObjectId fallback fills them; previously the blank string
        was returned and broke the later ``astype(int)``.
        """
        if pd.isna(ts):
            return np.nan
        ts_str = str(ts).strip()
        if '' == ts_str:
            return np.nan

        # Going through float handles values stored as e.g. '20200101120000.0'.
        ts_str = str(int(float(ts_str)))
        ts_obj = datetime.datetime.strptime(ts_str, "%Y%m%d%H%M%S")
        return ts_obj.year

# Single-step pipeline that adds the `timestamp_coef` (year) column.
pipe = Pipeline([
    ('chronology_builder', ChronologyBuilder()),
])

result = pipe.transform(result)

# Preview the derived years.
result.timestamp_coef.head()

#### Remove redundant features

In [None]:
class FeatureRemover(BaseEstimator, TransformerMixin):
    """Drops the configured columns that are present in the frame."""

    def __init__(self, features):
        self._removed_features = None
        self._features = features

    @property
    def removed_features(self):
        """Columns dropped by the most recent ``transform`` (``None`` before the first call)."""
        return self._removed_features

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Return ``x`` without the configured columns; absent names are ignored."""
        available = x.columns
        present = [name for name in self._features if name in available]
        self._removed_features = present
        return x.drop(columns=present)


class FeaturePicker(BaseEstimator, TransformerMixin):
    """Keeps only the configured columns that are present in the frame."""

    def __init__(self, features):
        self._picked_features = None
        self._features = features

    @property
    def picked_features(self):
        """Columns kept by the most recent ``transform`` (``None`` before the first call)."""
        return self._picked_features

    def fit(self,x,y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self,x,y=None):
        """Return the configured columns of ``x``, in configured order; absent names are ignored."""
        available = x.columns
        present = [name for name in self._features if name in available]
        self._picked_features = present
        return x[present]


# Keep only the model inputs plus the label; every other column is discarded from here on.
pipe = Pipeline([
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'label',
                                       ])),
])

result = pipe.transform(result)

# Show which of the requested columns survived.
result.columns

#### Miscellaneous Clean Up

- Standardize variance
- Convert Categorical Feature into Frequency Based Numerical Index
- Remove low variance features

In [None]:
from itertools import compress
from sklearn import feature_selection


class LowVarianceRemover(BaseEstimator, TransformerMixin):
    """Drops feature columns whose variance falls below a threshold.

    Columns with exactly two distinct values are compared against
    ``threshold * (1 - threshold)``; all other columns are compared against
    ``threshold`` itself. The ``label`` column is never examined.
    """

    def __init__(self, threshold):
        self._p = threshold
        # Backing field for the `threshold` property, which previously read an
        # attribute that was never assigned and raised AttributeError.
        self._threshold = threshold
        self._bi_vt = feature_selection.VarianceThreshold(threshold=threshold*(1-threshold))
        self._regular_vt = feature_selection.VarianceThreshold(threshold=threshold)
        self._dropped_columns = list()

    @property
    def threshold(self):
        """The variance threshold supplied at construction."""
        return self._threshold

    @property
    def dropped_columns(self):
        """Columns removed by the most recent ``transform`` call."""
        return self._dropped_columns

    def fit(self,x,y=None):
        """No-op fit; variances are measured on the frame passed to ``transform``."""
        return self

    def transform(self,x,y=None):
        """Return ``x`` without its low-variance feature columns."""
        result = x

        # Distinct-value count per feature column. Building the Series directly
        # also works with zero or one feature column, where the former
        # DataFrame + squeeze() construction failed or produced a scalar.
        unique_counts = pd.Series(
            {col_name: len(result[col_name].unique())
             for col_name in result.columns if 'label' != col_name},
            dtype=int)

        bi_columns = unique_counts[unique_counts == 2].index.tolist()
        regular_columns = unique_counts[unique_counts != 2].index.tolist()

        # Start from an empty list on every call; the old code appended to the
        # instance list, so repeated calls accumulated duplicate names.
        dropped = list()
        if len(bi_columns) > 0:
            self._bi_vt.fit(result[bi_columns])
            bi_mask = self._bi_vt.variances_ < self._p * (1 - self._p)
            dropped = dropped + list(compress(bi_columns, bi_mask))
        if len(regular_columns) > 0:
            self._regular_vt.fit(result[regular_columns])
            regular_mask = self._regular_vt.variances_ < self._p
            dropped = dropped + list(compress(regular_columns, regular_mask))
        self._dropped_columns = dropped

        if len(self._dropped_columns) > 0:
            remover = FeatureRemover(self._dropped_columns)
            result = remover.transform(result)
        return result


# Two steps: frequency-encode `protocol_type`, then drop low-variance features (threshold 0.01).
pipe = Pipeline([
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    ('low_variance_remover', LowVarianceRemover(0.01))

])


# fit_transform is needed because the frequency encoder learns its mapping in fit().
result = pipe.fit_transform(result)


# NOTE(review): "before" prints X_train.columns; feature engineering started from
# `result = X_train` and the earlier transformers assign columns in place, so this may
# not be the original column set -- confirm.
print(f'Before transform: {X_train.columns}\n')
print(f'After transform: {result.columns}\n')
print(f'Dropped columns: {pipe.steps[-1][1].dropped_columns}')

The port indicator is dropped by the low-variance filter, but I believe port access could help explain the availability of the URL resource, so I will build a separate subset to analyze it later.

#### Logistic Regression

In [None]:
import gc
import multiprocessing

import warnings
warnings.filterwarnings("ignore")


# Use every CPU reported by the OS; `allocated_cpu` is read later as the thread-pool size.
cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt
print(f"Allocated {allocated_cpu} CPUs")
# Free unreferenced objects before the model runs.
gc.collect()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB


class AnalysisEngineBuilder:
    """Fluent builder for an analysis engine wrapping ``GridSearchCV``.

    Every ``set_*`` method stores its argument and returns the builder, so
    calls can be chained; ``build()`` creates the engine from the stored values.
    """

    def __init__(self):
        # Was misspelled `__init_`, so it never ran and any attribute whose
        # setter had not been called raised AttributeError inside build().
        self._X_train = None
        self._y_train = None
        self._X_test = None
        self._y_test = None
        self._param_grid = None
        self._engine = None

    def set_X_train(self, X_train):
        """Store the training features; returns the builder."""
        self._X_train = X_train
        return self

    def set_y_train(self, y_train):
        """Store the training labels; returns the builder."""
        self._y_train = y_train
        return self

    def set_X_test(self, X_test):
        """Store the test features; returns the builder."""
        self._X_test = X_test
        return self

    def set_y_test(self, y_test):
        """Store the test labels; returns the builder."""
        self._y_test = y_test
        return self

    def set_param_grid(self, param_grid):
        """Store the parameter grid handed to ``GridSearchCV``; returns the builder."""
        self._param_grid = param_grid
        return self

    def set_engine(self, engine):
        """Store the estimator to tune; returns the builder."""
        self._engine = engine
        return self

    def build(self):
        """Create the analysis engine from the values collected so far."""
        return AnalysisEngineBuilder._AnalysisEngine(self._X_train, self._y_train, self._X_test, self._y_test, self._param_grid, self._engine)

    class _AnalysisEngine:
        """Runs a 10-fold, ROC-AUC-scored grid search and evaluates it on the test split."""

        def __init__(self, X_train, y_train, X_test, y_test, param_grid, engine):
            self._X_train = X_train
            self._y_train = y_train
            self._X_test = X_test
            self._y_test = y_test
            self._param_grid = param_grid
            self._engine = engine
            self._grid = GridSearchCV(self._engine, self._param_grid, cv=10, scoring='roc_auc')
            self._pred = None
            self._pred_prob = None
            self._accuracy = None
            self._roc = None
            self._tpr = None
            self._fpr = None
            # Initialised here so the `threshold` property is readable before analyze().
            self._threshold = None

        @property
        def grid_search_result(self):
            """``cv_results_`` of the grid search as a DataFrame."""
            return pd.DataFrame(self._grid.cv_results_)

        @property
        def accuracy(self):
            """Test-set accuracy (``None`` before ``analyze``)."""
            return self._accuracy

        @property
        def roc(self):
            """Test-set ROC AUC (``None`` before ``analyze``)."""
            return self._roc

        @property
        def tpr(self):
            """True-positive rates of the ROC curve (``None`` before ``analyze``)."""
            return self._tpr

        @property
        def fpr(self):
            """False-positive rates of the ROC curve (``None`` before ``analyze``)."""
            return self._fpr

        @property
        def threshold(self):
            """Decision thresholds of the ROC curve (``None`` before ``analyze``)."""
            return self._threshold

        def analyze(self):
            """Fit the grid search, score the test split and return the fitted search.

            The ROC curve and AUC use the positive-class probability (column 1
            of ``predict_proba``) when it is available, otherwise the hard
            predictions.
            """
            self._grid.fit(self._X_train, self._y_train)
            self._pred = self._grid.predict(self._X_test)
            try:
                proba = self._grid.predict_proba(self._X_test)
                self._pred_prob = pd.DataFrame(proba)[1]
            except AttributeError:
                # No predict_proba available: fall back to the hard labels.
                self._pred_prob = self._pred
            self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, self._pred_prob)
            self._accuracy = accuracy_score(self._y_test, self._pred)
            self._roc = roc_auc_score(self._y_test, self._pred_prob)

            return self._grid

        def show_performance(self):
            """Print the ROC AUC as a percentage followed by the classification report."""
            print(f"ROC/AUC: {round(self._roc*100, 2)}%")
            print()
            print(classification_report(self._y_test, self._pred, target_names=["Valid Url","Invalid"]))

In [None]:
import matplotlib
# Every method below draws through `plt`, which no visible cell imported.
import matplotlib.pyplot as plt


class Visualizer():
    """Static helpers that draw the evaluation charts with matplotlib's pyplot interface."""

    @staticmethod
    def group_plot_roc_curve(title, data_group):
        """Plot several ROC curves plus the random-guess diagonal.

        Args:
            title: Figure title.
            data_group: Iterable of ``(fpr, tpr, label)`` sequences. Entries at
                odd positions are drawn dashed, the others solid.

        Returns:
            The ``pyplot`` module, after ``plt.show()`` has been called.
        """
        plt.clf()
        plt.figure(figsize=(5, 5), dpi=80)

        x = [0.0, 1.0]
        plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='Naive prediction (Random guess)')
        for idx, group in enumerate(data_group):
            fpr = group[0]
            tpr = group[1]
            label = group[2]
            linestyle= 'solid'
            if idx % 2 == 1:
                linestyle= 'dashed'
            plt.plot(fpr, tpr, linestyle=linestyle, linewidth=2, label=label)

        plt.xlim(0.0, 1.0)
        plt.ylim(0.0, 1.0)
        plt.xlabel("FPR", fontsize=14)
        plt.ylabel("TPR", fontsize=14)

        plt.legend(fontsize=10, loc='lower right')

        plt.title(title, fontsize=14)
        plt.tight_layout()

        plt.show()
        return plt

    @staticmethod
    def plot_performance(data,
                            legend_type_name,
                            x_axis_name,
                            upper_y_label,
                            lower_y_label,
                            title):
        """Draw two stacked line charts, one line per distinct ``legend_type_name`` value.

        Args:
            data: DataFrame holding all the columns named below.
            legend_type_name: Column whose distinct values define the lines.
            x_axis_name: Column used for the x axis of both charts.
            upper_y_label: Column plotted in the upper chart.
            lower_y_label: Column plotted in the lower chart.
            title: Suffix of the figure title.

        Returns:
            The ``pyplot`` module, after ``plt.show()`` has been called.
        """
        plt.clf()
        f, ax = plt.subplots(2, 1, figsize=(15,8))
        legends = data[legend_type_name].unique()
        for idx, legend in enumerate(legends):
            _data = data[data[legend_type_name]==legend]
            ax[0].plot(_data[x_axis_name], _data[upper_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[0].set_xlabel(x_axis_name, fontsize=15)
            ax[0].set_ylabel(upper_y_label.upper(), fontsize=15)
            ax[0].legend(fontsize=10, loc='upper right')

            ax[1].plot(_data[x_axis_name], _data[lower_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[1].set_xlabel(x_axis_name, fontsize=15)
            ax[1].set_ylabel(lower_y_label.upper(), fontsize=15)
            ax[1].legend(fontsize=10, loc='lower right')

        ax[0].set_title(f"Performance Evaluation of {title}", fontsize=24)
        plt.tight_layout()

        plt.show()
        return plt

    @staticmethod
    def plot_feature_importance(reg_coef, col_names, title):
        """Draw a horizontal bar chart of coefficients sorted ascending.

        Note: this sets ``matplotlib.rcParams['figure.figsize']`` globally.

        Returns:
            The ``pyplot`` module.
        """
        reg_coef = pd.Series(reg_coef, index=col_names)
        reg_coef = reg_coef.sort_values()
        matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
        reg_coef.plot(kind="barh",)
        plt.title(title, fontsize=15)

        return plt

    @staticmethod
    def plot_importance_trending(X_train, feature_importance_matrix, title, offset=3):
        """Plot the mean coefficient of every feature against the ``C`` value.

        Args:
            X_train: Frame whose columns name the features to plot.
            feature_importance_matrix: Frame with a ``C`` column and one
                coefficient column per feature; rows are averaged per ``C``.
            title: Figure title.
            offset: Position along the ``C`` axis at which each line is
                annotated with its feature name. It is clamped to the last
                available point, so a short grid no longer raises ``IndexError``.

        Returns:
            The ``pyplot`` module.
        """
        feature_importance = feature_importance_matrix.groupby('C').agg(['mean'])[[*X_train.columns]]
        feature_importance.columns = X_train.columns.tolist()
        feature_importance['C'] = feature_importance.index

        column_names = X_train.columns
        lbds = feature_importance['C'].tolist()
        coef_matrix = feature_importance[X_train.columns]
        x_lab = 'Lambda'
        y_lab = 'Weight'
        plt.clf()
        plt.figure(figsize=(15, 10))
        for idx, col_name in enumerate(column_names):
            plt.plot(lbds, coef_matrix.iloc[:,idx], 'o-', linewidth=2, label=col_name)
            if lbds:
                anchor = min(offset, len(lbds) - 1)
                plt.annotate(col_name, (lbds[anchor], coef_matrix.iloc[anchor,idx]))

        # The keyword is `fontsize`; the mixed-case `fontSize` spelling is not
        # accepted by current matplotlib releases.
        plt.title(title, fontsize=25)
        plt.xlabel(x_lab)
        plt.ylabel(y_lab)

        plt.legend(loc='upper right')
        plt.tight_layout()

        return plt

In [None]:
from concurrent.futures.thread import ThreadPoolExecutor
import warnings
warnings.filterwarnings("ignore")
from concurrent.futures.thread import ThreadPoolExecutor
from sklearn.metrics import hinge_loss


def loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model_func, param):
    """Build a zero-argument job that fits one parameter combination and scores it.

    Args:
        X_train, y_train: training features and labels handed to the engine.
        X_test, y_test: hold-out features and labels used for scoring.
        model_func: estimator passed to ``AnalysisEngineBuilder.set_engine``.
        param: dict mapping each parameter name to a single-element list.
            It is not modified.

    Returns:
        A callable returning one flat dict with ``accuracy`` (percent), ``loss``,
        ``auc``, one entry per feature coefficient, and one scalar entry per
        parameter.
    """
    def _analyze_param_combination():
        engine = AnalysisEngineBuilder() \
                    .set_X_train(X_train) \
                    .set_y_train(y_train) \
                    .set_X_test(X_test) \
                    .set_y_test(y_test) \
                    .set_param_grid(param) \
                    .set_engine(model_func) \
                    .build()
        model = engine.analyze()

        # Performance scores, computed from column 1 of predict_proba.
        proba = pd.DataFrame(model.predict_proba(X_test))[1]
        # NOTE(review): hinge_loss is documented for decision_function margins;
        # here it receives probabilities. Confirm this is the intended metric.
        loss = hinge_loss(y_test, proba)
        auc = roc_auc_score(y_test, proba)

        coef = pd.Series(model.best_estimator_.coef_[0], index=X_test.columns).to_dict()
        # Unwrap the single-element lists into a NEW dict. The previous code
        # aliased `param` and overwrote the caller's dict in place.
        scalar_param = {key: value[0] for key, value in param.items()}
        return {
            'accuracy': engine.accuracy * 100,
            'loss': loss,
            'auc': auc,
            **coef,
            **scalar_param
        }
    return _analyze_param_combination

# Refactor into the analyzer later on
def calculate_grid_performance(X_train, y_train, X_test, y_test, params, model):
    """Fit and score every combination of ``params`` in a thread pool.

    Args:
        X_train, y_train, X_test, y_test: data forwarded to each job.
        params: dict mapping parameter name to an iterable of candidate values.
        model: estimator forwarded to each job.

    Returns:
        DataFrame with one row per combination, built from the dicts returned by
        the jobs from ``loss_accuracy_analyze_job_builder``.
    """
    from itertools import product

    # Cartesian product of all candidate values. Each value is wrapped in a
    # one-element list because the job passes it on as a parameter grid.
    names = list(params)
    combinations = [
        {name: [value] for name, value in zip(names, values)}
        for values in product(*(params[name] for name in names))
    ]

    # Execute models in threads; results are collected in submission order.
    with ThreadPoolExecutor(max_workers=allocated_cpu) as executor:
        futures = [
            executor.submit(loss_accuracy_analyze_job_builder(
                X_train, y_train, X_test, y_test, model, combination))
            for combination in combinations
        ]
        return pd.DataFrame.from_dict([future.result() for future in futures])

In [None]:
# First-round feature pipeline. Every step is a project transformer defined
# elsewhere in the notebook, applied in the order listed.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_parser', URLParser()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('content_type_extractor', ContentTypeExtractor()),
    ('binary_na_encoder', BinaryNAEncoder(['content_type'])),
    ('chronology_builder', ChronologyBuilder()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
#                                         'has_content_type',
                                        'label'
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
    ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),

])


# NOTE(review): unless the X_train property returns a copy, adding 'label' here
# (and any in-place work done by pipeline steps) modifies the frame held by the
# DataSource singleton. Confirm whether later cells rely on that.
train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train
X_train = pipe.fit_transform(train)
# Takes the target by position: assumes 'label' is still the last column.
y_train = X_train.iloc[:,-1]
# X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('label', axis=1)
print(X_train.columns)


test = DataSource().X_test
test.loc[:,'label'] = DataSource().y_test
# NOTE(review): fit_transform refits every step on the test split, so any fitted
# state (e.g. category frequencies) comes from test data rather than from train.
X_test = pipe.fit_transform(test)
y_test = X_test.iloc[:,-1]
# X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('label', axis=1)
print(X_test.columns)

In [None]:
from sklearn.linear_model import LogisticRegression


start_time = time.time()
# Search space: l1_ratio is 0 plus five log-spaced values in [1e-3, 1];
# C takes 20 log-spaced values in [1e-3, 1e-2]; max_iter is [10, 50].
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Derive minutes/seconds from the raw elapsed seconds. Splitting str(timedelta)
# on ':' and printing fields 1 and 2 silently dropped the hours of long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %d minutes, %.2f seconds ---" % (elapsed_minutes, elapsed_seconds))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Plot the ROC curve from the false/true positive rates exposed by the engine
# that was fitted in the training cell.
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])

In [None]:
# Same grid and estimator settings as the training cell above, redefined so this
# cell can run on its own once X_train/X_test exist.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

# One row per parameter combination: scores, coefficients and parameter values.
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)

In [None]:
# Two-panel plot against C, one line per l1_ratio: 'loss' on top, 'auc' below.
# NOTE(review): the title says "Accuracy" but the lower panel plots 'auc'.
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)

In [None]:
# Coefficient paths across C; offset=15 places each text label at the 16th C value.
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)

#### Try encoding the suffix index label with a logarithm

In [None]:
class LogarithmTransformer(BaseEstimator, TransformerMixin):
    """Replace the selected columns with their natural logarithm.

    A tiny constant is added before the log so that a value of 0 does not raise
    a math domain error. The input frame is left untouched; a transformed copy
    is returned, so running the transformer twice on the same input no longer
    applies the logarithm twice.

    Args:
        columns: list of column names to transform.
    """

    # Shift added before taking the log so that log(0) is defined.
    _EPSILON = 0.00000000001

    def __init__(self, columns):
        self._columns = columns

    @property
    def columns(self):
        """Constructor argument, exposed under its own name for ``get_params``."""
        return self._columns

    def fit(self,x,y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with the configured columns log-transformed."""
        result = x.copy()
        result[self._columns] = (result[self._columns] + self._EPSILON).applymap(math.log)

        return result

# Variant of the first-round pipeline that log-transforms 'suffix' and disables
# the low-variance filter.
# NOTE(review): unlike the earlier pipeline there is no 'url_parser' or
# 'content_type_extractor' step; presumably the columns they add are already
# present on the DataSource frame from a previous cell. Confirm on a fresh kernel.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('binary_na_encoder', BinaryNAEncoder(['content_type'])),
    ('chronology_builder', ChronologyBuilder()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
#                                         'has_content_type',
                                        'label'
                                       ])),
    ('logarithm_transformer', LogarithmTransformer(['suffix'])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),

])

train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train
X_train = pipe.fit_transform(train)
# Takes the target by position: assumes 'label' is still the last column.
y_train = X_train.iloc[:,-1]
# X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('label', axis=1)
print(X_train.columns)


test = DataSource().X_test
test.loc[:,'label'] = DataSource().y_test
# NOTE(review): fit_transform refits every step on the test split.
X_test = pipe.fit_transform(test)
y_test = X_test.iloc[:,-1]
# X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('label', axis=1)
print(X_test.columns)

In [None]:
start_time = time.time()
# Search space: l1_ratio is 0 plus five log-spaced values in [1e-3, 1];
# C takes 20 log-spaced values in [1e-3, 1e-2]; max_iter is [10, 50].
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Derive minutes/seconds from the raw elapsed seconds. Splitting str(timedelta)
# on ':' and printing fields 1 and 2 silently dropped the hours of long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %d minutes, %.2f seconds ---" % (elapsed_minutes, elapsed_seconds))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# ROC curve for the model trained on the log-transformed 'suffix' feature.
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])

In [None]:
# Same grid and estimator settings as the training cell above, redefined so this
# cell can run on its own once X_train/X_test exist.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

# One row per parameter combination: scores, coefficients and parameter values.
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)

In [None]:
# Two-panel plot against C, one line per l1_ratio: 'loss' on top, 'auc' below.
# NOTE(review): the title says "Accuracy" but the lower panel plots 'auc'.
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)

In [None]:
# Coefficient paths across C; offset=15 places each text label at the 16th C value.
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)

#### Try Tong's method

In [None]:
class DummySuffixDescritizer(BaseEstimator, TransformerMixin):
    """Swap ``suffix_idx`` for one-hot columns of a fixed set of suffixes.

    The indicator columns are appended after the existing columns, so any column
    that was last before this step (e.g. ``label``) is no longer last afterwards.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` without ``suffix_idx``, joined with the kept dummy columns."""
        all_dummies = pd.get_dummies(x.suffix_idx)
        suffix_picker = FeaturePicker(['int', 'org', 'gov', 'in', 'eu', 'cn'])
        kept_dummies = suffix_picker.fit_transform(all_dummies)
        without_suffix = x.drop('suffix_idx', axis = 1)

        return without_suffix.join(kept_dummies, how='inner')

# Variant of the first-round pipeline that keeps 'suffix_idx' and expands it into
# dummy columns via DummySuffixDescritizer; the low-variance filter is disabled.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('binary_na_encoder', BinaryNAEncoder(['content_type'])),
    ('chronology_builder', ChronologyBuilder()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix_idx',
                                        'timestamp_coef',
                                        'is_port_access',
#                                         'has_content_type',
                                        'label'
                                       ])),
    ('dummy_suffix_descritizer', DummySuffixDescritizer()),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),

])

# NOTE(review): unless the X_train/X_test properties return copies, adding
# 'label' below modifies the frames held by the DataSource singleton.
train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train
X_train = pipe.fit_transform(train)
# Select the target by name. DummySuffixDescritizer joins the suffix dummy
# columns after 'label', so the last column is a dummy, not the target.
y_train = X_train['label']
X_train = X_train.drop('label', axis=1)
print(X_train.columns)


test = DataSource().X_test
test.loc[:,'label'] = DataSource().y_test
# NOTE(review): fit_transform refits every step on the test split.
X_test = pipe.fit_transform(test)
y_test = X_test['label']
X_test = X_test.drop('label', axis=1)
print(X_test.columns)

In [None]:
start_time = time.time()
# Search space: l1_ratio is 0 plus five log-spaced values in [1e-3, 1];
# C takes 20 log-spaced values in [1e-3, 1e-2]; max_iter is [10, 50].
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Derive minutes/seconds from the raw elapsed seconds. Splitting str(timedelta)
# on ':' and printing fields 1 and 2 silently dropped the hours of long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %d minutes, %.2f seconds ---" % (elapsed_minutes, elapsed_seconds))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# ROC curve for the model trained on the dummy-encoded suffix features.
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])

In [None]:
# Same grid and estimator settings as the training cell above, redefined so this
# cell can run on its own once X_train/X_test exist.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-3, -2, 20)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

# One row per parameter combination: scores, coefficients and parameter values.
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)

In [None]:
# Two-panel plot against C, one line per l1_ratio: 'loss' on top, 'auc' below.
# NOTE(review): the title says "Accuracy" but the lower panel plots 'auc'.
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)

In [None]:
# Coefficient paths across C; offset=15 places each text label at the 16th C value.
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)

### Feature Engineering - Second Round

#### Features in source code

- Restore the training data

In [None]:
# First round pipeline
# Only the feature-building steps are kept (no FeaturePicker / encoder /
# remover), so `result` still carries every column the steps leave in place.

pipe_1st = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('chronology_builder', ChronologyBuilder()),
])


train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train

# `result` is the working frame that the following feature cells extend.
result = pipe_1st.fit_transform(train)
print(type(result))

##### Code length (characters)

In [None]:
class SourceCodeByteCounter(BaseEstimator, TransformerMixin):
    """Add a ``code_size`` column with the length of each row's ``html_text``.

    The size is ``len()`` of the text as a Python string, i.e. a character
    count. Missing HTML is treated as an empty string and yields 0.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, x, y=None):
        """Write ``code_size`` onto ``x`` itself and return it."""
        html_as_text = x.html_text.replace(np.nan, '', regex=True).astype(str)
        x['code_size'] = html_as_text.apply(len)

        return x

# Single-step pipeline used to try SourceCodeByteCounter on the working frame.
pipe = Pipeline([
    ('source_code_byte_counter', SourceCodeByteCounter()),
])

print(type(result))

# NOTE(review): `result` is overwritten with its own transform; this cell needs
# `result` from the "restore" cell above.
result = pipe.fit_transform(result)

result.code_size.head()

###### is HTML5

In [None]:
class HTML5Justifier(BaseEstimator, TransformerMixin):
    """Add an ``is_html5`` flag: 1 when the page starts with ``<!doctype html>``.

    The check ignores line breaks, surrounding whitespace and letter case.
    Missing or empty HTML yields 0.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, x, y=None):
        """Write ``is_html5`` onto ``x`` itself and return it."""
        def _starts_with_html5_doctype(html):
            if not html:
                return int(False)
            flattened = html.replace('\n','').replace('\r','').strip().lower()
            return int(flattened.startswith('<!doctype html>'))

        html_column = x.html_text.replace(np.nan, '', regex=True)
        x['is_html5'] = html_column.apply(_starts_with_html5_doctype)

        return x

# Single-step pipeline used to try HTML5Justifier on the working frame.
pipe = Pipeline([
    ('html5_justifier', HTML5Justifier()),
])

result = pipe.fit_transform(result)

result.is_html5.head()

#### Text Mining

In [None]:
class BeautifulSoupParserBuilder:
    """Fluent builder for a transformer that derives columns from parsed HTML.

    Register ``column name -> function(soup)`` pairs with :meth:`add_lambda`,
    then call :meth:`build` to obtain the transformer.
    """

    class _BeautifulSoupParser(BaseEstimator, TransformerMixin):
        """Parse ``html_text`` once per row and apply every registered function."""

        def __init__(self,_lambda_pair):
            self._lambda_pair = _lambda_pair

        def fit(self,x,y=None):
            """Stateless: nothing is learned from the data."""
            return self

        def transform(self,x,y=None):
            """Add one column per registered function onto ``x`` and return it."""
            result = x
            # Parse every document once; missing HTML is parsed as an empty string.
            soup_handlers = result.html_text \
                    .replace(np.nan, '', regex=True) \
                    .apply(BeautifulSoupParserBuilder._safe_create_parser)

            for col_name, func in self._lambda_pair.items():
                result[col_name] = soup_handlers.apply(func)

            return result

    @staticmethod
    def _safe_create_parser(html_doc):
        """Parse ``html_doc``; fall back to an empty document if parsing fails."""
        try:
            return BeautifulSoup(html_doc, 'html.parser')
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long-running parse.
        except Exception:
            return BeautifulSoup('', 'html.parser')

    def __init__(self):
        self._lambda_pair = dict()

    def add_lambda(self, column_name, lbd):
        """Register ``lbd(soup)`` as the producer of ``column_name``; returns self."""
        self._lambda_pair[column_name] = lbd
        return self

    def build(self):
        """Create the transformer from a snapshot of the registered functions."""
        # Copy the dict so later add_lambda() calls on this builder cannot
        # silently change a transformer that was already built.
        return BeautifulSoupParserBuilder._BeautifulSoupParser(dict(self._lambda_pair))



##### Title Length

In [None]:
from bs4 import BeautifulSoup


def get_title_length(soup):
    """Return the length of the document's ``<title>`` text, or 0 if there is none.

    A missing ``<title>`` and a title whose ``.string`` is empty or ``None``
    both count as length 0.
    """
    title_tag = soup.title
    if not title_tag:
        return 0
    return len(title_tag.string or '')


###### Types of the JS library

- Extract this feature later when running the association rule.

###### Number of JS files

In [None]:
def _is_external_script_src(src):
    """Return True when ``src`` is an absolute http(s) or protocol-relative URL."""
    # Lower-cased so 'HTTPS://...' counts, matching count_hyperlink's handling;
    # '//host/x.js' is protocol-relative and names another host explicitly.
    return src.strip().lower().startswith(('http://', 'https://', '//'))


def count_internal_js_lib(soup):
    """Count ``<script src=...>`` tags whose source is a relative (same-site) path."""
    sources=soup.findAll('script',{"src":True})
    return sum(1 for source in sources if not _is_external_script_src(source['src']))

def count_external_js_lib(soup):
    """Count ``<script src=...>`` tags loaded from an absolute or protocol-relative URL."""
    sources=soup.findAll('script',{"src":True})
    return sum(1 for source in sources if _is_external_script_src(source['src']))


###### Charset

In [None]:
def get_charset(soup):
    """Return the lower-cased ``charset`` of the first ``<meta charset=...>`` tag.

    Quote characters are removed from the value. Returns ``''`` when no such
    tag exists.
    """
    charset_metas = soup.findAll('meta', {"charset": True})
    if len(charset_metas) == 0:
        return ''
    declared = charset_metas[0]['charset'].lower()
    for quote_char in ('\'', '"'):
        declared = declared.replace(quote_char, '')
    return declared


###### iFrame in Body

In [None]:
def has_iframe(soup):
    """Return 1 if the document contains at least one ``<iframe>``, else 0."""
    sources=soup.findAll('iframe')
    # The previous test (`0 == len(sources)`) was inverted: it reported 1 for
    # pages WITHOUT an iframe.
    return int(len(sources) > 0)


###### Number of hyperlinks

In [None]:
def count_hyperlink(soup):
    """Count ``<a>`` tags whose ``href`` starts with ``http`` (case-insensitive).

    Anchors without an ``href`` attribute are ignored.
    """
    total = 0
    for anchor in soup.findAll('a'):
        if not anchor.has_attr('href'):
            continue
        if anchor['href'].lower().startswith('http'):
            total += 1
    return total


###### Drop the records that do not have HTML code

In [None]:
class EmptyHTMLFilter(BaseEstimator, TransformerMixin):
    """Drop the rows whose ``html_text`` is missing (NaN/None)."""

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, x, y=None):
        """Return a new frame containing only rows with a non-null ``html_text``."""
        return x.dropna(subset=['html_text'])

# Keep only the rows of the working frame that actually have HTML.
result = EmptyHTMLFilter().fit_transform(result)

###### Consolidate the pipeline

In [None]:
# Transformer that parses html_text once per row and adds one column per
# registered extractor function.
html_parser = BeautifulSoupParserBuilder() \
    .add_lambda('title_length', get_title_length) \
    .add_lambda('internal_js_cnt', count_internal_js_lib) \
    .add_lambda('external_js_cnt', count_external_js_lib) \
    .add_lambda('charset', get_charset) \
    .add_lambda('has_iframe', has_iframe) \
    .add_lambda('hyperlink_cnt', count_hyperlink) \
    .build()

# Consolidated pipeline: URL features plus the HTML-derived features above.
# NOTE(review): feature_picker asks for 'timestamp_coef', but this pipeline has
# no 'chronology_builder' step (presumably the step that creates it). Confirm
# the column already exists on the input frame.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('source_code_byte_counter', SourceCodeByteCounter()),
    ('html5_justifier', HTML5Justifier()),
    ('empty_html_filter', EmptyHTMLFilter()),
    ('html_parser', html_parser),
    ('feature_picker', FeaturePicker(['protocol_type',
                                      'url_depth',
                                      'has_www',
                                      'subdomain_level',
                                      'param_cnt',
                                      'suffix',
                                      'timestamp_coef',
                                      'is_port_access',
                                      'code_size',
                                      'title_length',
                                      'internal_js_cnt',
                                      'external_js_cnt',
                                      'charset',
                                      'is_html5',
                                      'has_iframe',
                                      'hyperlink_cnt',
                                      'html_text',
                                      'label',
                                       ])),
#     ('logarithm_transformer', LogarithmTransformer(['suffix'])),
#     ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
#         encoding_method='frequency',
#         variables=['protocol_type'])),
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005

])


train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train

result = pipe.fit_transform(train)

# Preview the HTML-derived columns only.
result[[
    'title_length',
    'internal_js_cnt',
    'external_js_cnt',
    'charset',
    'has_iframe',
    'hyperlink_cnt']].head()

###### Remove tags, Tf-Idf Score of Body

###### Tf-Idf Score of Header

### EDA - Second Round

In [None]:
# Strip single and double quotes from the charset values.
# NOTE(review): get_charset already removes both quote characters, so this looks
# redundant. Confirm before removing.
result.loc[:,'charset'] = result.loc[:,'charset'].apply(lambda x: x.replace('\'', '').replace('"', ''))
result.info()

In [None]:
# Frequency of each declared charset ('' means no <meta charset> tag was found).
result.charset.value_counts()

In [None]:
# plot_distribution(result, "Features Distribution", height=2400)

Convert binary features into numeric variables

In [None]:
class FeatureValueMapper(BaseEstimator, TransformerMixin):
    """Replace the values of one column through a lookup dictionary.

    Values that are not keys of ``mapping`` become NaN (``Series.map``
    semantics). The mapping is applied in :meth:`transform`, so both
    ``fit_transform`` and a later ``transform`` on new data produce mapped
    output. The input frame is not modified; a mapped copy is returned.

    Args:
        column_name: name of the column to map.
        mapping: dict from original value to replacement value.
    """

    def __init__(self, column_name, mapping):
        self._column_name = column_name
        self._mapping = mapping

    @property
    def column_name(self):
        return self._column_name

    @property
    def mapping(self):
        return self._mapping

    def fit(self,x,y=None):
        """Stateless: the mapping is fixed at construction time."""
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with ``column_name`` passed through ``mapping``."""
        # The mapping previously ran inside fit() (mutating its input) while
        # transform() was a no-op, so pipe.transform(new_data) never mapped.
        result = x.copy()
        result[self._column_name] = result[self._column_name].map(self._mapping)
        return result


In [None]:
# Second-round pipeline: URL features plus HTML-derived features, with
# protocol_type mapped to 0/1, 'charset' frequency-encoded and the count-like
# columns log-transformed.
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('source_code_byte_counter', SourceCodeByteCounter()),
    ('html5_justifier', HTML5Justifier()),
    ('html_parser', html_parser),
    ('chronology_builder', ChronologyBuilder()),
    ('binary_feature_converter', FeatureValueMapper('protocol_type', {
                                        'http': 1,
                                        'https':0,
                                        })),
    ('feature_picker', FeaturePicker([
                                        'protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'code_size',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'charset',
                                        'is_html5',
                                        'has_iframe',
                                        'hyperlink_cnt',
                                        'label',
                                       ])),
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['charset'])),
    ('logarithm_transformer', LogarithmTransformer([
                                        'suffix',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'hyperlink_cnt',
#                                         'protocol_type',
#                                         'charset'
    ])),

])


# NOTE(review): unless the X_train/X_test properties return copies, adding
# 'label' below modifies the frames held by the DataSource singleton.
train = DataSource().X_train
train.loc[:,'label'] = DataSource().y_train
X_train = pipe.fit_transform(train)
# Takes the target by position: assumes 'label' is still the last column.
y_train = X_train.iloc[:,-1]
# X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('label', axis=1)
print(X_train.columns)


test = DataSource().X_test
test.loc[:,'label'] = DataSource().y_test
# NOTE(review): fit_transform refits every step on the test split, so fitted
# state such as the charset frequencies comes from test data, not from train.
X_test = pipe.fit_transform(test)
y_test = X_test.iloc[:,-1]
# X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('label', axis=1)
print(X_test.columns)

# Writes the `train`/`test` input frames (not X_train/X_test) to the working
# directory.
train.to_csv('train_lr.csv')
test.to_csv('test_lr.csv')

In [None]:
# X_train_copy = X_train.copy()
# X_test_copy = X_test.copy()

In [None]:
# X_train = X_train.drop(['code_size', 'is_html5'], axis=1)
# X_test = X_test.drop(['code_size', 'is_html5'], axis=1)

In [None]:
# X_train = X_train_copy
# X_test = X_test_copy

In [None]:
start_time = time.time()
# Search space: l1_ratio is 0 plus five log-spaced values in [1e-3, 1];
# C takes 50 log-spaced values in [1e-4, 1e-1]; max_iter is [10, 50].
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-4, -1, 50)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Derive minutes/seconds from the raw elapsed seconds. Splitting str(timedelta)
# on ':' and printing fields 1 and 2 silently dropped the hours of long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %d minutes, %.2f seconds ---" % (elapsed_minutes, elapsed_seconds))

In [None]:
# Same grid and estimator settings as the training cell above (C now spans
# 1e-4..1e-1 in 50 steps), redefined so this cell can run on its own.
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-4, -1, 50)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

# One row per parameter combination: scores, coefficients and parameter values.
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)

In [None]:
# Two-panel plot against C, one line per l1_ratio: 'loss' on top, 'auc' below.
# NOTE(review): the title says "Accuracy" but the lower panel plots 'auc'.
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)

In [None]:
# Coefficient paths across C; the positional 40 is `offset`, i.e. each text label
# is placed at the 41st of the 50 C values.
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', 40)

- Select the hyperparameter and train again

In [None]:
start_time = time.time()
# Single hyperparameter combination chosen after the grid exploration above.
param_lr = {
    'l1_ratio': [0.001],
    'C': [0.055],
    'max_iter': [80],
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Derive minutes/seconds from the raw elapsed seconds. Splitting str(timedelta)
# on ':' and printing fields 1 and 2 silently dropped the hours of long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %d minutes, %.2f seconds ---" % (elapsed_minutes, elapsed_seconds))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it; it looks like leftover debugging.
engine_lr.fpr

# ROC curve for the model refitted with the selected hyperparameters.
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])

In [None]:
# Bar chart of the fitted coefficients (first row of coef_), labelled with the
# training feature names.
Visualizer.plot_feature_importance(
    model_lr.best_estimator_.coef_[0], X_train.columns, 
    "Coefficients in the Logistic Regression")

### EDA - Third Round

#### Explore the features from MAG dataset

In [None]:
# NOTE(review): @singleton means a second NanToZeroConverter(...) call with a
# different column list may return the first instance unchanged. Confirm that a
# single shared instance is really intended for a configurable transformer.
@singleton
class NanToZeroConverter(BaseEstimator, TransformerMixin):
    """Fill missing values with 0 in the named columns that exist in the frame.

    Args:
        columns: either a list of column names, or the first column name when
            the names are passed as separate positional arguments.
        *more_columns: further column names (varargs form, as used by the
            pipeline in this notebook).

    NOTE(review): scikit-learn's ``get_params``/``clone`` do not support
    varargs constructors; prefer passing one list if the pipeline is ever cloned.
    """

    def __init__(self, columns, *more_columns):
        # A bare string is one column name, not an iterable of characters.
        if isinstance(columns, str):
            columns = [columns]
        self._columns = [*columns, *more_columns]

    def fit(self,x,y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self,x,y=None):
        """Fill NaN with 0 in place on ``x`` for every configured column it has."""
        result = x
        # Filter into a local list: overwriting self._columns here would make
        # the instance permanently forget columns missing from one frame.
        present_columns = [col_name for col_name in self._columns if col_name in result.columns]
        for col_name in present_columns:
            result.loc[:, col_name] = result[col_name].fillna(0)
        return result


In [None]:
# Features explored in this round (the "MAG dataset" section). The same names
# are needed twice below: once for NaN filling and once for feature selection.
mag_feature_columns = [
    'total_num_of_paper_citing',
    'total_num_of_author_citing',
    'total_num_of_affiliation_citing',
    'total_num_of_journal_citing',
    'total_num_of_author_self_citation',
    'total_num_of_affiliation_self_citation',
    'total_num_of_journal_self_citation',
    'avg_year',
    'min_year',
    'max_year',
    'median',
    'num_of_author',
    'num_of_author_citing',
    'num_of_affiliation_citing',
    'num_of_journal_citing',
    'avg_hindex',
    'first_author_hindex',
    'last_author_hindex',
    'avg_mid_author_hindex',
    'paper_unique_affiliation',
]

pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('source_code_byte_counter', SourceCodeByteCounter()),
    ('html5_justifier', HTML5Justifier()),
    ('html_parser', html_parser),
    ('chronology_builder', ChronologyBuilder()),
    ('binary_feature_converter', FeatureValueMapper('protocol_type', {
                                        'http': 1,
                                        'https':0,
                                        })),
    # NanToZeroConverter.__init__ takes a single `columns` argument, so the
    # names must be handed over as one list; passing them as separate
    # positional arguments raises a TypeError.
    ('nan_to_Zero_converter', NanToZeroConverter(mag_feature_columns)),

    ('feature_picker', FeaturePicker([
                                        'protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'code_size',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'charset',
                                        'is_html5',
                                        'has_iframe',
                                        'hyperlink_cnt',

                                        *mag_feature_columns,

                                        # Kept last: the label travels through the pipeline with the features.
                                        'label',
                                       ])),
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['charset'])),
    ('logarithm_transformer', LogarithmTransformer([
                                        'suffix',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'hyperlink_cnt',
    ])),

])

In [None]:
# Attach the label to a copy of the training features so that it travels
# through the pipeline together with its rows. Working on a copy keeps the
# frame obtained from DataSource unchanged, so the cell can be re-run safely.
train = DataSource().X_train.copy()
train.loc[:, 'label'] = DataSource().y_train
train_transformed = pipe.fit_transform(train)
# Select the label by name instead of relying on it being the last column.
y_train = train_transformed['label']
X_train = train_transformed.drop('label', axis=1)
print(X_train.columns)


test = DataSource().X_test.copy()
test.loc[:, 'label'] = DataSource().y_test
# Apply the pipeline as fitted on the training split. Calling fit_transform
# here would re-fit every step (e.g. the charset frequency encoding) on the
# test data, so the test set would be encoded with its own statistics.
test_transformed = pipe.transform(test)
y_test = test_transformed['label']
X_test = test_transformed.drop('label', axis=1)
# NOTE(review): charset values that never occur in the training split may come
# out of the frequency encoder as NaN - confirm for the installed version.
# A training frequency of 0 is the natural value for them.
X_test = X_test.fillna({'charset': 0})
print(X_test.columns)

In [None]:
start_time = time.time()

# Hyperparameter grid for the elastic-net logistic regression.
param_lr = {
    # 0 plus five log-spaced values between 1e-3 and 1
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    # fifty log-spaced values between 1e-4 and 1e-1
    'C': sorted(np.logspace(-4, -1, 50)),
    # np.arange(10, 80, 40) yields the two values 10 and 50
    'max_iter': np.arange(10,80,40),
}

# Elastic-net logistic regression; only the saga solver supports this penalty.
lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

# Report the wall-clock time as total minutes plus remaining seconds.
# Splitting str(timedelta) on ':' and printing only the minutes field
# silently dropped whole hours (and days) from the report.
elapsed_minutes, remaining_seconds = divmod(time.time() - start_time, 60)
print("--- %02d minutes, %.2f seconds ---" % (elapsed_minutes, remaining_seconds))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# ROC curve of the model trained above, drawn from the false/true positive
# rates exposed by the engine. (A bare `engine_lr.fpr` expression used to sit
# here; it was not the last expression of the cell, so it displayed nothing.)
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])

In [None]:
# Estimator evaluated at every grid point: elastic-net logistic regression
# fitted with the saga solver.
lr = LogisticRegression(
    random_state=seed,
    penalty='elasticnet',
    solver='saga',
    multi_class='ovr',
    warm_start=False,
    n_jobs=allocated_cpu,
)

# Grid to evaluate: six l1_ratio values (0 plus five log-spaced ones),
# fifty log-spaced C values and the two max_iter values 10 and 50.
param_lr = dict(
    l1_ratio=[0, *np.logspace(-3, 0, 5)],
    C=sorted(np.logspace(-4, -1, 50)),
    max_iter=np.arange(10, 80, 40),
)

loss_accuracy_matrix = calculate_grid_performance(
    X_train, y_train, X_test, y_test, param_lr, lr
)

In [None]:
# Two-panel view of the grid results: one curve per l1_ratio, plotted against C,
# with the 'loss' column on the upper axis and the 'auc' column on the lower one.
Visualizer.plot_performance(
    data=loss_accuracy_matrix,
    legend_type_name='l1_ratio',
    x_axis_name='C',
    upper_y_label='loss',
    lower_y_label='auc',
    title='Loss& Accuracy - Logistic Regression',
)

In [None]:
# Plot the per-feature weights recorded in the grid results (columns named after X_train).
# NOTE(review): the meaning of the trailing 40 is defined by Visualizer.plot_importance_trending - confirm.
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', 40)