# Source Code Analysis

## Initialization

In [1]:
import os
import sys

def add_path(path):
    """Make `path` importable by putting it at the front of `sys.path`.

    Does nothing when `path` is already listed.

    :param path: directory to add to the module search path
    :return:     None
    """
    if path not in sys.path:
        # Insert once, at the front, so modules under `path` take precedence.
        # A second copy at the end of the list would be a redundant duplicate.
        sys.path.insert(0, path)
# Directories registered on the import path: a fixed Anaconda site-packages
# location and the `lib` folder under the current working directory.
site_packages_dir = '/home/jjian03/anaconda3/lib/python3.7/site-packages'
local_lib_dir = f'{os.path.abspath(".")}/lib'
add_path(site_packages_dir)
add_path(local_lib_dir)


### Load Data

In [2]:
from lib.Repository import *
from lib.Utility import *
from lib.modeling import *
from lib.preprocessing import *
from lib.preprocessing.HTMLParser import html_parser
from lib.viz import *

Allocated 16 CPUs


In [3]:
import time
import datetime
start_time = time.time()
raw_data = DataSource(job_name='generate_truncated_data', cache_name='trunc_data.csv', truncated=True, fract=.04).raw_data

raw_data.info()

print(f'raw_data: {shape(raw_data)}')

# Report wall-clock time as total minutes plus leftover seconds. divmod keeps
# whole hours inside the minute count; splitting str(timedelta) on ':' and
# printing only the minute field would silently drop them for long runs.
elapsed_minutes, elapsed_seconds = divmod(time.time() - start_time, 60)
print("--- %02d minutes, %.2f seconds ---" % (int(elapsed_minutes), elapsed_seconds))

Sample Size - raw_data: 14673
Initialized
<class 'pandas.core.frame.DataFrame'>
Int64Index: 14673 entries, 0 to 14672
Data columns (total 36 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      14673 non-null  object 
 1   url                                     14673 non-null  object 
 2   actual_scrape_url                       14673 non-null  object 
 3   first_appear                            14673 non-null  int64  
 4   first_available_timestamp               0 non-null      float64
 5   last_available_timestamp                14673 non-null  int64  
 6   header                                  14673 non-null  object 
 7   html_text                               14673 non-null  object 
 8   comment                                 14673 non-null  object 
 9   from_waybackmachine                     14673 non-null  int64  
 10  http_status_code

### Train Test Split

#### 6th Edition - Combine suffix dummy with MAG

In [4]:
from feature_engine import categorical_encoders


# Columns named in BOTH the 'nan_to_Zero_converter' step and the
# 'feature_picker' step below (same twenty names, same order).
_mag_columns = [
    'total_num_of_paper_citing',
    'total_num_of_author_citing',
    'total_num_of_affiliation_citing',
    'total_num_of_journal_citing',
    'total_num_of_author_self_citation',
    'total_num_of_affiliation_self_citation',
    'total_num_of_journal_self_citation',
    'avg_year',
    'min_year',
    'max_year',
    'median',
    'num_of_author',
    'num_of_author_citing',
    'num_of_affiliation_citing',
    'num_of_journal_citing',
    'avg_hindex',
    'first_author_hindex',
    'last_author_hindex',
    'avg_mid_author_hindex',
    'paper_unique_affiliation',
]

# Columns handed to 'feature_picker' ahead of the shared group above.
_url_html_columns = [
    'protocol_type',
    'url_depth',
    'has_www',
    'subdomain_level',
    'param_cnt',
    'suffix_idx',
    'is_port_access',
    'code_size',
    'title_length',
    'internal_js_cnt',
    'external_js_cnt',
    'charset',
    'is_html5',
    'has_iframe',
    'hyperlink_cnt',
    'first_appear',
]

# Pipeline steps, listed in the order they are passed to Pipeline.
_pipeline_steps = [
    ('label_builder', TobitLabelBuilder()),
    ('url_parser', URLParser()),
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('binary_na_encoder', BinaryNAEncoder(['content_type'])),
    ('html_parser', html_parser),
    # 'http' maps to 1 and 'https' to 0 for the renamed protocol column.
    ('binary_feature_converter', FeatureValueMapper(
        'protocol_type',
        {'http': 1, 'https': 0},
    )),
    ('nan_to_Zero_converter', NanToZeroConverter(_mag_columns)),
    # Picker list: first group, then the shared group, then 'label' last.
    ('feature_picker', FeaturePicker([*_url_html_columns, *_mag_columns, 'label'])),
    ('dummy_suffix_descritizer', DummySuffixDescritizer()),
    ('feature_remover', FeatureRemover(['is_port_access'])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['charset'],
    )),
    ('standard_scaler', TobitCustomizedStandardizer(norm='l2')),
]

pipe = Pipeline(_pipeline_steps)

# Fit the pipeline, transform the data, and write the result to CSV.
# NOTE(review): this builds a fresh DataSource() with default arguments instead
# of reusing the `raw_data` frame loaded earlier — confirm that is intended.
cleaned_data = pipe.fit_transform(DataSource().raw_data)
cleaned_data.to_csv('trunc_data_cleaned.csv')


In [None]:
def _map_to_pandas(rdds):
    """ Needs to be here due to pickling issues """
    return [pd.DataFrame(list(rdds))]

def toPandas(df, n_partitions=None):
    """
    Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
    repartitioned if `n_partitions` is passed.

    Each partition is converted to a pandas frame by `_map_to_pandas`; the per-partition
    frames are then concatenated under one fresh 0..n-1 index. When no partition holds any
    row, an empty frame carrying `df.columns` is returned.

    :param df:              pyspark.sql.DataFrame
    :param n_partitions:    int or None
    :return:                pandas.DataFrame
    """
    if n_partitions is not None:
        df = df.repartition(n_partitions)
    partition_frames = df.rdd.mapPartitions(_map_to_pandas).collect()
    # An empty partition yields a frame without columns; skip those so they
    # cannot break the column assignment below.
    partition_frames = [frame for frame in partition_frames if not frame.empty]
    if not partition_frames:
        return pd.DataFrame(columns=df.columns)
    # ignore_index avoids repeated labels (every partition frame starts at 0).
    df_pand = pd.concat(partition_frames, ignore_index=True)
    df_pand.columns = df.columns
    return df_pand

def load_dataset(spark, path, name):
    """Read the parquet data at `path` and register it as temp view `name`.

    Uses `createOrReplaceTempView`, the replacement for `registerTempTable`,
    which has been deprecated since Spark 2.0.

    :param spark:   session object exposing `read.parquet`
    :param path:    location of the parquet data
    :param name:    view name to register for SQL queries
    :return:        None (the registration call returns nothing)
    """
    return spark.read.parquet(path).createOrReplaceTempView(name)


# Build (or reuse) the Spark session; settings are applied in the order listed.
spark = (
    SparkSession.builder
    .config('spark.app.name', 'test_batch_pandas_export')
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.dynamicAllocation.maxExecutors', '50')
    .config('spark.dynamicAllocation.executorIdleTimeout', '30s')
    .config('spark.driver.maxResultSize', '8g')
    .config('spark.driver.memory', '50g')
    .config('spark.executor.memory', '10g')
    .config('spark.task.maxFailures', '3')
    .config('spark.yarn.am.memory', '50g')
    .config('spark.yarn.max.executor.failures', '3')
    .config('spark.kryoserializer.buffer.max', '1024m')
    .config('spark.yarn.executor.memoryOverhead', '50g')
    .getOrCreate()
)

# Context handles derived from the session; `spark_sql` runs the SQL below.
sc = spark.sparkContext
spark_sql = SQLContext(sc)

# (parquet path, table name) pairs, registered in this order.
for parquet_path, table_name in (
    ('/user/jjian03/WebResourceQuality.parquet', 'web_resource_quality'),
    ('/user/jjian03/WebResourceQuality_pmid.parquet', 'web_resource_quality_pmid'),
    ('/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Papers.parquet', 'Paper'),
    ('/user/lliang06/icon/MAG_publication_features.parquet', 'mag'),
):
    load_dataset(spark, parquet_path, table_name)

seed = 77
fract = 0.00003

# Web-resource rows joined (id -> doi -> paperId) to the MAG feature table,
# restricted to rows labelled "0" or "1" with a first_appear value and a URL
# that does not contain "doi.org".
sample_query = '''
        SELECT wr.id
            , wr.url
            , wr.actual_scrape_url
            , wr.first_appear
            , wr.first_available_timestamp
            , wr.last_available_timestamp
            , wr.header
            , wr.html_text
            , wr.comment
            , wr.from_waybackmachine
            , wr.http_status_code
            , wr.original_check_failure
            , wr.original_check_error_log
            , wr.terminate_reason
            , wr.terminate_reason_error_log

            , m.paperId
            , m.total_num_of_paper_citing
            , m.total_num_of_author_citing
            , m.total_num_of_affiliation_citing
            , m.total_num_of_journal_citing
            , m.total_num_of_author_self_citation
            , m.total_num_of_affiliation_self_citation
            , m.total_num_of_journal_self_citation
            , m.avg_year
            , m.min_year
            , m.max_year
            , m.median
            , m.num_of_author
            , m.num_of_author_citing
            , m.num_of_affiliation_citing
            , m.num_of_journal_citing
            , m.avg_hindex
            , m.first_author_hindex
            , m.last_author_hindex
            , m.avg_mid_author_hindex
            , m.paper_unique_affiliation

        FROM web_resource_quality wr
        JOIN web_resource_quality_pmid wr_doi ON wr.id = wr_doi.id
        JOIN Paper p ON wr_doi.doi = p.doi
        JOIN mag m ON p.paperId = m.paperId
        WHERE wr.label IS NOT NULL
        AND wr.label IN ("0", "1")
        AND isNaN(wr.label) = false
        AND wr.first_appear IS NOT NULL
        AND isNaN(wr.first_appear) = false
        AND lower(wr.url) NOT LIKE "%doi.org%"
    '''

# Order by a seeded random column, then sample the fraction `fract` without
# replacement using the same seed.
raw_data = (
    spark_sql.sql(sample_query)
    .orderBy(fn.rand(seed=seed))
    .sample(withReplacement=False, fraction=fract, seed=seed)
)

In [None]:
# Display the sampled Spark DataFrame object as this cell's output.
raw_data

In [None]:
# Collect the sampled rows into a local pandas frame, using 20 partitions.
toPandas(raw_data, n_partitions=20)

In [None]:
from typing import Iterator

import pandas as pd

from pyspark.sql.functions import pandas_udf

# spark = SparkSession.builder. \
#                 config('spark.app.name', 'elastic_net_reg'). \
#                 config('spark.dynamicAllocation.enabled', 'true'). \
#                 config('spark.dynamicAllocation.maxExecutors', '50'). \
#                 config('spark.dynamicAllocation.executorIdleTimeout', '30s'). \
#                 config('spark.driver.maxResultSize', '8g'). \
#                 config('spark.driver.memory', '50g'). \
#                 config('spark.executor.memory', '10g'). \
#                 config('spark.task.maxFailures', '3'). \
#                 config('spark.yarn.am.memory', '50g'). \
#                 config('spark.yarn.max.executor.failures', '3'). \
#                 config('spark.kryoserializer.buffer.max', '1024m'). \
#                 config('spark.yarn.executor.memoryOverhead', '50g'). \
#                 getOrCreate()
print(spark)

# Three-row pandas frame with a single column "x", mirrored into Spark.
pdf = pd.DataFrame({"x": [1, 2, 3]})
df = spark.createDataFrame(pdf)

# Iterator-of-Series pandas UDF declared with a "long" return type.
@pandas_udf("long")
def plus_one(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Yield each incoming Series with 1 added to every element."""
    yield from (batch + 1 for batch in iterator)

# Apply the UDF to column "x" and print the resulting rows.
incremented = df.select(plus_one("x"))
incremented.show()