In [1]:
import os

import findspark

# Must run before the pyspark imports below. findspark.init() is handed the
# cluster's Spark 2 installation directory; presumably this is what makes the
# `pyspark` package importable in this kernel -- confirm against the findspark docs.
findspark.init('/opt/cloudera/parcels/SPARK2/lib/spark2/')
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession


In [2]:
# Build (or reuse) the Spark session for this notebook. All values are passed
# as strings, exactly as Spark configuration properties.
spark = (
    SparkSession.builder
    # Application name shown for this job.
    .config('spark.app.name', '[Website Quality] - Inspect disproportion ratio')
    # Dynamic executor allocation settings.
    .config('spark.dynamicAllocation.enabled', 'true')
    .config('spark.dynamicAllocation.maxExecutors', '50')
    .config('spark.dynamicAllocation.executorIdleTimeout', '30s')
    # Driver / executor sizing.
    .config('spark.driver.maxResultSize', '8g')
    .config('spark.driver.memory', '50g')
    .config('spark.executor.memory', '10g')
    # Failure limits and YARN-specific settings.
    .config('spark.task.maxFailures', '3')
    .config('spark.yarn.am.memory', '50g')
    .config('spark.yarn.max.executor.failures', '3')
    .config('spark.kryoserializer.buffer.max', '1024m')
    .config('spark.yarn.executor.memoryOverhead', '50g')
    .getOrCreate()
)

# Handles derived from the session: the underlying SparkContext and the
# SQLContext used to run the SQL queries in later cells.
sc = spark.sparkContext
spark_sql = SQLContext(sc)

def load_dataset(spark, path, name):
    """Read a Parquet dataset and expose it to Spark SQL as a temporary view.

    Args:
        spark: Object whose ``read.parquet(path)`` returns a DataFrame
            (in this notebook, the ``SparkSession``).
        path: Location of the Parquet dataset to read.
        name: View name under which the data becomes queryable via SQL.

    Returns:
        The loaded DataFrame. Callers that only need the view can ignore it;
        a bare call as the last line of a cell will display its repr.
    """
    dataset = spark.read.parquet(path)
    # registerTempTable has been deprecated since Spark 2.0 and always returns
    # None; createOrReplaceTempView is its documented replacement.
    dataset.createOrReplaceTempView(name)
    return dataset

# (temp-view name, Parquet location) for every table the SQL below refers to;
# registered in this order.
dataset_locations = (
    ('web_resource_quality', '/user/jjian03/WebResourceQuality.parquet'),
    ('web_resource_quality_pmid', '/user/jjian03/WebResourceQuality_pmid.parquet'),
    ('Paper', '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Papers.parquet'),
    ('mag', '/user/lliang06/icon/MAG_publication_features.parquet'),
)
for view_name, parquet_path in dataset_locations:
    load_dataset(spark, parquet_path, view_name)


In [3]:
# Count rows per label (0 / 1) for web resources that can be linked, via the
# pmid table (by id) and Paper (by doi), to a row in `mag` (by paperId).
# Rows need a non-null, non-NaN label and first_appear, and URLs containing
# "doi.org" are excluded.
label_count_query = '''
        SELECT wr.label as Label, count(wr.id) as Cnt 
        FROM web_resource_quality wr
        JOIN web_resource_quality_pmid wr_doi ON wr.id = wr_doi.id
        JOIN Paper p ON wr_doi.doi = p.doi
        JOIN mag m ON p.paperId = m.paperId
        WHERE wr.label IS NOT NULL
        AND wr.label IN (0, 1)
        AND isNaN(wr.label) = false
        AND wr.first_appear IS NOT NULL
        AND isNaN(wr.first_appear) = false
        AND lower(wr.url) NOT LIKE "%doi.org%"
        GROUP BY wr.label
    '''
label_counts = spark_sql.sql(label_count_query)
# The grouped result is pulled to the driver as a pandas DataFrame
# (one row per label).
raw_data = label_counts.toPandas()


In [4]:
# Add each label's share of the total count as a `perc` column (in place),
# then display the frame.
total_count = raw_data['Cnt'].sum()
raw_data['perc'] = raw_data['Cnt'] / total_count
raw_data


Unnamed: 0,Label,Cnt,perc
0,1,1778130,0.904665
1,0,187382,0.095335
