# Feature Extraction

## Create Spark Session

In [None]:
import findspark

# findspark.init must run before the pyspark imports below: it is given the
# Spark 2.3 parcel location so that the bundled pyspark can be located.
findspark.init('/opt/cloudera/parcels/SPARK2-2.3.0.cloudera3-1.cdh5.13.3.p0.458809/lib/spark2/')

from pyspark.sql import SQLContext, SparkSession

try:
    # Reuse a session that already exists in this namespace, if any.
    print(spark.version)
except NameError:
    # No `spark` yet: build one from the settings below, applied in order.
    session_settings = (
        ('spark.app.name', ''),
        ('spark.dynamicAllocation.enabled', 'true'),
        ('spark.dynamicAllocation.maxExecutors', '50'),
        ('spark.dynamicAllocation.executorIdleTimeout', '30s'),
        ('spark.driver.maxResultSize', '8g'),
        ('spark.driver.memory', '8g'),
        ('spark.executor.memory', '4g'),
        ('spark.task.maxFailures', '3'),
        ('spark.yarn.am.memory', '8g'),
        ('spark.yarn.max.executor.failures', '3'),
        ('spark.kryoserializer.buffer.max', '1024m'),
    )
    builder = SparkSession.builder
    for setting, value in session_settings:
        builder = builder.config(setting, value)
    spark = builder.getOrCreate()
    print(spark.version)

# Handles used by the rest of the notebook: the context and a SQL entry point.
sc = spark.sparkContext
spark_sql = SQLContext(sc)

## Load Dataset

In [None]:
def load_dataset(spark, path: str, name: str) -> None:
    """Read a parquet dataset and expose it to Spark SQL as a temporary view.

    Args:
        spark: Session object whose ``read.parquet`` loads the data.
        path: Location of the parquet dataset.
        name: View name that SQL queries use to refer to the dataset.
    """
    # registerTempTable has been deprecated since Spark 2.0;
    # createOrReplaceTempView is its replacement and likewise overwrites an
    # existing view of the same name, so re-running the cell stays safe.
    spark.read.parquet(path).createOrReplaceTempView(name)
    
# Register the MAG parquet files under the table names used by the feature
# query, in the same order as before.
mag_tables = [
    ('Paper', '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Papers.parquet'),
    ('PaperReference', '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/PaperReferences.parquet'),
    ('PaperAuthorAffiliation', '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/AuthorAffiliation.parquet'),
]
for table_name, parquet_path in mag_tables:
    load_dataset(spark, parquet_path, table_name)
# load_dataset(spark, '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Authors.parquet', 'Author')
# load_dataset(spark, '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Affiliations.parquet', 'Affiliation')


## Extract Features

In [None]:
import os


# Destination of the extracted per-paper reference features.
user_dir = '/user/jjian03'
file_name = 'PaperReferCount.parquet'
file_dir = os.path.join(user_dir, file_name)


# Query outline:
#   * paper_meta: for every paper, the number of author rows and the number
#     of distinct affiliations found in PaperAuthorAffiliation.
#   * main select: for every paper, aggregate paper_meta over the papers it
#     references (PaperReference.PaperReferenceId). Papers without matching
#     references get a count of 0 and NULL sums because of the LEFT JOINs.
# NOTE(review): author_cnt counts rows, not DISTINCT AuthorId, unlike org_cnt;
# if an author can appear with several affiliations on one paper this
# over-counts authors - confirm the intended definition.
#
# PySpark's DataFrameWriter.mode takes the save mode as a string; the Scala
# `SaveMode.Overwrite` enum is not defined in Python and raised NameError.
spark_sql.sql('''
    WITH paper_meta AS (
        SELECT 
            p.PaperId AS paper_id 
            , count(p_auth_org.AuthorId) AS author_cnt 
            , count(DISTINCT p_auth_org.AffiliationId) AS org_cnt 
        FROM Paper p
        LEFT JOIN PaperAuthorAffiliation p_auth_org ON p_auth_org.PaperId = p.PaperId
        GROUP BY p.PaperId
    )
    SELECT 
        orig.PaperId as paper_id 
        , sum(ref_source.org_cnt) as cnt_org_cited 
        , count(ref_source.paper_id) as cnt_paper_cited 
        , sum(ref_source.author_cnt) as cnt_author_cited 
    FROM Paper orig
    LEFT JOIN PaperReference ref ON orig.PaperId = ref.PaperId
    LEFT JOIN paper_meta ref_source ON ref.PaperReferenceId = ref_source.paper_id
    GROUP BY orig.PaperId
''') \
.write.mode('overwrite').parquet(file_dir)
# To preview instead of writing, swap the line above for `.limit(1).toPandas()`.

print(f'Success! Dir -> {file_dir}')


In [None]:
# Register the previously written feature file as the PaperReferCount table.
load_dataset(
    spark=spark,
    path='/user/jjian03/PaperReferCount.parquet',
    name='PaperReferCount',
)

In [None]:
# Show the first ten rows of the extracted features as a pandas DataFrame.
preview_query = '''
    SELECT *
    FROM PaperReferCount
'''
spark_sql.sql(preview_query).limit(10).toPandas()