# Data Patch

In [1]:
import sys
sys.path.insert(0, '/home/jjian03/anaconda3/lib/python3.7/site-packages')

## Read records line by line

In [2]:
import json
import numpy as np
import pandas as pd
from datetime import datetime
import gc


class Formatter:
    @staticmethod
    def get_timestamp(format="%Y/%m/%d %H:%M:%S"):
        return str((datetime.now().strftime(format)))

def load_json(buffer, path, buffer_size=5000):
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            buffer.append(parse_json(json.loads(line)))
            if (len(buffer)+1) % buffer_size == 0:
                df = integrate_dataframe(buffer)
                buffer.clear()
                yield df
                gc.collect()

        if len(buffer) > 0:
            df = integrate_dataframe(buffer)
            buffer.clear()
            yield df

def safe_get_attr(json_obj, key):
    return json_obj[key] if key in json_obj else None

def parse_json(json_obj):
    _id = [v for k, v in safe_get_attr(json_obj, '_id').items()][0]
    df = pd.DataFrame({
        'id': _id

        , 'url': safe_get_attr(json_obj, 'url')
        , 'actual_scrape_url': safe_get_attr(json_obj, 'actual_scrape_url')

        , 'first_appear': safe_get_attr(json_obj, 'first_appear')
        , 'first_available_timestamp': safe_get_attr(json_obj, 'first_available_timestamp')
        , 'last_available_timestamp': safe_get_attr(json_obj, 'last_available_timestamp')

        , 'header': safe_get_attr(json_obj, 'header')
        , 'html_text': safe_get_attr(json_obj, 'html_text')
        , 'comment': safe_get_attr(json_obj, 'comment')

        , 'from_waybackmachine': safe_get_attr(json_obj, 'from_waybackmachine')
        , 'http_status_code': safe_get_attr(json_obj, 'http_status_code')

        , 'original_check_failure': safe_get_attr(json_obj, 'original_check_failure')
        , 'original_check_error_log': safe_get_attr(json_obj, 'original_check_error_log')
        , 'terminate_reason': safe_get_attr(json_obj, 'terminate_reason')
        , 'terminate_reason_error_log': safe_get_attr(json_obj, 'terminate_reason_error_log')

        , 'label': safe_get_attr(json_obj, 'label')
    }, index=[_id])

    # Type conversion for numerical columns
    for col_name in ['first_appear'
                     , 'first_available_timestamp'
                     , 'last_available_timestamp'
                     , 'from_waybackmachine'
                     , 'http_status_code'
                     , 'label'
                    ]:
        df.loc[:, col_name]= df[col_name].fillna(np.nan).astype(float)

    for col_name in ['id'
                     , 'url'
                     , 'actual_scrape_url'
                     , 'header'
                     , 'html_text'
                     , 'comment'
                     , 'original_check_failure'
                     , 'original_check_error_log'
                     , 'terminate_reason'
                     , 'terminate_reason_error_log'
                    ]:
        df.loc[:, col_name] = df[col_name].astype(str)
    return df

def integrate_dataframe(buffer):
    df = pd.DataFrame()
    for row in buffer:
        df = df.append(row)
    return df


In [3]:
json_path = '/home/jjian03/data/website_quality_repository.json'

buffer = list()
for item in load_json(buffer, json_path):
    display(item.head())
    break

Unnamed: 0,id,url,actual_scrape_url,first_appear,first_available_timestamp,last_available_timestamp,header,html_text,comment,from_waybackmachine,http_status_code,original_check_failure,original_check_error_log,terminate_reason,terminate_reason_error_log,label
5ecd87d6150a1889d7f738cd,5ecd87d6150a1889d7f738cd,https://us.sagepub.com/en-us/nam/open-access-a...,https://us.sagepub.com/en-us/nam/open-access-a...,,,,"{""connection"": ""keep-alive"", ""content-length"":...","<!doctype html>\n<html xmlns=""http://www.w3.or...",,0,200.0,,,,,1.0
5ecd87d6150a1889d7f738ce,5ecd87d6150a1889d7f738ce,http://bioconductor.org/,http://bioconductor.org/,,,20200610000000.0,"{""content-type"": ""text/html"", ""content-length""...","<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,0,200.0,,,,,1.0
5ecd87d6150a1889d7f738cf,5ecd87d6150a1889d7f738cf,http://www.r-project.org/,http://www.r-project.org/,,,,"{""date"": ""wed, 27 may 2020 04:28:28 gmt"", ""ser...","<!DOCTYPE html>\n<html lang=""en"">\n <head>\n ...",,0,200.0,,,,,1.0
5ecd87d6150a1889d7f738d0,5ecd87d6150a1889d7f738d0,https://earray.chem.agilent.com/earray/,http://web.archive.org/web/20200420043336/http...,2009.0,,20200420000000.0,"{""server"": ""nginx/1.15.8"", ""date"": ""thu, 28 ma...","<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",HTTPSConnectionPool(host='earray.chem.agilent....,0,200.0,exception,ReadTimeout,,,0.0
5ecd87d6150a1889d7f738d1,5ecd87d6150a1889d7f738d1,http://www.partek.com/?q=partekgs,http://www.partek.com/?q=partekgs,,,,"{""server"": ""nginx/1.14.0 (ubuntu)"", ""date"": ""t...","<!DOCTYPE html>\n<!--[if IE 8 ]> <html lang=""e...",,0,200.0,,,,,1.0


## Connect to Spark

In [4]:
import findspark
findspark.init('/opt/cloudera/parcels/SPARK2-2.3.0.cloudera3-1.cdh5.13.3.p0.458809/lib/spark2/')
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

try:
    print(spark.version)
except NameError as e:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.\
        config('spark.app.name', 'mongodb_migration').\
        config('spark.dynamicAllocation.enabled','true').\
        config('spark.dynamicAllocation.maxExecutors','50').\
        config('spark.dynamicAllocation.executorIdleTimeout','30s').\
        config('spark.driver.maxResultSize', '8g').\
        config('spark.driver.memory', '50g').\
        config('spark.executor.memory', '20g').\
        config('spark.task.maxFailures', '3').\
        config('spark.yarn.am.memory', '8g').\
        config("spark.rpc.message.maxSize", "1024"). \
        config('spark.yarn.max.executor.failures', '3').\
        config('spark.kryoserializer.buffer.max','1024m').\
        config('spark.yarn.executor.memoryOverhead', '50g').\
        getOrCreate()
    print(spark.version)
sc = spark.sparkContext
spark_sql = SQLContext(sc)


2.3.0.cloudera3


### Test Connection

In [5]:
def load_dataset(spark, path, name):
    spark.read.parquet(path).registerTempTable(name)    
    
mag = load_dataset(spark, '/user/lliang06/icon/MAG_publication_features.parquet', 'mag')
try:
    load_dataset(spark, '/user/jjian03/WebResourceQuality.parquet', 'WebResourceQuality')
except:
    pass

spark_sql.sql('''
    SELECT *
    FROM mag
''').limit(10).toPandas()

Unnamed: 0,PaperId,total_num_of_paper_citing,total_num_of_author_citing,total_num_of_affiliation_citing,total_num_of_journal_citing,total_num_of_author_self_citation,total_num_of_affiliation_self_citation,total_num_of_journal_self_citation,avg_year,min_year,...,median,num_of_author,num_of_author_citing,num_of_affiliation_citing,num_of_journal_citing,avg_hindex,first_author_hindex,last_author_hindex,avg_mid_author_hindex,paper_unique_affiliation
0,4364,,,,,,,,,,...,,,,,,,,,,
1,12793,,,,,,,,,,...,,,,,,1.0,,,0.0,
2,12793,,,,,,,,,,...,,,,,,1.0,,,0.0,
3,21527,,,,,,,,,,...,,,,,,4.0,2.0,2.0,,
4,21527,,,,,,,,,,...,,,,,,4.0,2.0,2.0,,
5,21757,,,,,,,,,,...,,,,,,3.8,14.0,14.0,1.0,
6,21757,,,,,,,,,,...,,,,,,3.8,14.0,14.0,1.0,
7,23267,,,,,,,,,,...,,,,,,1.666667,2.0,2.0,0.5,
8,23267,,,,,,,,,,...,,,,,,1.666667,2.0,2.0,0.5,
9,23758,,,,,,,,,,...,,,,,,,,,,


## Insert& Append to the Parquet File

In [6]:
from pyspark.sql.types import *
from pyspark.sql import Row


schema = StructType([
        StructField("id", StringType(), True),

        # Destination
        StructField("url", StringType(), True),
        StructField("actual_scrape_url", StringType(), True),

        # Chronological Data
        StructField("first_appear", DoubleType(), True),
        StructField("first_available_timestamp", DoubleType(), True),
        StructField("last_available_timestamp", DoubleType(), True),

        # Content
        StructField("header", StringType(), True),
        StructField("html_text", StringType(), True),
        StructField("comment", StringType(), True),

        # Status
        StructField("from_waybackmachine", IntegerType(), True),
        StructField("http_status_code", DoubleType(), True),

        # Trace
        StructField("original_check_failure", StringType(), True),
        StructField("original_check_error_log", StringType(), True),
        StructField("terminate_reason", StringType(), True),
        StructField("terminate_reason_error_log", StringType(), True),

        # Label
        StructField("label", IntegerType(), True),
])
schema_id = StructType([StructField("id", StringType(), True)])


In [None]:
buffer = list()
cnt = 0
for item in load_json(buffer, json_path):
    # Insert
    try: 
        item.loc[:, 'from_waybackmachine'] = item.from_waybackmachine.astype(int)
    except ValueError as e:
        item = item[item.from_waybackmachine.isna().apply(lambda x: not x)]
        item.loc[:, 'from_waybackmachine'] = item.from_waybackmachine.astype(int)
    try: 
        item.loc[:, 'label'] = item.label.astype(int)
    except ValueError as e:
        item = item[item.label.isna().apply(lambda x: not x)]
        item.loc[:, 'label'] = item.label.astype(int)

    if len(item) == 0:
        continue
    df_spark = spark_sql.createDataFrame(item, schema)
    cnt+=len(item)
    print(f'{Formatter.get_timestamp()} - {cnt}')
    try:
        df_spark.write.mode('append').parquet('/user/jjian03/WebResourceQuality.parquet')
    except Exception as e:
        print(f'Error: {str(e)}')
        for idx, row in item.iterrows():
            df_spark = spark_sql.createDataFrame(Row(row.tolist()), schema)
            df_spark.write.mode('append').parquet('/user/jjian03/WebResourceQuality.parquet')

2020/06/16 12:42:59 - 4999
2020/06/16 12:45:09 - 9998
2020/06/16 12:47:22 - 14997
2020/06/16 12:49:46 - 19996
2020/06/16 12:51:56 - 24995
2020/06/16 12:54:25 - 29994
2020/06/16 12:56:48 - 34993
2020/06/16 12:59:39 - 39992
2020/06/16 13:02:12 - 44991
2020/06/16 13:04:32 - 49990
2020/06/16 13:07:20 - 54989
2020/06/16 13:09:29 - 59988
2020/06/16 13:12:32 - 64987
2020/06/16 13:14:35 - 69986


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2020/06/16 13:16:35 - 74984
2020/06/16 13:18:39 - 79983
2020/06/16 13:20:46 - 84982
2020/06/16 13:23:17 - 89981
2020/06/16 13:25:24 - 94980
2020/06/16 13:27:32 - 99979
2020/06/16 13:29:53 - 104978
2020/06/16 13:32:04 - 109977
2020/06/16 13:34:18 - 114976
2020/06/16 13:36:27 - 119975
2020/06/16 13:38:58 - 124974
2020/06/16 13:41:10 - 129973
2020/06/16 13:43:25 - 134972
2020/06/16 13:47:28 - 139971
2020/06/16 13:49:48 - 144970
2020/06/16 13:52:00 - 149969
2020/06/16 13:54:18 - 154968
2020/06/16 13:56:26 - 159967
2020/06/16 13:59:06 - 164966
2020/06/16 14:02:09 - 169965
2020/06/16 14:04:12 - 174964
2020/06/16 14:06:19 - 179963
2020/06/16 14:08:33 - 184962
2020/06/16 14:11:32 - 189961
2020/06/16 14:15:49 - 194960
2020/06/16 14:18:11 - 199959
2020/06/16 14:20:18 - 204958
2020/06/16 14:23:06 - 209957
2020/06/16 14:25:16 - 214956
2020/06/16 14:27:26 - 219955
2020/06/16 14:29:39 - 224954
2020/06/16 14:33:17 - 229953
2020/06/16 14:36:19 - 234952
2020/06/16 14:38:57 - 239951
2020/06/16 14:41:46 

In [None]:
return
from concurrent.futures.thread import ThreadPoolExecutor


buffer = list()

def task_bulider(item):
    df = item
    def _task():
        try:
            print(f'Task inner - {df}')
        except Exception as e:
            print(e)
        try: 
            df.loc[:, 'from_waybackmachine'] = df.from_waybackmachine.astype(int)
        except ValueError as e:
            df = df[df.from_waybackmachine.isna().apply(lambda x: not x)]
            df.loc[:, 'from_waybackmachine'] = df.from_waybackmachine.astype(int)
        try: 
            df.loc[:, 'label'] = df.label.astype(int)
        except ValueError as e:
            df = df[df.label.isna().apply(lambda x: not x)]
            df.loc[:, 'label'] = df.label.astype(int)

        if len(df) == 0:
            return
        df_spark = spark_sql.createDataFrame(df, schema)

        try:
            df_spark.write.mode('append').parquet('/user/jjian03/WebResourceQuality.parquet')
        except Exception as e:
            print(f'Error: {str(e)}')
            for idx, row in df.iterrows():
                df_spark = spark_sql.createDataFrame(Row(row.tolist()), schema)
                df_spark.write.mode('append').parquet('/user/jjian03/WebResourceQuality.parquet')
    return _task

with ThreadPoolExecutor(max_workers=8) as executor:
    for item in load_json(buffer, json_path):
        # Insert
        df = item.copy()
        task = task_bulider(df)
        executor.submit(task)


In [None]:
spark_sql.sql(f'''
        SELECT id
        FROM WebResourceQuality
        WHERE id IN ({ids})
    ''').limit(10).toPandas()

In [None]:
WebResourceQuality = load_dataset(spark, '/user/jjian03/WebResourceQuality.parquet', 'WebResourceQuality')

spark_sql.sql('''
    SELECT *
    FROM WebResourceQuality
''').limit(10).toPandas()

In [None]:
load_dataset(spark, '/datasets/MAG_20200403/MAG_Azure_Parquet/mag_parquet/Papers.parquet', 'Papers')

spark_sql.sql('''
    SELECT *
    FROM Papers
''').limit(10).toPandas()