In [1]:
import random
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as funcs
from pyspark.sql import SQLContext
from pyspark import Row
import os
from pyspark.sql.types import StringType, StructType, StructField, LongType, IntegerType, MapType, ArrayType

# Tell PySpark which Spark installation to launch when the context is created.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark-client"

# Spark-on-YARN settings for this notebook session.
conf = (
    SparkConf()
    .setMaster("yarn-client")
    .setAppName("Max' Data Checker")
    # NOTE(review): "spark.deploy-mode" is not a documented Spark property (the
    # documented key is "spark.submit.deployMode"), and the "yarn-client" master
    # above already selects client mode. Left untouched so this edit does not
    # change how the driver is launched -- TODO confirm and remove.
    .set("spark.deploy-mode", "cluster")
    # Key was misspelled "spark.executor.mempory"; SparkConf accepts arbitrary
    # keys, so the intended 20g setting was never applied to the executors.
    .set("spark.executor.memory", "20g")
    # Off-heap overhead per executor, in MiB when no unit suffix is given.
    .set("spark.yarn.executor.memoryOverhead", "16000")
    .set("spark.executor.cores", "2")
    .set("spark.executor.instances", "3")
)

# Entry points used by the rest of the notebook.
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

In [2]:
# Read the Parquet dataset at this path into a DataFrame.
data = sqlc.read.format('parquet').load('/tdg/2017/09/12/paths_krauthausen')

In [3]:
# Uncomment to inspect the input schema:
# data.printSchema()

def _first_of_each(trajectory):
    """Return a list holding element ``[0]`` of every entry in ``trajectory``.

    NOTE(review): presumably the first field of each trajectory entry is a
    node id -- TODO confirm against the 'Trajectory' schema.
    """
    return [entry[0] for entry in trajectory]


# Declared as array<long>; stored in a new 'nodes' column next to 'Trajectory'.
nodes_udf = funcs.udf(_first_of_each, ArrayType(LongType()))
data = data.withColumn('nodes', nodes_udf(data['Trajectory']))

# Node ids to look for in each row's node sequence. Despite the name this is a
# set, which is what the intersection in `intersect` relies on.
# The original literal listed 287037232 twice; a set collapses duplicates, so
# the repeated entry is dropped here without changing the set's contents.
# NOTE(review): possibly a different id was intended for the second
# occurrence -- TODO confirm.
node_list = {
    124058897, 124059039, 160331013, 160331076, 1621251869, 203900652, 2138017204, 254286933,
    270561097, 270561110, 287036644, 287036660, 287037232, 287037249, 3261906723,
    4060138780, 4755922518, 4755922520, 4755922622, 485951, 573644828, 95732362, 95732395, 95732400
}

def intersect(column):
    """Return one id from ``node_list`` that occurs in ``column``, or 0 if none.

    Args:
        column: iterable of hashable node ids, or ``None`` (e.g. a null array
            cell handed to the UDF).

    Returns:
        A member of the overlap between ``column`` and ``node_list``. When
        several ids match, which one is returned is arbitrary (``set.pop``
        gives no ordering guarantee). ``0`` is the "no match" sentinel and is
        also returned for a ``None`` input.
    """
    if column is None:
        # set(None) raises TypeError; treat a missing sequence as "no match".
        return 0
    intersec = set(column).intersection(node_list)
    if intersec:
        return intersec.pop()
    return 0

# Expose `intersect` as a UDF returning a long, then store its result for each
# row's 'nodes' array in a new 'junction' column.
visit_udf = funcs.udf(intersect, returnType=LongType())
data = data.withColumn('junction', visit_udf(data['nodes']))

In [4]:
# Row count per 'junction' value; print up to 30 rows without truncating cells.
data.groupBy('junction').count().show(n=30, truncate=False)

+----------+-------+
|junction  |count  |
+----------+-------+
|287037232 |17130  |
|287036644 |855    |
|287037249 |3343   |
|203900652 |1      |
|1621251869|2      |
|160331076 |84     |
|124058897 |1881   |
|270561110 |37     |
|485951    |26     |
|95732362  |77     |
|0         |1508817|
|95732400  |6178   |
|160331013 |12     |
|3261906723|18641  |
+----------+-------+



In [None]:
# Node id lists, one per junction.
# NOTE(review): the names suggest junctions 38, 39 and 40a of the A4 -- TODO
# confirm. Nothing else in the visible notebook reads these lists.
a4_38 = [
    1683859870, 1683859885, 1683859934, 1683859935,
    17189158, 17189164, 2001345877, 2692662933,
    32465907, 3259760022, 3868237655, 424137325,
    4258060190, 484850, 484852, 484857,
]
a4_39 = [
    302786121, 310275243, 550276846, 601661588,
]
a4_40a = [
    308852506, 308852512, 3260363980, 3260363984,
    425458183, 425458298, 425458369,
]

In [5]:
# Read the Parquet result set at this path into a DataFrame.
via_df = sqlc.read.format('parquet').load(
    '/tdg/2018/custom_aggregations/odm_result/custom_odm_result_application_1534866846680_0064'
)

In [7]:
# Print up to 60 rows without truncating cell contents.
via_df.show(n=60, truncate=False)

+---------+-------+
|location |count  |
+---------+-------+
|entry_32 |16145.0|
|entry_33 |6980.0 |
|entry_34 |12825.0|
|entry_35 |4805.0 |
|entry_36 |14660.0|
|entry_56a|43415.0|
|entry_37 |4160.0 |
|entry_38 |7837.5 |
|entry_56b|7300.0 |
|entry_39 |11027.5|
|exit_56a |41087.5|
|exit_56b |8915.0 |
|exit_32  |19092.5|
|exit_33  |9860.0 |
|exit_34  |8860.0 |
|exit_35  |3837.5 |
|exit_36  |18860.0|
|entry_42 |12870.0|
|exit_37  |6242.5 |
|entry_43 |3217.5 |
|exit_38  |8867.5 |
|entry_44 |7602.5 |
|exit_39  |9505.0 |
|entry_45 |15932.5|
|entry_46 |3897.5 |
|entry_48 |5962.5 |
|entry_49 |12555.0|
|exit_42  |14585.0|
|entry_40a|8230.0 |
|exit_43  |3542.5 |
|exit_44  |4612.5 |
|entry_40b|5157.5 |
|exit_45  |21585.0|
|entry_50 |10025.0|
|entry_51 |8910.0 |
|exit_46  |4887.5 |
|entry_52 |4820.0 |
|entry_53 |9397.5 |
|exit_48  |6910.0 |
|entry_54 |17277.5|
|exit_40a |4045.0 |
|exit_49  |10687.5|
|entry_55 |11995.0|
|exit_40b |3732.5 |
|entry_58a|8635.0 |
|entry_57 |5725.0 |
|entry_58b|6357.5 |
