In [1]:
import os
import random

import pyspark
import pyspark.sql.types as T
from pyspark import Row
from pyspark import SparkContext, SparkConf
from pyspark import accumulators
from pyspark.sql import SQLContext
from pyspark.sql import functions as funcs
from pyspark.sql.types import StringType, StructType, StructField, LongType, IntegerType, MapType, FloatType

# Point PySpark at the cluster's Spark client installation.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark-client"

# Resource settings for this notebook's Spark application.
# NOTE(review): in client mode the driver JVM may already be running by the
# time this conf is read, in which case spark.driver.memory set here has no
# effect -- confirm against the Spark version deployed on the cluster.
conf = (
    SparkConf()
    .setMaster('yarn-client')
    .setAppName('Max Data Checker')
    .set("spark.executor.memory", "15g")
    .set("spark.driver.memory", "40g")
    .set("spark.driver.maxResultSize", "40g")
    .set("spark.executor.cores", "2")
    .set("spark.executor.instances", "5")
)

# Stop a context left over from a previous run of this cell (or injected by
# the kernel). On a fresh kernel `sc` does not exist yet, which is the only
# error we expect and tolerate here.
try:
    sc.stop()
except NameError:
    pass

# Always build a fresh context and SQLContext. Creating them only in the
# `except` branch left `sc` stopped and `sqlContext` missing or stale whenever
# the cell was re-run.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [None]:
# Location of the Parquet dataset that every later cell analyses as `df`.
odm_result_path = '/tdg/2018/custom_aggregations/odm_result/custom_odm_result_application_1534866846680_0078'
df = sqlContext.read.parquet(odm_result_path)

In [None]:
def _station_id(label):
    """Return the part of a location label after its first underscore.

    For example 'entry_40a' -> '40a' and 'exit_5' -> '5'. A label without an
    underscore is returned unchanged.
    NOTE(review): assumes labels look like '<kind>_<id>' -- confirm against
    the full set of location labels.
    """
    return label.split('_', 1)[-1]


def _same_station(locs):
    """Tell whether a path starts and ends at the same station id.

    Returns None (SQL NULL) when `locs` is missing or empty. Comparing the
    whole id instead of the last three characters avoids false negatives for
    short ids ('entry_5' vs 'exit_5') and false positives for long ones
    ('entry_140a' vs 'exit_40a').
    """
    if not locs:
        return None
    return _station_id(locs[0]) == _station_id(locs[-1])


def _entry_then_exit(locs):
    """Tell whether a path starts at an 'en...' label and ends at an 'ex...' label.

    Returns None (SQL NULL) when `locs` is missing or empty.
    """
    if not locs:
        return None
    return locs[0][:2] == 'en' and locs[-1][:2] == 'ex'


# `locs`  : the ';'-separated `location` string as an array (built-in split,
#           which yields NULL for a NULL location instead of failing in a UDF).
# `OD`    : first and last location share a station id.
# `en_ex` : path goes from an entry label to an exit label.
# `len`   : number of locations on the path.
df = (
    df
    .withColumn('locs', funcs.split(df['location'], ';'))
    .withColumn('OD', funcs.udf(_same_station, T.BooleanType())(funcs.col('locs')))
    .withColumn('en_ex', funcs.udf(_entry_then_exit, T.BooleanType())(funcs.col('locs')))
    .withColumn('len', funcs.size(funcs.col('locs')))
)

In [None]:
# Sum the numeric columns separately for each value of the `OD` flag.
print('All round trips: ')
totals_by_od = df.groupby('OD').sum()
totals_by_od.show()

In [None]:
# Preview the frame including the derived columns (20 rows is show()'s default).
df.show(n=20)

In [50]:
# Sum the numeric columns separately for each value of the `en_ex` flag.
print('Entry->exit trips:')
totals_by_en_ex = df.groupby('en_ex').sum()
totals_by_en_ex.show()

Entry->exit trips:
+-----+----------+--------+
|en_ex|sum(count)|sum(len)|
+-----+----------+--------+
| true|   95617.5|   49750|
|false|  140940.0|   78275|
+-----+----------+--------+



In [51]:
# Group on the combined flag: `OD` is true AND the path runs entry -> exit.
print('Without exit->entry round trips: ')
od_and_en_ex = df['OD'] & df['en_ex']
df.groupby(od_and_en_ex).sum().show()

Without exit->entry round trips: 
+-------------+----------+--------+
|(OD && en_ex)|sum(count)|sum(len)|
+-------------+----------+--------+
|         true|   10240.0|    4013|
|        false|  226317.5|  124012|
+-------------+----------+--------+



In [52]:
# Totals split by path length: fewer than two locations, exactly two, more than two.
path_len = df['len']

shorter_than_two = df.where(path_len < 2)
shorter_than_two.groupby('len').sum().show()

exactly_two = df.where(path_len == 2)
exactly_two.groupby('len').sum().show()

# For the longer paths only the overall summed `count` is displayed.
longer_than_two = df.where(path_len > 2)
longer_than_two.groupby().sum().select('sum(count)').show()

+---+----------+--------+
|len|sum(count)|sum(len)|
+---+----------+--------+
|  1|   61530.0|      76|
+---+----------+--------+

+---+----------+--------+
|len|sum(count)|sum(len)|
+---+----------+--------+
|  2|   71850.0|    2704|
+---+----------+--------+

+----------+
|sum(count)|
+----------+
|  103177.5|
+----------+



In [53]:
# Load the second Parquet dataset and print how many rows it holds.
paths_location = '/tdg/2017/09/12/paths_krauthausen'
paths = sqlContext.read.parquet(paths_location)
print(paths.count())

1557084


In [54]:
# Sum the numeric columns per distinct `location` path string.
totals_by_location = df.groupby('location').sum()
totals_by_location.show()

+--------------------+----------+--------+
|            location|sum(count)|sum(len)|
+--------------------+----------+--------+
|exit_36;entry_36;...|       5.0|       4|
|entry_59;exit_61;...|       2.5|       4|
|exit_34;entry_34;...|       2.5|       5|
|entry_62;exit_63;...|       2.5|       7|
|exit_36;entry_40a...|       2.5|       9|
|exit_41a;entry_40...|       2.5|       6|
|  entry_58b;exit_58a|     150.0|       2|
|entry_49;exit_56a...|      25.0|       3|
|entry_34;exit_36;...|       2.5|       8|
|entry_62;exit_58b...|      22.5|       4|
|entry_56a;exit_51...|       2.5|       8|
|exit_36;entry_36;...|       5.0|       6|
|entry_45;exit_32;...|       2.5|       4|
|exit_33;entry_33;...|       2.5|       7|
|exit_36;entry_36;...|      10.0|       5|
|entry_56a;exit_42...|       7.5|       5|
|exit_62;entry_62;...|       2.5|       5|
|   entry_41a;exit_61|       5.0|       2|
|exit_33;entry_33;...|       2.5|      12|
|entry_56b;exit_51...|       2.5|       7|
+----------

In [55]:
# Group on: `OD` is false AND the path runs entry -> exit.
# `~` binds tighter than `&`, so this is (NOT OD) AND en_ex.
not_od_and_en_ex = ~df['OD'] & df['en_ex']
df.groupby(not_od_and_en_ex).sum().show()

+-----------------+----------+--------+
|(NOT OD && en_ex)|sum(count)|sum(len)|
+-----------------+----------+--------+
|             true|   85377.5|   45737|
|            false|  151180.0|   82288|
+-----------------+----------+--------+



In [56]:
# Group on: (NOT OD) AND en_ex AND the path has exactly two locations.
not_od_and_en_ex = ~df['OD'] & df['en_ex']
two_location_paths = not_od_and_en_ex & (df['len'] == 2)
df.groupby(two_location_paths).sum().show()

+--------------------------------+----------+--------+
|((NOT OD && en_ex) && (len = 2))|sum(count)|sum(len)|
+--------------------------------+----------+--------+
|                            true|   53265.0|    1936|
|                           false|  183292.5|  126089|
+--------------------------------+----------+--------+

