In [1]:
import findspark
findspark.init('/usr/hdp/current/spark2-client')

import pyspark
from pyspark.sql.functions import lit, col, instr, expr, pow, round, bround, corr, count, mean, stddev_pop, min, max
from pyspark.sql.functions import monotonically_increasing_id, initcap, lower, upper, ltrim, rtrim, rpad, lpad, trim
from pyspark.sql.functions import regexp_replace, translate, regexp_extract, current_date, current_timestamp, struct
from pyspark.sql.functions import date_add, date_sub, datediff, months_between, to_date, to_timestamp, coalesce, split, size
from pyspark.sql.functions import array_contains, explode, udf
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when

from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType, LongType

from datetime import datetime,timedelta

In [2]:
def get_Spark():
    """Build (or reuse) the Hive-enabled SparkSession used by this notebook.

    The session targets the "yarn" master in client deploy mode under the
    application name "table_generation".

    Returns:
        The SparkSession handed back by ``getOrCreate()``.
    """
    settings = [
        ('spark.submit.deployMode', 'client'),  # run the driver on the submitting machine
        ('spark.executor.memory', '8g'),        # memory allocated to each executor
        ('spark.executor.cores', '3'),          # cores per executor
        ('spark.executor.instances', '10'),     # total number of executors
        # Size of the YARN application master in client mode; this is not the
        # driver's memory (that is governed by spark.driver.memory).
        ('spark.yarn.am.memory', '10g'),
    ]
    conf = pyspark.SparkConf().setAll(settings)

    builder = SparkSession.builder.master("yarn").appName("table_generation")
    builder = builder.enableHiveSupport().config(conf=conf)
    return builder.getOrCreate()

# Create the session once and derive the handles used by the rest of the notebook.
spark = get_Spark()
spark_context = spark.sparkContext
# NOTE(review): HiveContext is deprecated since Spark 2.0; the Hive-enabled
# `spark` session above exposes .sql() as well. Kept because later cells pass `hc`.
hc = HiveContext(spark_context)

In [3]:
def hive2spark(hc, query):
    """Run a SQL query and return the result as a Spark DataFrame.

    The query text is forwarded unchanged. The previous
    ``"{}".format(query)`` wrapper added nothing for string input and, on a
    Python 2 kernel, would raise UnicodeEncodeError for unicode queries that
    contain non-ASCII characters.

    Args:
        hc: object exposing ``sql(query)`` (the notebook passes its HiveContext).
        query: SQL text to execute.

    Returns:
        Whatever ``hc.sql(query)`` returns (a DataFrame for a HiveContext).
    """
    return hc.sql(query)

In [4]:
# Source query: every column of the Hive table guobiao_tsp_tbls.charging.
query = """select * from guobiao_tsp_tbls.charging"""  

In [5]:
# df1: the charging table as returned by hive2spark for `query`.
df1 = hive2spark(hc, query)

In [6]:
# Inspect a single Row of df1 to see the available columns and their Python types.
df1.head()

Row(ah_throughput=Decimal('24.0'), battery_temp_diff_mean=Decimal('1.1'), battery_temp_diff_p1=Decimal('1.0'), battery_temp_diff_p99=Decimal('2.0'), battery_temp_diff_std=Decimal('0.3'), battery_temp_max=32, battery_temp_mean=Decimal('30.8'), battery_temp_min=30, cell_volt_diff_mean=Decimal('0.0192'), cell_volt_diff_p1=Decimal('0.0180'), cell_volt_diff_p99=Decimal('0.0210'), cell_volt_diff_std=Decimal('0.0008'), cell_volt_max=Decimal('4.071'), cell_volt_mean=Decimal('3.9708'), cell_volt_min=Decimal('3.862'), count_records=590, delta_soc=19, duration=Decimal('1.639'), end_soc=87, end_time=datetime.datetime(2019, 7, 18, 23, 59, 55), end_ts=1563465595000, kwh_throughput=Decimal('9.5'), mean_latitude=Decimal('28.8059'), mean_longitude=Decimal('113.0365'), normalized_ah_throughput=Decimal('126.4'), normalized_duration=Decimal('8.626'), normalized_kwh_throughput=Decimal('49.9'), start_soc=68, start_time=datetime.datetime(2019, 7, 18, 22, 21, 45), start_ts=1563459705000, veh_curr_mean=Decimal

In [7]:
# Print the first five rows of df1 as a text table.
df1.show(5)

+-------------+----------------------+--------------------+---------------------+---------------------+----------------+-----------------+----------------+-------------------+-----------------+------------------+------------------+-------------+--------------+-------------+-------------+---------+--------+-------+-------------------+-------------+--------------+-------------+--------------+------------------------+-------------------+-------------------------+---------+-------------------+-------------+-------------+-----------+------------+------------+-----------------+----------+
|ah_throughput|battery_temp_diff_mean|battery_temp_diff_p1|battery_temp_diff_p99|battery_temp_diff_std|battery_temp_max|battery_temp_mean|battery_temp_min|cell_volt_diff_mean|cell_volt_diff_p1|cell_volt_diff_p99|cell_volt_diff_std|cell_volt_max|cell_volt_mean|cell_volt_min|count_records|delta_soc|duration|end_soc|           end_time|       end_ts|kwh_throughput|mean_latitude|mean_longitude|normalized_ah_thr

In [8]:
# Narrow the charging table to the columns used for the monthly statistics.
df2 = df1.select(
    "day",
    "vin",
    "start_time",
    "end_time",
    "duration",
)

In [9]:
from pyspark.sql import functions as F

In [10]:
# Bucket each charging session into its calendar month: F.trunc(..., "month")
# maps `day` to the first day of that month, and `day` itself is not kept.
#
# Built straight from df1 (same columns the earlier df2 selection keeps) instead
# of mutating df2 in place, so the cell can be re-run: the in-place version
# failed on a second run because `day` had already been dropped from df2.
df2 = df1.select(
    "vin",
    "start_time",
    "end_time",
    "duration",
    F.trunc("day", "month").alias("month"),
)

In [11]:
# Check that `month` holds first-of-month dates and that `day` is gone.
df2.show(5)

+-----------------+-------------------+-------------------+--------+----------+
|              vin|         start_time|           end_time|duration|     month|
+-----------------+-------------------+-------------------+--------+----------+
|LMWHP1S56J1000480|2019-07-18 22:21:45|2019-07-18 23:59:55|   1.639|2019-07-01|
|LMWHP1S26K1009249|2019-07-18 13:13:05|2019-07-18 13:46:25|   0.558|2019-07-01|
|LMWHP1S26K1009249|2019-07-18 05:10:45|2019-07-18 06:21:45|   1.186|2019-07-01|
|LMGHP1S59J1003417|2019-07-17 18:25:25|2019-07-18 05:20:25|  10.919|2019-07-01|
|LMWHP1S24K1006463|2019-07-18 13:41:15|2019-07-18 15:03:15|   1.369|2019-07-01|
+-----------------+-------------------+-------------------+--------+----------+
only showing top 5 rows



In [12]:
# Sum of charging `duration` for every (month, vin) pair.
df3 = (
    df2
    .groupBy("month", "vin")
    .agg(F.sum("duration").alias("total_duration_per_month"))
)
df3.show(5)

+----------+-----------------+------------------------+
|     month|              vin|total_duration_per_month|
+----------+-----------------+------------------------+
|2019-07-01|LMWHP1S29K1009617|                  37.472|
|2019-07-01|LMGHP1S50J1002690|                  22.642|
|2019-07-01|LMGAJ1S24H1000945|                  27.234|
|2019-07-01|LMWHP1S86K1006811|                 147.487|
|2019-07-01|LMWHP1S80K1006142|                  41.394|
+----------+-----------------+------------------------+
only showing top 5 rows



In [13]:
# Look at one aggregated Row. No ordering is applied to df3, so this row need
# not match the first row printed by show().
df3.head()

Row(month=datetime.date(2019, 5, 1), vin=u'LMWHP1S80J1002705', total_duration_per_month=Decimal('105.386'))

In [14]:
# Number of charging rows for every (month, vin) pair; the result column is named "count".
df4 = (
    df2
    .groupBy("month", "vin")
    .count()
)
df4.show(5)

+----------+-----------------+-----+
|     month|              vin|count|
+----------+-----------------+-----+
|2019-04-01|LMGFJ1S50H1000764|   44|
|2019-04-01|LMWHP1S81J1003913|   27|
|2019-04-01|LMGAJ1S83J1011736|   28|
|2019-04-01|LMWHP1S29J1000107|   73|
|2019-04-01|LMWHP1S8XJ1001366|   14|
+----------+-----------------+-----+
only showing top 5 rows



In [15]:
# Look at one Row of the per-month charging counts.
df4.head()

Row(month=datetime.date(2019, 6, 1), vin=u'LMWHP1S89J1001424', count=4)

In [16]:
# Put the monthly charging total and the monthly charging count side by side.
#
# df3 and df4 are both aggregated from df2, so a condition written as
# `df3.vin == df4.vin` compares two references to the same underlying column
# and only behaves as intended thanks to Spark's self-join ambiguity handling.
# Joining on the key names states the intent directly and yields a single
# `vin` / `month` pair in the output.
df = (
    df3.join(df4, on=["vin", "month"], how="inner")
    .select("vin", "month", "total_duration_per_month", "count")
    .withColumnRenamed("count", "charging_count")
)

In [17]:
# Preview the joined monthly charging totals and counts.
df.show(5)

+-----------------+----------+------------------------+--------------+
|              vin|     month|total_duration_per_month|charging_count|
+-----------------+----------+------------------------+--------------+
|LMGAJ1S20H1000487|2017-10-01|                   6.397|             2|
|LMGAJ1S20H1000604|2018-03-01|                  11.930|             7|
|LMGAJ1S20H1001140|2018-07-01|                   0.825|             1|
|LMGAJ1S20H1001493|2019-04-01|                  11.481|             9|
|LMGAJ1S20J1002438|2019-02-01|                  65.267|            32|
+-----------------+----------+------------------------+--------------+
only showing top 5 rows



In [18]:
# Mean charging duration per session = monthly total / number of sessions.
df = df.withColumn(
    "average_charging_duration_per_month",
    df["total_duration_per_month"] / df["charging_count"],
)

In [19]:
# Preview the new average column (unrounded at this point).
df.show(5)

+-----------------+----------+------------------------+--------------+-----------------------------------+
|              vin|     month|total_duration_per_month|charging_count|average_charging_duration_per_month|
+-----------------+----------+------------------------+--------------+-----------------------------------+
|LMGAJ1S20H1000487|2017-10-01|                   6.397|             2|               3.198500000000000...|
|LMGAJ1S20H1000604|2018-03-01|                  11.930|             7|               1.704285714285714...|
|LMGAJ1S20H1001140|2018-07-01|                   0.825|             1|               0.825000000000000...|
|LMGAJ1S20H1001493|2019-04-01|                  11.481|             9|               1.275666666666666...|
|LMGAJ1S20J1002438|2019-02-01|                  65.267|            32|               2.039593750000000...|
+-----------------+----------+------------------------+--------------+-----------------------------------+
only showing top 5 rows



In [20]:
# Source query: every column of the Hive table ubi.guobiao_trip_complete.
query2 = """select * from ubi.guobiao_trip_complete"""  
# df5: the trip table as returned by hive2spark for `query2`.
df5 = hive2spark(hc, query2)

In [21]:
# Inspect a single trip Row to see the available columns and their Python types.
df5.head()

Row(vin=u'LMGAJ1S20H1000005', start_loc_lat=31.0247, start_loc_lon=121.699626, start_time=datetime.datetime(2019, 4, 14, 8, 57, 45), start_day=u'20190414', end_loc_lat=31.024209, end_loc_lon=121.699399, end_time=datetime.datetime(2019, 4, 14, 8, 58, 55), distance=1.0, duration=1.1666666666666667)

In [22]:
# Parse the yyyyMMdd string in `start_day` into a date column named `day`
# and keep only the per-trip fields used below.
# NOTE: this overwrites df5, so the cell cannot be re-run without reloading df5.
df5 = df5.select(
    F.to_date("start_day", "yyyyMMdd").alias("day"),
    "vin",
    "distance",
    "duration",
)

In [23]:
# Check that `day` was parsed into proper dates.
df5.show(2)

+----------+-----------------+--------+------------------+
|       day|              vin|distance|          duration|
+----------+-----------------+--------+------------------+
|2019-04-14|LMGAJ1S20H1000005|     1.0|1.1666666666666667|
|2019-04-16|LMGAJ1S20H1000148|     1.0| 4.666666666666667|
+----------+-----------------+--------+------------------+
only showing top 2 rows



In [24]:
# Bucket each trip into its calendar month (first day of the month) and discard `day`.
# NOTE: df5 is updated in place, so a second run of this cell fails once `day` is gone.
df5 = df5.withColumn("month", F.trunc("day", "month")).drop("day")

In [25]:
# Check that `month` holds first-of-month dates and that `day` is gone.
df5.show(2)

+-----------------+--------+------------------+----------+
|              vin|distance|          duration|     month|
+-----------------+--------+------------------+----------+
|LMGAJ1S20H1000005|     1.0|1.1666666666666667|2019-04-01|
|LMGAJ1S20H1000148|     1.0| 4.666666666666667|2019-04-01|
+-----------------+--------+------------------+----------+
only showing top 2 rows



In [26]:
# Sum of trip `duration` for every (month, vin) pair.
df6 = (
    df5
    .groupBy("month", "vin")
    .agg(F.sum("duration").alias("total_trip_duration_per_month"))
)
df6.show(5)

+----------+-----------------+-----------------------------+
|     month|              vin|total_trip_duration_per_month|
+----------+-----------------+-----------------------------+
|2018-12-01|LMGAJ1S24J1007013|           1792.9999999999998|
|2018-12-01|LMGAJ1S27J1004378|           2470.8333333333335|
|2018-12-01|LMGAJ1S28J1002557|            3779.166666666667|
|2018-12-01|LMGAJ1S29J1002888|            4914.166666666666|
|2018-12-01|LMGAJ1S88J1003101|           1884.6666666666663|
+----------+-----------------+-----------------------------+
only showing top 5 rows



In [27]:
# Number of trip rows for every (month, vin) pair; the result column is named "count".
df7 = (
    df5
    .groupBy("month", "vin")
    .count()
)
df7.show(5)

+----------+-----------------+-----+
|     month|              vin|count|
+----------+-----------------+-----+
|2019-04-01|LMWHP1S29J1000107|  185|
|2019-04-01|LMWHP1S51J1004744|   46|
|2019-04-01|LMWHP1S52K1007458|   81|
|2019-04-01|LMWHP1S55J1000566|   49|
|2019-04-01|LMWHP1S81J1001210|   69|
+----------+-----------------+-----+
only showing top 5 rows



In [28]:
# Put the monthly trip total and the monthly trip count side by side.
#
# df6 and df7 are both aggregated from df5, so a condition written as
# `df6.vin == df7.vin` compares two references to the same underlying column
# and only behaves as intended thanks to Spark's self-join ambiguity handling.
# Joining on the key names states the intent directly and yields a single
# `vin` / `month` pair in the output.
df_trip = (
    df6.join(df7, on=["vin", "month"], how="inner")
    .select("vin", "month", "total_trip_duration_per_month", "count")
)
df_trip.show(5)

+-----------------+----------+-----------------------------+-----+
|              vin|     month|total_trip_duration_per_month|count|
+-----------------+----------+-----------------------------+-----+
|LMGAJ1S20H1000487|2017-10-01|                       1336.0|   28|
|LMGAJ1S20H1000487|2019-03-01|           1020.3333333333335|   16|
|LMGAJ1S20H1000604|2018-03-01|            835.8333333333333|   15|
|LMGAJ1S20H1001140|2018-07-01|                       2463.0|   62|
|LMGAJ1S20H1001493|2019-04-01|            2187.166666666667|   44|
+-----------------+----------+-----------------------------+-----+
only showing top 5 rows



In [29]:
# Give the generic "count" column a name that survives the later join with the charging stats.
df_trip = df_trip.withColumnRenamed("count","trip_count")
df_trip.show(5)

+-----------------+----------+-----------------------------+----------+
|              vin|     month|total_trip_duration_per_month|trip_count|
+-----------------+----------+-----------------------------+----------+
|LMGAJ1S20H1000487|2017-10-01|                       1336.0|        28|
|LMGAJ1S20H1000487|2019-03-01|           1020.3333333333334|        16|
|LMGAJ1S20H1000604|2018-03-01|            835.8333333333334|        15|
|LMGAJ1S20H1001140|2018-07-01|           2463.0000000000005|        62|
|LMGAJ1S20H1001493|2019-04-01|           2187.1666666666665|        44|
+-----------------+----------+-----------------------------+----------+
only showing top 5 rows



In [42]:
# Mean trip duration = monthly total / number of trips.
df_trip = df_trip.withColumn(
    "average_trip_duration_per_month",
    df_trip["total_trip_duration_per_month"] / df_trip["trip_count"],
)

In [43]:
# Combine monthly charging stats (alias "a") with monthly trip stats (alias "b").
# No join type is given, so this is an inner join: only (vin, month) pairs that
# appear on both sides are kept.
mydf = df.alias("a").join(
    df_trip.alias("b"),
    (F.col("a.vin") == F.col("b.vin")) & (F.col("a.month") == F.col("b.month")),
).select(
    "a.vin",
    "a.month",
    "a.average_charging_duration_per_month",
    "a.total_duration_per_month",
    "a.charging_count",
    "b.total_trip_duration_per_month",
    "b.trip_count",
    "b.average_trip_duration_per_month",
)

# The monthly trip total is divided by 60 and reported as hours.
mydf = mydf.withColumn(
    "total_trip_hours_per_month",
    F.col("total_trip_duration_per_month") / 60.0,
)

# Final column set. NOTE(review): the average trip value is cast to int, which
# truncates (1336 / 28 = 47.7 is reported as 47) while the other metrics are
# rounded to 2 decimals — confirm truncation is intended.
mydf = mydf.select(
    "vin",
    "month",
    F.round("average_charging_duration_per_month", 2).alias("average_charging_duration_per_month"),
    "total_duration_per_month",
    "charging_count",
    "trip_count",
    F.col("average_trip_duration_per_month").cast("int").alias("average_trip_minutes_per_month"),
    F.round("total_trip_hours_per_month", 2).alias("total_trip_hours_per_month"),
)
mydf.show(5)

+-----------------+----------+-----------------------------------+------------------------+--------------+----------+------------------------------+--------------------------+
|              vin|     month|average_charging_duration_per_month|total_duration_per_month|charging_count|trip_count|average_trip_minutes_per_month|total_trip_hours_per_month|
+-----------------+----------+-----------------------------------+------------------------+--------------+----------+------------------------------+--------------------------+
|LMGAJ1S20H1000487|2017-10-01|                               3.20|                   6.397|             2|        28|                            47|                     22.27|
|LMGAJ1S20H1000604|2018-03-01|                               1.70|                  11.930|             7|        15|                            55|                     13.93|
|LMGAJ1S20H1001140|2018-07-01|                               0.83|                   0.825|             1|        62|   

In [44]:
# Rename the two charging columns so their names carry the "hours" unit used in the output table.
mydf = (
    mydf
    .withColumnRenamed("total_duration_per_month", "total_charging_hours_per_month")
    .withColumnRenamed("average_charging_duration_per_month", "average_charging_hours_per_month")
)
mydf.show(5)

+-----------------+----------+--------------------------------+------------------------------+--------------+----------+------------------------------+--------------------------+
|              vin|     month|average_charging_hours_per_month|total_charging_hours_per_month|charging_count|trip_count|average_trip_minutes_per_month|total_trip_hours_per_month|
+-----------------+----------+--------------------------------+------------------------------+--------------+----------+------------------------------+--------------------------+
|LMGAJ1S20H1000487|2017-10-01|                            3.20|                         6.397|             2|        28|                            47|                     22.27|
|LMGAJ1S20H1000604|2018-03-01|                            1.70|                        11.930|             7|        15|                            55|                     13.93|
|LMGAJ1S20H1001140|2018-07-01|                            0.83|                         0.825|           

In [None]:
# Turn placeholder values into real NULLs before persisting: when() without
# otherwise() yields NULL whenever its condition is not true, so a value equal to
# "NULL", "NA", "NaN" or "" becomes NULL and every other value passes through.
cols = [when(~col(x).isin("NULL", "NA", "NaN",""), col(x)).alias(x) for x in mydf.columns]
mydf = mydf.select(*cols)

# Expose the frame to SQL under a temporary name. createOrReplaceTempView is the
# supported replacement for registerTempTable, which is deprecated since Spark 2.0.
mydf.createOrReplaceTempView('trip_charging_info')

# NOTE(review): a plain CREATE TABLE ... AS SELECT presumably fails when the
# target table already exists, so re-running this cell likely requires dropping
# guobiao_tsp_tbls.monthly_trip_and_charging_statistics first — confirm.
sql_cmd = """CREATE TABLE guobiao_tsp_tbls.monthly_trip_and_charging_statistics AS SELECT vin,month,
    average_charging_hours_per_month,total_charging_hours_per_month,charging_count,total_trip_hours_per_month,
    trip_count,average_trip_minutes_per_month
    FROM trip_charging_info"""
print(sql_cmd)
hc.sql(sql_cmd)
print('Table monthly_trip_and_charging_statistics creation done.')

CREATE TABLE guobiao_tsp_tbls.monthly_trip_and_charging_statistics AS SELECT vin,month,
    average_charging_hours_per_month,total_charging_hours_per_month,charging_count,total_trip_hours_per_month,
    trip_count,average_trip_minutes_per_month
    FROM trip_charging_info
