In [1]:
import findspark
findspark.init('/usr/hdp/current/spark2-client')

import pyspark
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import math
import seaborn as sns
import pandas as pd
%matplotlib inline
from pyspark.sql.functions import lit, col, instr, expr, pow, round, bround, corr, count, mean, stddev_pop, min, max
from pyspark.sql.functions import monotonically_increasing_id, initcap, lower, upper, ltrim, rtrim, rpad, lpad, trim
from pyspark.sql.functions import regexp_replace, translate, regexp_extract, current_date, current_timestamp, struct
from pyspark.sql.functions import date_add, date_sub, datediff, months_between, to_date, to_timestamp, coalesce, split, size
from pyspark.sql.functions import array_contains, explode, udf
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when

from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType, LongType

In [2]:
def get_Spark():
    """Build (or fetch) the Hive-enabled SparkSession used by this notebook.

    The session targets the "yarn" master under the application name
    "sensor"; deploy mode, executor sizing and application-master memory
    come from the settings listed below.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    settings = [
        ('spark.submit.deployMode', 'client'),  # deploy in yarn-client or yarn-cluster
        ('spark.executor.memory', '10g'),       # memory allocated for each executor
        ('spark.executor.cores', '10'),         # number of cores for each executor
        ('spark.executor.instances', '10'),     # number of executors in total
        ('spark.yarn.am.memory', '20g'),        # memory for spark driver (application master)
    ]
    spark_conf = pyspark.SparkConf().setAll(settings)

    # Same builder steps, in the same order, as one chained expression would give.
    builder = SparkSession.builder.master("yarn").appName("sensor")
    builder = builder.enableHiveSupport().config(conf=spark_conf)
    return builder.getOrCreate()

# Create the shared Spark session once; the query cells in this notebook go through `hc`.
spark = get_Spark()
spark_context = spark.sparkContext
# NOTE(review): get_Spark() already calls enableHiveSupport() -- confirm a separate
# HiveContext is still required rather than querying through `spark` directly.
hc = HiveContext(spark_context)

In [3]:
def hive2spark(hc, query):
    """Submit ``query`` through ``hc.sql`` and return whatever that call yields.

    Args:
        hc: object exposing a ``sql(text)`` method (this notebook passes its
            HiveContext).
        query: the query; it is rendered to text with ``str.format`` before
            being submitted.

    Returns:
        The value returned by ``hc.sql`` for the rendered query.
    """
    rendered = "{}".format(query)
    return hc.sql(rendered)

In [5]:
# Abnormal-sensor ranking rows where day='20200709', pulled into pandas.
a26_query = "select * from a26_tsp_tbls.recent_abnormal_sensor_ranking where day='20200709'"
df_a26 = hive2spark(hc, a26_query).toPandas()
df_a26.head(n=5)

Unnamed: 0,vin,sensor_id,frequency,day,error_code
0,LNAA2AA16K5016664,10,3,20200709,4
1,LNAA2AA16K5016664,11,3,20200709,4
2,LNAA2AA16K5016664,12,3,20200709,4
3,LNAA2AA16K5016664,13,3,20200709,4
4,LNAA2AA16K5016664,14,3,20200709,4


In [6]:
# Full vin -> vintype lookup table, pulled into pandas.
vtp_query = "select * from a26_tsp_tbls.vintypes"
vintype_df = hive2spark(hc, vtp_query).toPandas()
vintype_df.head(n=5)

Unnamed: 0,vin,vintype,start_day,latest_day
0,LNAA2AA15K5027400,A26,2019-12-11,2020-06-22
1,LNAA3AA16L5400412,A26,2020-06-11,2020-06-22
2,LNAA2AA18L5046170,A26,2020-06-18,2020-06-22
3,LNAA2AA19K5027318,A26,2019-12-06,2020-06-22
4,LNAA2AA15L5045400,A26,2020-06-03,2020-06-22


In [7]:
# Load the per-vin voltage-cell counts and fold them into the vintype label,
# producing values such as "A26_92".
query="select * from a26_tsp_tbls.numbers_voltage_cells"
number_of_cells_df = hc.sql(query).toPandas()

# Cast to int first so the count is rendered as a plain integer when turned into text below.
number_of_cells_df["number_of_cells"] = number_of_cells_df["number_of_cells"].astype(int)

# Only A26 / A12 vehicles that also have a cell count survive the inner join.
tmp = pd.merge(vintype_df.loc[vintype_df["vintype"].isin(['A26','A12'])], number_of_cells_df, on='vin', how='inner')

# Vectorised string concatenation replaces the per-row '_'.join over axis=1;
# both operands are strings here (vintype is filtered above, the count is cast).
tmp['vintype'] = tmp['vintype'] + '_' + tmp['number_of_cells'].astype(str)
# Reassign instead of dropping in place so the cell produces a fresh frame.
tmp = tmp.drop('number_of_cells', axis=1)
tmp.head()

Unnamed: 0,vin,vintype,start_day,latest_day
0,LNAA2AA15K5027400,A26_92,2019-12-11,2020-06-22
1,LNAA3AA16L5400412,A26_92,2020-06-11,2020-06-22
2,LNAA2AA18L5046170,A26_90,2020-06-18,2020-06-22
3,LNAA2AA19K5027318,A26_92,2019-12-06,2020-06-22
4,LNAA2AA15L5045400,A26_90,2020-06-03,2020-06-22


In [8]:
# Keep only ranking rows whose vin has a vintype label (inner join on vin).
tmp_df = df_a26.merge(tmp, on='vin', how='inner')
tmp_df.head(n=5)

Unnamed: 0,vin,sensor_id,frequency,day,error_code,vintype,start_day,latest_day
0,LNAA2AA16K5016664,10,3,20200709,4,A26_90,2019-10-13,2020-07-13
1,LNAA2AA16K5016664,11,3,20200709,4,A26_90,2019-10-13,2020-07-13
2,LNAA2AA16K5016664,12,3,20200709,4,A26_90,2019-10-13,2020-07-13
3,LNAA2AA16K5016664,13,3,20200709,4,A26_90,2019-10-13,2020-07-13
4,LNAA2AA16K5016664,14,3,20200709,4,A26_90,2019-10-13,2020-07-13


In [11]:
# Order the ranking rows from highest to lowest frequency.
tmp_df = tmp_df.sort_values('frequency', ascending=False)

In [12]:
# Lift the pandas row-display cap so the whole frame is printed.
pd.options.display.max_rows = None
print(tmp_df)

                  vin sensor_id  frequency       day  error_code vintype  \
20  LNAA2AA11K5017852        27         29  20200709           5  A26_90   
21  LNAA2AA11K5017852        28         29  20200709           5  A26_90   
38  LNAA2AA15K5013819         7         29  20200709           5  A26_90   
25  LNAA2AA17K5014194        26         29  20200709           5  A26_90   
24  LNAA2AA17K5014194        25         29  20200709           5  A26_90   
23  LNAA2AA13K5014595        20         29  20200709           5  A26_90   
22  LNAA2AA13K5014595        19         29  20200709           5  A26_90   
39  LNAA2AA15K5013819         8         29  20200709           5  A26_90   
27  LNAA2AA19K5014858        28         28  20200709           5  A26_90   
26  LNAA2AA19K5014858        27         28  20200709           5  A26_90   
31  LNAA2AA19K5017789        14         27  20200709           5  A26_90   
30  LNAA2AA19K5017789        13         27  20200709           5  A26_90   
29  LNAA2AA1

In [13]:
# Daily temperature aggregation rows with day after '2020-05-01', pulled into pandas.
time_query = "select * from a26_tsp_tbls.a26_daily_temperature_aggregation_result where day > '2020-05-01'"
time_df = hive2spark(hc, time_query).toPandas()
time_df.head(n=5)

Unnamed: 0,vin,time_period,error_code,sensor_ids,severity_level,day
0,LNAB2AB3XL5502562,2020-06-12 07:00,1,unknown,0.21,2020-06-12
1,LNAB2AB3XL5502562,2020-06-12 08:00,1,unknown,0.21,2020-06-12
2,LNAB2AB3XL5502562,2020-06-12 12:00,1,unknown,0.21,2020-06-12
3,LNAB2AB3XL5502562,2020-06-12 14:00,1,unknown,0.21,2020-06-12
4,LNAB2AB3XL5502562,2020-06-12 15:00,1,unknown,0.21,2020-06-12


In [14]:
# Attach the aggregated temperature records to every ranking row with the same vin.
# Columns present in both frames (day, error_code) come back suffixed _x / _y.
tmp_time_df = tmp_df.merge(time_df, on='vin', how='left')
tmp_time_df.head(n=5)

Unnamed: 0,vin,sensor_id,frequency,day_x,error_code_x,vintype,start_day,latest_day,time_period,error_code_y,sensor_ids,severity_level,day_y
0,LNAA2AA11K5017852,27,29,20200709,5,A26_90,2019-10-16,2020-07-13,2020-06-17 00:00,5.0,27|28,0.57,2020-06-17
1,LNAA2AA11K5017852,27,29,20200709,5,A26_90,2019-10-16,2020-07-13,2020-06-17 01:00,5.0,27|28,0.49,2020-06-17
2,LNAA2AA11K5017852,27,29,20200709,5,A26_90,2019-10-16,2020-07-13,2020-06-17 02:00,5.0,27|28,0.62,2020-06-17
3,LNAA2AA11K5017852,27,29,20200709,5,A26_90,2019-10-16,2020-07-13,2020-06-17 03:00,5.0,27|28,0.69,2020-06-17
4,LNAA2AA11K5017852,27,29,20200709,5,A26_90,2019-10-16,2020-07-13,2020-06-17 09:00,5.0,27|28,0.75,2020-06-17


In [16]:
# Lift the pandas column-display cap so every column of the merged frame is printed.
pd.options.display.max_columns = None
print(tmp_time_df)

                    vin sensor_id  frequency     day_x  error_code_x vintype  \
0     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
1     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
2     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
3     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
4     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
5     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
6     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
7     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
8     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
9     LNAA2AA11K5017852        27         29  20200709             5  A26_90   
10    LNAA2AA11K5017852        27         29  20200709             5  A26_90   
11    LNAA2AA11K5017852        27       

In [30]:
# All merged rows for one vehicle, ordered by time_period descending.
(
    tmp_time_df
    .loc[lambda frame: frame['vin'] == 'LNAA2AA11K5019312']
    .sort_values('time_period', ascending=False)
)

Unnamed: 0,vin,sensor_id,frequency,day_x,error_code_x,vintype,start_day,latest_day,time_period,error_code_y,sensor_ids,severity_level,day_y
8841,LNAA2AA11K5019312,19,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 22:00,5.0,19|20,0.49,2020-07-09
8555,LNAA2AA11K5019312,20,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 22:00,5.0,19|20,0.49,2020-07-09
8840,LNAA2AA11K5019312,19,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 21:00,5.0,19|20,0.48,2020-07-09
8554,LNAA2AA11K5019312,20,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 21:00,5.0,19|20,0.48,2020-07-09
8553,LNAA2AA11K5019312,20,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 20:00,5.0,19|20,0.54,2020-07-09
8839,LNAA2AA11K5019312,19,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 20:00,5.0,19|20,0.54,2020-07-09
8838,LNAA2AA11K5019312,19,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 19:00,5.0,19|20,0.64,2020-07-09
8552,LNAA2AA11K5019312,20,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 19:00,5.0,19|20,0.64,2020-07-09
8837,LNAA2AA11K5019312,19,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 18:00,5.0,19|20,0.72,2020-07-09
8551,LNAA2AA11K5019312,20,25,20200709,5,A26_90,2019-10-21,2020-07-13,2020-07-09 18:00,5.0,19|20,0.72,2020-07-09


In [52]:
# Inputs for the get_temp_range lookup.
vin = 'LNAA2AA11K5019312'                    # vehicle to inspect
start_day, end_day = '20200601', '20200709'  # day bounds, both compared inclusively
sensor_id = 'sensor_20'                      # probe column whose range is reported

In [53]:
def get_temp_range(vin, start_day, end_day, sensor_ids):
    """Return the minimum and maximum probe temperature for one vehicle.

    Rows for ``vin`` with ``start_day <= day <= end_day`` are read through the
    notebook-level ``hc`` context. Each row's ``esd_temp_probe_list`` value is
    split on "|" into the columns ``sensor_1`` .. ``sensor_32`` and converted
    to float; rows where every probe value is null are left as NaN.

    Args:
        vin: vehicle identifier placed in the query's ``vin = '...'`` filter.
        start_day: lower day bound (inclusive), as stored in the table.
        end_day: upper day bound (inclusive), as stored in the table.
        sensor_ids: probe column label to report, e.g. ``'sensor_20'``. A list
            of labels also works and yields per-column Series.

    Returns:
        Tuple ``(min_value, max_value)`` taken over the selected column(s).

    Side effects:
        Prints the elapsed wall-clock time of the lookup.
    """
    import time
    stime = time.time()

    # NOTE(review): the filter values are interpolated straight into the SQL
    # text, so only trusted values should be passed in.
    query = "select vin, esd_temp_probe_list from a26_tsp_tbls.a26_gb_orc \
    where vin = '{}' and day >= '{}' and day <= '{}'".format(vin, start_day, end_day)
    df = hc.sql(query).toPandas()

    num_modules = 32
    temperature_list = ["sensor_" + str(s) for s in range(1, num_modules+1)]

    # One column per probe; the split is expected to produce num_modules fields.
    df[temperature_list] = df.esd_temp_probe_list.str.split("|",expand=True)

    probes = df[temperature_list]
    null_signals = pd.isnull(probes).all(axis=1)
    # Builtin float replaces the np.float alias, which newer NumPy no longer provides.
    # All-null rows are excluded from the cast and come back as NaN on assignment.
    df[temperature_list] = probes.loc[~null_signals].astype(float)

    # Use the caller's argument; the original read a notebook-level `sensor_id`
    # global here and silently ignored the parameter.
    min_value, max_value = df[sensor_ids].min(), df[sensor_ids].max()

    etime = time.time()
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Finished in {} seconds".format(etime - stime))

    return min_value, max_value

In [54]:
# Temperature range of the chosen probe for the vehicle and day window set above.
get_temp_range(vin=vin, start_day=start_day, end_day=end_day, sensor_ids=sensor_id)

Finished in 60.8854222298 seconds


(25.0, 37.0)