In [1]:
import findspark
findspark.init('/usr/hdp/current/spark2-client')

import pyspark
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import math
import seaborn as sns
import pandas as pd
%matplotlib inline
from pyspark.sql.functions import lit, col, instr, expr, pow, round, bround, corr, count, mean, stddev_pop, min, max
from pyspark.sql.functions import monotonically_increasing_id, initcap, lower, upper, ltrim, rtrim, rpad, lpad, trim
from pyspark.sql.functions import regexp_replace, translate, regexp_extract, current_date, current_timestamp, struct
from pyspark.sql.functions import date_add, date_sub, datediff, months_between, to_date, to_timestamp, coalesce, split, size
from pyspark.sql.functions import array_contains, explode, udf
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when

from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType, LongType

In [2]:
def get_Spark():

    conf = pyspark.SparkConf().setAll([
        ('spark.submit.deployMode', 'client'), # deploy in yarn-client or yarn-cluster
        ('spark.executor.memory', '10g'),       # memory allocated for each executor
        ('spark.executor.cores', '10'),         # number of cores for each executor
        ('spark.executor.instances', '10'),    # number of executors in total
        ('spark.yarn.am.memory', '20g')])      # memory for spark driver (application master)
    spark = SparkSession.builder \
    .master("yarn") \
    .appName("sensor") \
    .enableHiveSupport() \
    .config(conf = conf) \
    .getOrCreate()

    return spark

spark = get_Spark()
spark_context = spark.sparkContext
hc = HiveContext(spark_context)

In [3]:
def hive2spark(hc, query):
    spark_df = hc.sql("""{}""".format(query))
    return spark_df

In [4]:
a26_query = "select * from guobiao_tsp_tbls.recent_abnormal_sensor_ranking where day='20200704'"
df_a26 = hc.sql(a26_query).toPandas()
df_a26.head()

Unnamed: 0,vin,sensor_id,frequency,day,error_code
0,LMGHP1S54J1003258,3,15,20200704,2
1,LMWHP1S81J1001935,7,14,20200704,2
2,LMGHP1S59J1003417,3,13,20200704,2
3,LMWHP1S81J1001935,8,13,20200704,2
4,LMWHP1S23J1001897,7,10,20200704,2


In [5]:
vtp_query="select * from guobiao_tsp_tbls.vintypes"
vintype_df = hc.sql(vtp_query).toPandas()
vintype_df.head()

Unnamed: 0,vin,vintype,start_day,latest_day
0,LMGGN1S56F1000483,AG.72,2018-11-08,2019-06-24
1,LMGGN1S52F1001114,AG.79,2018-11-08,2019-06-24
2,LMGGN1S50G1001761,AG.79,2019-06-01,2019-06-23
3,LMGFJ1S85H1S00176,A51,2017-08-10,2018-05-07
4,LMGAJ1S88H1S00215,A2APHEV,2017-08-07,2019-02-16


In [6]:
time_query="select * from guobiao_tsp_tbls.ge3_daily_temperature_aggregation_result where day > '2020-05-01'"
time_df = hc.sql(time_query).toPandas()
time_df.head()

Unnamed: 0,vin,time_period,error_code,sensor_ids,severity_level,day
0,LMGHP1S53J1002814,2020-06-14 14:00,2,39,0.16,2020-06-14
1,LMGHP1S54H1000418,2020-06-14 10:00,3,1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|1...,0.01,2020-06-14
2,LMGHP1S55H1001366,2020-06-14 16:00,2,39,0.18,2020-06-14
3,LMGHP1S59J1003417,2020-06-14 01:00,2,3,0.81,2020-06-14
4,LMGHP1S86H1001497,2020-06-14 15:00,1,unknown,1.0,2020-06-14


In [7]:
tmp_time_df = pd.merge(time_df, vintype_df, on='vin', how='left')
tmp_time_df.head()

Unnamed: 0,vin,time_period,error_code,sensor_ids,severity_level,day,vintype,start_day,latest_day
0,LMGHP1S53J1002814,2020-06-14 14:00,2,39,0.16,2020-06-14,A5HEV,2017-12-09,2020-07-15
1,LMGHP1S54H1000418,2020-06-14 10:00,3,1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|1...,0.01,2020-06-14,A5HEV,2017-09-06,2020-07-15
2,LMGHP1S55H1001366,2020-06-14 16:00,2,39,0.18,2020-06-14,A5HEV,2017-11-13,2020-07-15
3,LMGHP1S59J1003417,2020-06-14 01:00,2,3,0.81,2020-06-14,A5HEV,2018-02-27,2020-07-15
4,LMGHP1S86H1001497,2020-06-14 15:00,1,unknown,1.0,2020-06-14,A5HEV,2017-11-30,2020-07-15


In [8]:
pd.set_option('display.max_columns', None)
print(tmp_time_df)

                    vin       time_period  error_code  \
0     LMGHP1S53J1002814  2020-06-14 14:00           2   
1     LMGHP1S54H1000418  2020-06-14 10:00           3   
2     LMGHP1S55H1001366  2020-06-14 16:00           2   
3     LMGHP1S59J1003417  2020-06-14 01:00           2   
4     LMGHP1S86H1001497  2020-06-14 15:00           1   
5     LMGHP1S86H1001497  2020-06-14 17:00           1   
6     LMGHP1S51H1001090  2020-05-04 13:00           3   
7     LMGHP1S85H1000664  2020-05-04 11:00           3   
8     LMWHP1S8XJ1S00045  2020-05-17 13:00           3   
9     LMGHP1S58J1002081  2020-05-20 17:00           3   
10    LMGHP1S54J1003163  2020-06-13 17:00           2   
11    LMGHP1S57H1001210  2020-06-13 16:00           3   
12    LMGHP1S86H1000141  2020-06-13 21:00           1   
13    LMGHP1S86H1001497  2020-06-13 08:00           1   
14    LMGHP1S86H1001497  2020-06-13 14:00           1   
15    LMGHP1S86H1001497  2020-06-13 15:00           1   
16    LMGHP1S55J1001941  2020-0

In [33]:
tmp_time_df.loc[tmp_time_df['vin']=='LMGHP1S51J1003895'].sort_values('time_period', ascending=False)

Unnamed: 0,vin,time_period,error_code,sensor_ids,severity_level,day,vintype,start_day,latest_day
3566,LMGHP1S51J1003895,2020-07-13 14:00,2,23,0.83,2020-07-13,A5HEV,2018-03-19,2020-07-13
5323,LMGHP1S51J1003895,2020-07-12 17:00,2,23,0.35,2020-07-12,A5HEV,2018-03-19,2020-07-13
4642,LMGHP1S51J1003895,2020-07-01 14:00,2,23,1.0,2020-07-01,A5HEV,2018-03-19,2020-07-13
3977,LMGHP1S51J1003895,2020-06-30 21:00,2,23,1.0,2020-06-30,A5HEV,2018-03-19,2020-07-13
3976,LMGHP1S51J1003895,2020-06-30 09:00,2,23,1.0,2020-06-30,A5HEV,2018-03-19,2020-07-13
4567,LMGHP1S51J1003895,2020-06-29 20:00,2,23,0.72,2020-06-29,A5HEV,2018-03-19,2020-07-13
4566,LMGHP1S51J1003895,2020-06-27 20:00,2,23,0.69,2020-06-27,A5HEV,2018-03-19,2020-07-13
2216,LMGHP1S51J1003895,2020-06-19 07:00,2,23,0.74,2020-06-19,A5HEV,2018-03-19,2020-07-13
428,LMGHP1S51J1003895,2020-06-18 20:00,2,23,0.33,2020-06-18,A5HEV,2018-03-19,2020-07-13
227,LMGHP1S51J1003895,2020-06-11 09:00,2,23,0.33,2020-06-11,A5HEV,2018-03-19,2020-07-13


In [211]:
vin='LMGHP1S86H1000141'
start_day = '20200601'
end_day = '20200709'
sensor_id = 'sensor_39'
#sensor_id2 = 'sensor_18'

In [216]:
def get_temp_range(vin, start_day, end_day, sensor_id):
    import time
    stime = time.time()  
    
    query = "select vin, esd_sc_temp_list as esd_sc_temp_list from guobiao_tsp_tbls.guobiao_raw_orc \
    where vin = '{}' and day >= '{}' and day <= '{}'".format(vin, start_day, end_day)
    df = hc.sql(query).toPandas()
    
    num_modules = 40 #36 for A74, 40 for A5HEV
    temperature_list = ["sensor_" + str(s) for s in range(1, num_modules+1)]
    
    df = df.rename({"guobiao_raw_orc.esd_sc_temp_list":"esd_sc_temp_list"})
    
    df[temperature_list] = df.esd_sc_temp_list.str.split("|",expand=True) 
    #print(df[temperature_list])
    
    null_signals = np.all(pd.isnull(df[temperature_list]), axis = 1)
    
    #df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=[temperature_list], how="all")
    
    try:
        df[sensor_id] = df[sensor_id].loc[~null_signals].astype(np.float)
    except ValueError, e:
        print df
        
    min_value, max_value = df[sensor_id].min(), df[sensor_id].nlargest(3) #df[sensor_id].max()
    #min_value2, max_value2 = df[sensor_id2].min(), df[sensor_id2].max()
    
    etime = time.time()
    print "Finished in {} seconds".format(etime - stime)
    
    return min_value, max_value #, min_value2, max_value2

In [217]:
get_temp_range(vin, start_day, end_day, sensor_id) #, sensor_id2)

Finished in 27.9512879848 seconds


(19.0, 34     206.0
 488    206.0
 994    206.0
 Name: sensor_39, dtype: float64)

In [138]:
query = "select vin, esd_sc_temp_list as esd_sc_temp_list from guobiao_tsp_tbls.guobiao_raw_orc \
where vin = '{}' and day >= '{}' and day <= '{}'".format(vin, start_day, end_day)
df = hc.sql(query).toPandas()

In [142]:
num_modules = 36 #36 for A74, 40 for A5HEV
temperature_list = ["sensor_" + str(s) for s in range(1, num_modules+1)]

In [143]:
df[temperature_list] = df.esd_sc_temp_list.str.split("|",expand=True)

In [144]:
null_signals = np.all(pd.isnull(df[temperature_list]), axis = 1)

In [147]:
df[temperature_list]

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27,sensor_28,sensor_29,sensor_30,sensor_31,sensor_32,sensor_33,sensor_34,sensor_35,sensor_36
0,35,35,35,35,35,35,35,35,35,35,35,36,35,35,35,35,35,35,35,35,34,35,35,36,35,35,35,35,35,35,35,35,35,35,35,35
1,31,30,31,30,31,31,31,30,31,31,30,30,31,31,31,31,31,31,31,30,30,31,31,31,31,30,31,31,30,31,31,31,31,31,31,31
2,35,35,35,35,35,35,35,35,35,35,35,36,35,35,34,35,35,35,35,34,34,35,35,36,35,35,35,35,35,35,35,34,35,34,35,35
3,32,32,32,32,32,32,32,32,32,32,32,33,32,32,32,32,32,32,32,32,32,32,32,33,32,32,32,32,32,32,32,32,32,32,32,32
4,35,35,35,35,35,36,35,35,35,35,35,36,36,36,35,35,35,35,35,35,35,35,35,37,35,36,35,35,35,35,35,35,35,35,35,35
5,34,35,35,35,35,35,34,34,34,35,35,35,35,35,34,34,34,34,34,34,34,35,34,36,35,35,34,35,34,35,35,34,34,34,34,35
6,35,35,35,35,35,36,35,35,35,35,35,36,36,36,35,35,35,35,35,35,35,35,35,37,35,36,35,35,35,35,35,35,35,35,35,35
7,34,34,34,34,34,34,34,33,34,34,34,35,34,34,34,34,34,34,33,33,33,34,34,35,34,34,34,34,34,34,34,34,34,33,34,34
8,35,35,35,35,35,35,35,35,35,35,35,36,36,35,35,35,35,35,35,35,35,35,35,36,35,35,35,35,35,35,35,35,35,35,35,35
9,34,35,34,35,34,35,34,34,34,35,34,35,35,35,34,34,34,34,34,34,34,35,34,36,34,35,34,35,34,34,35,34,34,34,34,35


In [166]:
df[temperature_list] = df[temperature_list].loc[~null_signals].astype(np.float)

ValueError: could not convert string to float: 

In [None]:


    
    min_value, max_value = df[sensor_id].min(), df[sensor_id].max()
    
    etime = time.time()
    print "Finished in {} seconds".format(etime - stime)