In [3]:
# from vehicle import *
# from veh_stats import get_distance_driven
from collections import OrderedDict
from datetime import datetime, date, timedelta
import subprocess
import findspark
import sys
# import gmplot

import numpy as np
import pandas as pd

# spark location on namenode server
findspark.init("/usr/hdp/current/spark2-client")
import pyspark
from pyspark.sql import HiveContext

In [4]:
# CSV column position -> signal name for the fields extracted from each record.
# Position 0 (the VIN) is read separately in transform_to_tuple.
COL_NUM_DICT = {
    1: 'TDATE',
    2: 'SDATE',
    16: 'CCS_CHARGECUR',
    58: 'HCU_BATCHRGDSP',
    60: 'ICM_TOTALODOMETER',
    74: 'LON84',
    75: 'LAT84',
}

In [4]:
# Spark settings for the 'daily_trips' application: YARN master, client deploy
# mode, 40 executors with 3 cores and 10g of memory each.
conf = pyspark.SparkConf().setAll({
    'spark.app.name': 'daily_trips',
    'spark.master': 'yarn',
    'spark.submit.deployMode': 'client',
    'spark.executor.memory': '10g',
    'spark.memory.fraction': '0.7',
    'spark.executor.cores': '3',
    'spark.executor.instances': '40',
    'spark.yarn.am.memory': '20g',
}.items())
# Alternative local-master configuration, currently disabled:
# conf1 = pyspark.SparkConf().setAll([('spark.app.name', 'export_to_hive'),
#                                     ('spark.master', 'local'),
#                                     ('spark.executor.memory', '10g'),
#                                     ('spark.memory.fraction', '0.7'),
#                                     ('spark.executor.cores', '3')])

In [5]:
# getOrCreate() returns the context that is already running, if any, so this
# cell can be re-executed without failing because a SparkContext already exists.
# When a context is reused, `conf` is not applied again.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [6]:
# Display the SparkContext as the cell output to confirm it was created.
sc

In [6]:
def human_readable_time(t):
    """
    format a duration given in seconds as a compact string

    t is truncated with int(); the result looks like '07s', '05m55s' or
    '1h30m05s' depending on the magnitude
    """
    total = int(t)
    if total >= 3600:
        hours, remainder = divmod(total, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{}h{:02d}m{:02d}s'.format(hours, minutes, seconds)
    if total >= 60:
        minutes, seconds = divmod(total, 60)
        return '{:02d}m{:02d}s'.format(minutes, seconds)
    return '{:02d}s'.format(total)

In [7]:
def transform_to_tuple(line):
    """
    split one comma-separated record into a (vin, signals) pair

    vin is the first field of the line; signals maps every signal name in
    COL_NUM_DICT to the raw string found at its column position
    """
    parts = line.split(",")
    signals = {name: parts[int(position)] for position, name in COL_NUM_DICT.items()}
    return parts[0], signals

In [8]:
def filter_driving(df):
    """
    return a copy of the rows that look like driving records

    a row is kept when the odometer is positive, the charge current is below
    0.1 or missing, and the battery charge display is 0 or missing
    """
    charge_current = df['CCS_CHARGECUR']
    charge_display = df['HCU_BATCHRGDSP']

    odometer_positive = df['ICM_TOTALODOMETER'] > 0
    no_charge_current = charge_current.isna() | (charge_current < 0.1)
    no_charge_display = charge_display.isna() | (charge_display == 0)

    return df.loc[odometer_positive & no_charge_current & no_charge_display].copy()

In [10]:
def _trip_stats_helper(df, vin, max_disrupt=10):
    """
    split one vehicle's records into trips and summarise each trip

    Rows without a GPS value are dropped. A new trip starts at every row whose
    time gap to the previous row is longer than max_disrupt minutes; segments
    consisting of a single row are skipped. As a side effect the segments are
    plotted and written to "my_map_<vin>.html" in the working directory.

    df: records with TDATE, LAT84 and LON84 columns (assumes TDATE holds
        timestamps in ascending order -- TODO confirm against Vehicle)
    vin: vehicle id, stored in the 'vin' column and used in the map file name
    max_disrupt: longest gap, in minutes, that is still part of the same trip

    returns a DataFrame with one row per trip and the columns stime, etime,
    slat, slon, elat, elon, dist, vin, dura_sec and duration. When no trip is
    found the frame is empty and has no dura_sec / duration columns.

    NOTE(review): gmplot, colors and get_distance_driven are not defined in the
    visible cells (their imports are commented out) -- confirm they are in
    scope before running. colors[i] also assumes at least as many colors as
    there are segments.
    """
    # drop rows that have a null gps value
    df = df.dropna(subset=['LAT84', 'LON84']).reset_index(drop=True)

    res = {key: [] for key in ('stime', 'etime', 'slat', 'slon', 'elat', 'elon', 'dist')}

    if df.empty:
        return pd.DataFrame(res)

    # Time gap to the previous row in whole seconds; the first row has no
    # predecessor and gets a nominal 10 s. total_seconds() is used because
    # timedelta.seconds ignores the days component, which would make a gap of
    # a day or more look short and fail to start a new trip.
    gaps = df['TDATE'].diff().fillna(timedelta(seconds=10))
    df['tdiff'] = gaps.apply(lambda x: int(x.total_seconds()))

    # Segment boundaries: row 0, every row that follows a long gap, and the end
    # of the frame. If the last boundary is the final row, that row would form a
    # single-row segment (skipped below), so no end marker is needed.
    indices = df.index[df['tdiff'] > max_disrupt*60].tolist()
    indices.insert(0, 0)
    if indices[-1] < df.shape[0] - 1:
        indices.append(df.shape[0])

    gmap = gmplot.GoogleMapPlotter(np.mean(df['LAT84']), np.mean(df['LON84']), 13)
    for i in range(len(indices) - 1):
        lo = indices[i]
        hi = indices[i+1]

        # a single record is not a trip
        if lo + 1 == hi:
            continue

        first = df.iloc[lo]
        last = df.iloc[hi-1]
        res['stime'].append(first['TDATE'])
        res['slat'].append(first['LAT84'])
        res['slon'].append(first['LON84'])
        res['etime'].append(last['TDATE'])
        res['elat'].append(last['LAT84'])
        res['elon'].append(last['LON84'])

        df_seg = df.iloc[lo:hi, :].copy()
        res['dist'].append(get_distance_driven(df_seg))

        lats = df_seg['LAT84'].tolist()
        lons = df_seg['LON84'].tolist()
        gmap.scatter(lats, lons, color=colors[i], size=100, marker=False)
        gmap.marker(lats[0], lons[0], 'y') # start
        gmap.marker(lats[-1], lons[-1], 'r') # end

    gmap.draw("my_map_{}.html".format(vin))
    res_df = pd.DataFrame(res)
    res_df['vin'] = vin
    if res_df.empty:
        return res_df
    # trip duration in whole seconds, plus a human-readable rendering of it
    res_df['dura_sec'] = [int((end - start).total_seconds())
                          for start, end in zip(res_df['stime'], res_df['etime'])]
    res_df['duration'] = res_df['dura_sec'].apply(human_readable_time)
    return res_df

____

In [None]:
# Load a local one-day file through the project's Vehicle wrapper.
df = Vehicle('ag_20150121.csv').df

# Keep only the signal columns used by the trip helpers, plus the VIN.
# dict.values() is a view on Python 3 and cannot be concatenated with a list,
# so it is materialised first; list(...) behaves the same on Python 2.
df = df[list(COL_NUM_DICT.values()) + ['VIN']]

In [44]:
# Spot check: run the trip helper on the records of a single vehicle.
vin = 'LMGGN1S54E1000027'
df1 = df.loc[df['VIN'] == vin]
tt = _trip_stats_helper(df1.copy(), vin)
tt

(2, 9)


Unnamed: 0,dist,elat,elon,etime,slat,slon,stime,vin


In [None]:
# Trip statistics for every vehicle in the local sample. The per-vehicle frames
# are collected in a list and concatenated once; calling pd.concat inside the
# loop re-copies the growing result on every iteration.
vins = df['VIN'].unique()
trip_frames = []
for vin in vins:
    print(vin)
    df1 = df[df['VIN'] == vin]
    temp = _trip_stats_helper(df1.copy(), vin)
    if not temp.empty:
        trip_frames.append(temp)
# pd.concat raises on an empty list, so fall back to an empty frame
tt = pd.concat(trip_frames) if trip_frames else pd.DataFrame()
tt

_____

In [11]:
def compute_trip_stats(x):
    """
    compute trip statistics for one vehicle

    x: (vin, records) pair where records is an iterable of signal dicts, as
       produced by map(transform_to_tuple).groupByKey()

    returns (veh.vin, trips) where trips is the trip DataFrame converted with
    DataFrame.to_dict()
    """
    vin = x[0]
    signals = pd.DataFrame(list(x[1]))
    signals['VIN'] = vin

    # per the original author's note, Vehicle converts epoch time to timestamps,
    # sorts by tdate and drops duplicates on initialisation
    veh = Vehicle(signals)

    # keep driving records only
    driving = filter_driving(veh.df)

    trips = _trip_stats_helper(driving.copy(), vin)
    res = trips.to_dict()
    print(vin)
    return veh.vin, res

In [None]:
# Reuse the running SparkContext if there is one, otherwise create it from conf.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [58]:
# Register the zipped helper code with Spark so that tasks can import it.
# NOTE(review): presumably this archive provides Vehicle and the other helpers
# used in compute_trip_stats -- confirm. The path is absolute and user-specific.
sc.addPyFile('/home/stang/user-profile/stats-spark/veh.zip')

In [65]:
# Day to process; the value is substituted into the per-day HDFS file name.
d = 20150201
data_file = 'hdfs://namenode:8020/data/ag/by-day/ag_{}.csv'.format(d)

In [66]:
# Keep only lines with 85 or 86 comma-separated fields (i.e. 84 or 85 commas),
# then group the selected signals by VIN and compute trip statistics per vehicle.
rdd = sc.textFile(data_file).filter(lambda line: line.count(',') in (84, 85))
res = (rdd.map(transform_to_tuple)
          .groupByKey()
          .map(compute_trip_stats)
          .collect())

In [69]:
# Rebuild one DataFrame per vehicle from the collected dicts and stack them
# with a single concat; concatenating inside the loop re-copies the growing
# result on every iteration.
vals = OrderedDict(res).values()
frames = [pd.DataFrame(val) for val in vals]
frames = [frame for frame in frames if not frame.empty]
# pd.concat raises on an empty list, so fall back to an empty frame
ff = pd.concat(frames) if frames else pd.DataFrame()
ff

Unnamed: 0,dist,dura_sec,duration,elat,elon,etime,slat,slon,stime,vin
0,6.0,1234,20m34s,22.944952,113.36083,2015-02-01 11:03:46+08:00,22.930046,113.41049,2015-02-01 10:43:12+08:00,LMGGN1S52E1000186
1,1.0,355,05m55s,22.94741,113.36835,2015-02-01 12:24:36+08:00,22.944824,113.36059,2015-02-01 12:18:41+08:00,LMGGN1S52E1000186
2,7.0,1528,25m28s,22.928423,113.41124,2015-02-01 14:26:02+08:00,22.947308,113.36842,2015-02-01 14:00:34+08:00,LMGGN1S52E1000186
3,0.0,71,01m11s,22.930042,113.41052,2015-02-01 17:35:50+08:00,22.928415,113.411125,2015-02-01 17:34:39+08:00,LMGGN1S52E1000186
0,8.0,702,11m42s,23.118774,113.3359,2015-02-01 00:21:02+08:00,23.10422,113.364815,2015-02-01 00:09:20+08:00,LMGGN1S53E1000035
0,0.0,0,00s,23.162819,113.26866,2015-02-01 10:08:09+08:00,23.162819,113.26866,2015-02-01 10:08:09+08:00,LMGGN1S51E1000194
1,0.0,436,07m16s,23.162992,113.268715,2015-02-01 10:32:47+08:00,23.162863,113.26865,2015-02-01 10:25:31+08:00,LMGGN1S51E1000194
2,51.0,5405,1h30m05s,23.029043,113.448395,2015-02-01 12:17:48+08:00,23.162886,113.26878,2015-02-01 10:47:43+08:00,LMGGN1S51E1000194
3,10.0,1710,28m30s,23.036777,113.427956,2015-02-01 13:25:17+08:00,23.029097,113.44844,2015-02-01 12:56:47+08:00,LMGGN1S51E1000194
4,0.0,212,03m32s,23.036676,113.4281,2015-02-01 14:37:01+08:00,23.036741,113.42814,2015-02-01 14:33:29+08:00,LMGGN1S51E1000194


________