In [1]:
import pandas as pd

In [2]:
# Show up to 100 columns when a wide DataFrame is rendered.
pd.set_option('display.max_columns', 100)

In [3]:
def load_header_config(config_file, do_save=False):
    """
    Load header and configuration from aglog_header file.

    Parameters
    ----------
    config_file: str, required
        the file that provides header and data type information,
        e.g. aglog_header.csv. It is read as CSV with the column 'No' as
        index and must contain the columns 'Column' and 'Type'.
    do_save: boolean, default False
        if True, dump the resulting configs dict as JSON to 'config.json'
        in the current working directory.

    Returns
    -------
    header: list of str, each represents name of a feature (whitespace
        stripped), in file order. Every row contributes a name, including
        rows without a type.
    configs: dict, key is the name of feature, val is another dict with
        keys 'type' and 'stats'. Only rows whose type is 'numerical' or
        'categorical' get an entry, e.g.
        {'feature1': {'type': 'numerical',
                      'stats': ['avg', 'max', 'min']}}

    Raises
    ------
    Exception
        Any error raised while reading or parsing the file is re-raised
        unchanged after printing "Error reading <config_file>".
    """
    # Imported here so do_save=True works even if the notebook cell that
    # imports json has not been executed yet.
    import json

    # Statistics to compute for each supported feature type.
    stats_by_type = {
        'numerical': ['avg', 'max', 'min'],
        'categorical': ['histogram'],
    }

    configs = {}
    header = []
    try:
        spec = pd.read_csv(config_file, header=0, index_col='No')
        for _, row in spec.iterrows():
            var = row['Column'].strip()
            header.append(var)
            # Rows without a type still belong to the header but carry no config.
            if pd.isnull(row['Type']):
                continue
            var_type = row['Type'].strip()
            if var_type in stats_by_type:
                # Copy the list so entries never share one mutable object.
                configs[var] = {'type': var_type,
                                'stats': list(stats_by_type[var_type])}
    except Exception:
        # format() instead of '+' so a non-str path cannot mask the real error.
        print("Error reading {}".format(config_file))
        raise

    if do_save:
        config_json_file = 'config.json'
        with open(config_json_file, 'w') as configf:
            json.dump(configs, configf)
    return header, configs

In [4]:
# Column names (and per-feature config) come from the header description file.
header, config = load_header_config(config_file='aglog_header.csv')

In [5]:
# Part 0 of the d=20180329 export; read with header=None, so column names are taken from `header`.
part0_path = '/gaei/gacrnd/data/csv/d=20180329/part-00000-81ada67b-bb15-4ab6-b9c9-2815b8e77d17.c000.csv'
df = pd.read_csv(part0_path, names=header, header=None, index_col=False)

In [10]:
# Preview the first rows of the frame read from part-00000.
df.head()

Unnamed: 0,VIN,TDATE,SDATE,BMS_BATTST,BMS_BATTCURR,BMS_BATTVOLT,BMS_INSULATIONST,BMS_INSULATIONRES,BMS_CELLVOLTMAX,BMS_CELLVOLTMIN,BMS_FAILURELVL,BMS_BATTSOC,BMS_BATTTEMPAVG,BMS_BATTTEMPMAX,BMS_BATTTEMPMIN,CCS_CHARGEVOLT,CCS_CHARGECUR,CCS_CHARGERSTARTST,VCU_SYSFAILMODE,MCU_FTM_ACTROTSPD,MCU_FTM_ACTTORQ,MCU_FTM_STMODE,MCU_FTM_MOTORACTTEMP,MCU_FTM_ROTORACTTEMP,MCU_FTM_INVERTERACTTEMP,MCU_FTM_ACTHV_CUR,MCU_FTM_ACTHV_VOLT,MCU_FTM_FAULT_INFO1,MCU_FTM_FAULT_INFO2,MCU_DCDC_FAILST,MCU_DCDC_STMODE,VCU_DCDC_STMODELREQ,BMS_BAT_ERROR_SOC_L,BMS_BAT_ERROR_CELL_V_H,BMS_BAT_ERROR_CELL_V_L,BMS_BAT_ERROR_PACK_SUMV_H,BMS_BAT_ERROR_PACK_SUMV_L,BMS_BAT_ERROR_CELL_T_H,BMS_BAT_ERROR_T_UNBALANCE,MCU_GM_FAILST,MCU_FTM_FAILST,MCU_FTM_FAULT_INFO3,MCU_GM_ACTROTSPD,MCU_GM_ACTTORQ,MCU_GM_STMODE,MCU_GM_MOTORACTTEMP,MCU_GM_ROTORACTTEMP,MCU_GM_INVERTERACTTEMP,MCU_GM_ACTHV_CUR,MCU_GM_ACTHV_VOL,EMS_ENGTORQ,EMS_ENGSPD,EMS_ACCPEDALPST,EMS_BRAKEPEDALST,EMS_ENGWATERTEMP,HCU_GEARFORDSP,HCU_OILPRESSUREWARN,HCU_AVGFUELCONSUMP,HCU_BATCHRGDSP,BCS_VEHSPD,ICM_TOTALODOMETER,BCM_KEYST,HCU_DSTOIL,HCU_DSTBAT,EMS_FAULTRANKSIG,SRS_CRASHOUTPUTST,SRS_DRIVERSEATBELTST,EDC_STERRLVLCOM,EDC_STERRLVLCOMSUP,EDB_STERRLVLHVES,EDG_STERRLVLGEN,EDM_STERRLVLMOT,EDE_STERRLVLENG,EDV_STERRLVLVEH,LON84,LAT84,LON02,LAT02,BCS_ABSFAULTST,BCS_EBDFAULTST,MCU_DCDC_ACTTEMP,BMS_HVILST,HCU_HEVSYSREADYST,BMS_BALANCEST,GPS_HEADING
0,LMGGN1S58F1000453,1522265026587,20180329,4.0,-31.0,336.0,0.0,1022.0,3.24,3.2,0.0,26.2,33.5,37.0,30.0,,,,0.0,4694.0,3.5,8.0,64.0,59.0,55.0,-32.0,340.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2642.0,-49.5,8.0,68.0,63.0,70.0,-30.0,342.0,38.416,2636.0,0.0,0.0,89.0,12.0,1.0,5.6,0.0,61.9875,240548.0,2.0,136.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.317272,23.184966,113.322684,23.182381,,,,,,,
1,LMGGN1S58G1002897,1522265032461,20180329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,113.309244,23.03913,113.31463,23.036471,,,,,,,0.0
2,LMGGN1S50F1000527,1522265029698,20180329,4.0,0.5,341.0,0.0,1022.0,3.28,3.23,0.0,36.6,32.0,34.0,30.0,,,,0.0,0.0,-0.5,4.0,53.0,48.0,51.0,0.0,338.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,4.0,54.0,49.0,54.0,1.0,340.0,0.0,0.0,0.0,0.0,69.0,15.0,0.0,8.7,0.0,0.0,394184.0,2.0,212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.265266,23.119091,113.270596,23.116411,,,,,,,
3,LMGGN1S5XH1005303,1522265037641,20180329,12.0,-5.0,360.0,0.0,1000.0,4.09,4.08,0.0,90.6,29.5,30.0,29.0,359.4,5.5,0.0,0.0,0.0,22.0,4.0,32.0,27.0,47.0,0.0,359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,4.0,36.0,31.0,48.0,0.0,360.0,,,,,,15.0,1.0,0.0,2.0,,,,22.0,42.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.362841,29.272144,120.367338,29.269618,,,48.0,0.0,0.0,0.0,0.0
4,LMGGN1S56G1002509,1522265020589,20180329,12.0,-5.0,347.0,0.0,1000.0,3.94,3.93,0.0,67.6,25.0,25.0,25.0,349.3,5.7,0.0,0.0,0.0,22.5,4.0,42.0,37.0,53.0,1.0,347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,4.0,42.0,37.0,53.0,0.0,347.0,,,,,,15.0,1.0,0.0,2.0,,,,284.0,12.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.408677,31.109888,121.413317,31.10799,,,50.0,0.0,0.0,0.0,0.0


In [12]:
# (rows, columns) of the part-00000 frame.
df.shape

(1949678, 85)

In [11]:
# Part 1 of the same export, read with the same options as part 0.
part1_path = '/gaei/gacrnd/data/csv/d=20180329/part-00001-81ada67b-bb15-4ab6-b9c9-2815b8e77d17.c000.csv'
part1 = pd.read_csv(part1_path, names=header, header=None, index_col=False)

In [13]:
# Preview part 1. The original referenced `temp`, which no cell in this
# notebook defines; the frame loaded just above is `part1`.
part1.head()

Unnamed: 0,VIN,TDATE,SDATE,BMS_BATTST,BMS_BATTCURR,BMS_BATTVOLT,BMS_INSULATIONST,BMS_INSULATIONRES,BMS_CELLVOLTMAX,BMS_CELLVOLTMIN,BMS_FAILURELVL,BMS_BATTSOC,BMS_BATTTEMPAVG,BMS_BATTTEMPMAX,BMS_BATTTEMPMIN,CCS_CHARGEVOLT,CCS_CHARGECUR,CCS_CHARGERSTARTST,VCU_SYSFAILMODE,MCU_FTM_ACTROTSPD,MCU_FTM_ACTTORQ,MCU_FTM_STMODE,MCU_FTM_MOTORACTTEMP,MCU_FTM_ROTORACTTEMP,MCU_FTM_INVERTERACTTEMP,MCU_FTM_ACTHV_CUR,MCU_FTM_ACTHV_VOLT,MCU_FTM_FAULT_INFO1,MCU_FTM_FAULT_INFO2,MCU_DCDC_FAILST,MCU_DCDC_STMODE,VCU_DCDC_STMODELREQ,BMS_BAT_ERROR_SOC_L,BMS_BAT_ERROR_CELL_V_H,BMS_BAT_ERROR_CELL_V_L,BMS_BAT_ERROR_PACK_SUMV_H,BMS_BAT_ERROR_PACK_SUMV_L,BMS_BAT_ERROR_CELL_T_H,BMS_BAT_ERROR_T_UNBALANCE,MCU_GM_FAILST,MCU_FTM_FAILST,MCU_FTM_FAULT_INFO3,MCU_GM_ACTROTSPD,MCU_GM_ACTTORQ,MCU_GM_STMODE,MCU_GM_MOTORACTTEMP,MCU_GM_ROTORACTTEMP,MCU_GM_INVERTERACTTEMP,MCU_GM_ACTHV_CUR,MCU_GM_ACTHV_VOL,EMS_ENGTORQ,EMS_ENGSPD,EMS_ACCPEDALPST,EMS_BRAKEPEDALST,EMS_ENGWATERTEMP,HCU_GEARFORDSP,HCU_OILPRESSUREWARN,HCU_AVGFUELCONSUMP,HCU_BATCHRGDSP,BCS_VEHSPD,ICM_TOTALODOMETER,BCM_KEYST,HCU_DSTOIL,HCU_DSTBAT,EMS_FAULTRANKSIG,SRS_CRASHOUTPUTST,SRS_DRIVERSEATBELTST,EDC_STERRLVLCOM,EDC_STERRLVLCOMSUP,EDB_STERRLVLHVES,EDG_STERRLVLGEN,EDM_STERRLVLMOT,EDE_STERRLVLENG,EDV_STERRLVLVEH,LON84,LAT84,LON02,LAT02,BCS_ABSFAULTST,BCS_EBDFAULTST,MCU_DCDC_ACTTEMP,BMS_HVILST,HCU_HEVSYSREADYST,BMS_BALANCEST,GPS_HEADING
0,LMGGN1S52F1001453,1522256801019,20180329,4.0,86.0,311.0,0.0,1022.0,3.56,3.54,0.0,32.9,25.5,26.0,25.0,,,,0.0,1820.0,151.0,8.0,50.0,45.0,40.0,90.0,312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,54.0,49.0,40.0,90.0,312.0,0.0,0.0,47.432,0.0,112.0,12.0,0.0,5.2,0.0,23.79375,136431.0,2.0,182.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.345555,31.311777,121.350144,31.309917,0.0,0.0,43.0,0.0,1.0,0.0,268.0
1,LMGGN1S52F1001453,1522256781019,20180329,4.0,1.0,323.0,0.0,1000.0,3.69,3.67,0.0,33.4,25.5,26.0,25.0,,,,0.0,0.0,22.5,8.0,50.0,45.0,38.0,0.0,324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,55.0,50.0,39.0,0.0,324.0,0.0,0.0,0.0,1.0,108.0,12.0,0.0,5.2,0.0,0.0,136431.0,2.0,182.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.346233,31.311802,121.350823,31.309943,0.0,0.0,42.0,0.0,1.0,0.0,0.0
2,LMGGN1S55G1004316,1522256687824,20180329,12.0,-5.0,358.0,0.0,1000.0,4.07,4.06,0.0,87.0,24.0,24.0,24.0,360.4,5.5,0.0,0.0,0.0,23.0,4.0,36.0,31.0,38.0,0.0,359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,27.0,22.0,39.0,0.0,357.0,,,,,,15.0,1.0,0.0,2.0,,,,12.0,26.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.150136,40.042275,116.15639,40.043628,,,40.0,0.0,0.0,0.0,0.0
3,LMGGN1S52H1004839,1522256804848,20180329,12.0,-5.5,357.0,0.0,1000.0,4.06,4.06,0.0,86.3,32.0,33.0,31.0,358.8,5.5,0.0,0.0,0.0,23.0,4.0,32.0,27.0,47.0,0.0,356.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,37.0,32.0,49.0,0.0,357.0,,,,,,15.0,1.0,0.0,2.0,,,,20.0,39.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.606974,28.239761,116.611643,28.23627,,,45.0,0.0,0.0,0.0,0.0
4,LMGGN1S55G1001996,1522256692272,20180329,12.0,-5.5,341.0,0.0,1000.0,3.89,3.88,0.0,60.1,24.5,25.0,24.0,346.1,5.7,0.0,0.0,0.0,22.5,4.0,45.0,40.0,53.0,0.0,343.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,4.0,53.0,48.0,55.0,0.0,342.0,,,,,,15.0,1.0,0.0,2.0,,,,10.0,21.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106.471038,29.689441,106.474978,29.686848,,,53.0,0.0,0.0,0.0,0.0


In [14]:
# (rows, columns) of part 1. The original referenced the undefined name `temp`.
part1.shape

(1949676, 85)

In [15]:
# Append part 1 below the rows already in df. Row labels are kept as-is,
# and df is rebound, so re-running this cell appends part1 again.
df = pd.concat(objs=[df, part1], axis=0)

In [16]:
# Shape after appending part1.
df.shape

(3899354, 85)

In [17]:
# Part 2 of the same export, read with the same options as the earlier parts.
part2_path = '/gaei/gacrnd/data/csv/d=20180329/part-00002-81ada67b-bb15-4ab6-b9c9-2815b8e77d17.c000.csv'
part2 = pd.read_csv(part2_path, names=header, header=None, index_col=False)

In [18]:
# (rows, columns) of part 2.
part2.shape

(1949676, 85)

In [19]:
# Append part 2 below the rows already in df. Row labels are kept as-is,
# and df is rebound, so re-running this cell appends part2 again.
df = pd.concat(objs=[df, part2], axis=0)

In [20]:
# Shape after appending part2.
df.shape

(5849030, 85)

In [21]:
# Part 3 of the same export, read with the same options as the earlier parts.
part3_path = '/gaei/gacrnd/data/csv/d=20180329/part-00003-81ada67b-bb15-4ab6-b9c9-2815b8e77d17.c000.csv'
part3 = pd.read_csv(part3_path, names=header, header=None, index_col=False)

In [22]:
# (rows, columns) of part 3.
part3.shape

(1949679, 85)

In [23]:
# Append part 3 below the rows already in df. Row labels are kept as-is,
# and df is rebound, so re-running this cell appends part3 again.
df = pd.concat(objs=[df, part3], axis=0)

In [24]:
# Shape after appending part3.
df.shape

(7798709, 85)

In [25]:
# Index range, column dtypes and memory summary of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7798709 entries, 0 to 1949678
Data columns (total 85 columns):
VIN                          object
TDATE                        int64
SDATE                        int64
BMS_BATTST                   float64
BMS_BATTCURR                 float64
BMS_BATTVOLT                 float64
BMS_INSULATIONST             float64
BMS_INSULATIONRES            float64
BMS_CELLVOLTMAX              float64
BMS_CELLVOLTMIN              float64
BMS_FAILURELVL               float64
BMS_BATTSOC                  float64
BMS_BATTTEMPAVG              float64
BMS_BATTTEMPMAX              float64
BMS_BATTTEMPMIN              float64
CCS_CHARGEVOLT               float64
CCS_CHARGECUR                float64
CCS_CHARGERSTARTST           float64
VCU_SYSFAILMODE              float64
MCU_FTM_ACTROTSPD            float64
MCU_FTM_ACTTORQ              float64
MCU_FTM_STMODE               float64
MCU_FTM_MOTORACTTEMP         float64
MCU_FTM_ROTORACTTEMP         float64
MCU_

In [26]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,VIN,TDATE,SDATE,BMS_BATTST,BMS_BATTCURR,BMS_BATTVOLT,BMS_INSULATIONST,BMS_INSULATIONRES,BMS_CELLVOLTMAX,BMS_CELLVOLTMIN,BMS_FAILURELVL,BMS_BATTSOC,BMS_BATTTEMPAVG,BMS_BATTTEMPMAX,BMS_BATTTEMPMIN,CCS_CHARGEVOLT,CCS_CHARGECUR,CCS_CHARGERSTARTST,VCU_SYSFAILMODE,MCU_FTM_ACTROTSPD,MCU_FTM_ACTTORQ,MCU_FTM_STMODE,MCU_FTM_MOTORACTTEMP,MCU_FTM_ROTORACTTEMP,MCU_FTM_INVERTERACTTEMP,MCU_FTM_ACTHV_CUR,MCU_FTM_ACTHV_VOLT,MCU_FTM_FAULT_INFO1,MCU_FTM_FAULT_INFO2,MCU_DCDC_FAILST,MCU_DCDC_STMODE,VCU_DCDC_STMODELREQ,BMS_BAT_ERROR_SOC_L,BMS_BAT_ERROR_CELL_V_H,BMS_BAT_ERROR_CELL_V_L,BMS_BAT_ERROR_PACK_SUMV_H,BMS_BAT_ERROR_PACK_SUMV_L,BMS_BAT_ERROR_CELL_T_H,BMS_BAT_ERROR_T_UNBALANCE,MCU_GM_FAILST,MCU_FTM_FAILST,MCU_FTM_FAULT_INFO3,MCU_GM_ACTROTSPD,MCU_GM_ACTTORQ,MCU_GM_STMODE,MCU_GM_MOTORACTTEMP,MCU_GM_ROTORACTTEMP,MCU_GM_INVERTERACTTEMP,MCU_GM_ACTHV_CUR,MCU_GM_ACTHV_VOL,EMS_ENGTORQ,EMS_ENGSPD,EMS_ACCPEDALPST,EMS_BRAKEPEDALST,EMS_ENGWATERTEMP,HCU_GEARFORDSP,HCU_OILPRESSUREWARN,HCU_AVGFUELCONSUMP,HCU_BATCHRGDSP,BCS_VEHSPD,ICM_TOTALODOMETER,BCM_KEYST,HCU_DSTOIL,HCU_DSTBAT,EMS_FAULTRANKSIG,SRS_CRASHOUTPUTST,SRS_DRIVERSEATBELTST,EDC_STERRLVLCOM,EDC_STERRLVLCOMSUP,EDB_STERRLVLHVES,EDG_STERRLVLGEN,EDM_STERRLVLMOT,EDE_STERRLVLENG,EDV_STERRLVLVEH,LON84,LAT84,LON02,LAT02,BCS_ABSFAULTST,BCS_EBDFAULTST,MCU_DCDC_ACTTEMP,BMS_HVILST,HCU_HEVSYSREADYST,BMS_BALANCEST,GPS_HEADING
0,LMGGN1S58F1000453,1522265026587,20180329,4.0,-31.0,336.0,0.0,1022.0,3.24,3.2,0.0,26.2,33.5,37.0,30.0,,,,0.0,4694.0,3.5,8.0,64.0,59.0,55.0,-32.0,340.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2642.0,-49.5,8.0,68.0,63.0,70.0,-30.0,342.0,38.416,2636.0,0.0,0.0,89.0,12.0,1.0,5.6,0.0,61.9875,240548.0,2.0,136.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.317272,23.184966,113.322684,23.182381,,,,,,,
1,LMGGN1S58G1002897,1522265032461,20180329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,113.309244,23.03913,113.31463,23.036471,,,,,,,0.0
2,LMGGN1S50F1000527,1522265029698,20180329,4.0,0.5,341.0,0.0,1022.0,3.28,3.23,0.0,36.6,32.0,34.0,30.0,,,,0.0,0.0,-0.5,4.0,53.0,48.0,51.0,0.0,338.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,4.0,54.0,49.0,54.0,1.0,340.0,0.0,0.0,0.0,0.0,69.0,15.0,0.0,8.7,0.0,0.0,394184.0,2.0,212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.265266,23.119091,113.270596,23.116411,,,,,,,
3,LMGGN1S5XH1005303,1522265037641,20180329,12.0,-5.0,360.0,0.0,1000.0,4.09,4.08,0.0,90.6,29.5,30.0,29.0,359.4,5.5,0.0,0.0,0.0,22.0,4.0,32.0,27.0,47.0,0.0,359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,4.0,36.0,31.0,48.0,0.0,360.0,,,,,,15.0,1.0,0.0,2.0,,,,22.0,42.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.362841,29.272144,120.367338,29.269618,,,48.0,0.0,0.0,0.0,0.0
4,LMGGN1S56G1002509,1522265020589,20180329,12.0,-5.0,347.0,0.0,1000.0,3.94,3.93,0.0,67.6,25.0,25.0,25.0,349.3,5.7,0.0,0.0,0.0,22.5,4.0,42.0,37.0,53.0,1.0,347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,4.0,42.0,37.0,53.0,0.0,347.0,,,,,,15.0,1.0,0.0,2.0,,,,284.0,12.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.408677,31.109888,121.413317,31.10799,,,50.0,0.0,0.0,0.0,0.0


In [27]:
# (rows, columns) of the combined frame.
df.shape

(7798709, 85)

In [31]:
# Keep only the columns used further down: identity, timestamps,
# odometer, the two charging-related signals, and the LON84/LAT84 position.
keep_cols = [
    'VIN', 'TDATE', 'SDATE',
    'ICM_TOTALODOMETER', 'CCS_CHARGECUR', 'HCU_BATCHRGDSP',
    'LON84', 'LAT84',
]
df = df[keep_cols]

In [33]:
# Preview the reduced frame.
df.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84
0,LMGGN1S58F1000453,1522265026587,20180329,240548.0,,0.0,113.317272,23.184966
1,LMGGN1S58G1002897,1522265032461,20180329,,,,113.309244,23.03913
2,LMGGN1S50F1000527,1522265029698,20180329,394184.0,,0.0,113.265266,23.119091
3,LMGGN1S5XH1005303,1522265037641,20180329,,5.5,2.0,120.362841,29.272144
4,LMGGN1S56G1002509,1522265020589,20180329,,5.7,2.0,121.408677,31.109888


In [34]:
# Row masks that are AND-ed together in the next cell.
# filter1: odometer reading is strictly positive (missing readings compare False).
filter1 = df['ICM_TOTALODOMETER'].gt(0)
# filter2: charge current below 0.1 or not reported.
charge_cur = df['CCS_CHARGECUR']
filter2 = charge_cur.lt(0.1) | charge_cur.isna()
# filter3: HCU_BATCHRGDSP equals 0 or is not reported
# (presumably "not charging" -- confirm against the signal spec).
chrg_dsp = df['HCU_BATCHRGDSP']
filter3 = chrg_dsp.eq(0) | chrg_dsp.isna()

In [39]:
# A row is kept only when all three masks hold.
filter_driving = (filter1 & filter2) & filter3
# Independent copy, so later column assignments do not touch df.
df_drive = df[filter_driving].copy()

In [40]:
# (rows, columns) left after applying filter_driving.
df_drive.shape

(5715739, 8)

In [41]:
# Preview the filtered rows.
df_drive.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84
0,LMGGN1S58F1000453,1522265026587,20180329,240548.0,,0.0,113.317272,23.184966
2,LMGGN1S50F1000527,1522265029698,20180329,394184.0,,0.0,113.265266,23.119091
7,LMGGN1S50G1001467,1522265045100,20180329,132453.0,,0.0,121.443705,31.241013
9,LMGGN1S59H1005096,1522265035914,20180329,40008.0,,0.0,121.576077,31.251716
11,LMGGN1S53G1004721,1522265054580,20180329,9132.0,,0.0,113.400308,23.110388


In [42]:
# TDATE is parsed as epoch milliseconds in UTC, then shifted to
# Asia/Hong_Kong time.
tdate_utc = pd.to_datetime(df_drive['TDATE'], unit='ms', utc=True)
df_drive['TDATE'] = tdate_utc.dt.tz_convert('Asia/Hong_Kong')

In [44]:
# Order all driving records chronologically (ascending TDATE).
df_drive = df_drive.sort_values('TDATE')

In [45]:
# Earliest records after sorting by TDATE.
df_drive.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84
994629,LMGGN1S5XG1003680,2018-03-29 00:00:00.005000+08:00,20180329,70509.0,,0.0,113.494125,23.215091
190047,LMGGN1S51G1002451,2018-03-29 00:00:00.009000+08:00,20180329,114142.0,,0.0,113.218269,23.162391
374178,LMGGN1S53F1000361,2018-03-29 00:00:00.016000+08:00,20180329,235081.0,,0.0,114.39375,30.51018
221403,LMGGN1S51H1005108,2018-03-29 00:00:00.028000+08:00,20180329,44075.0,,0.0,114.027413,22.652313
1481714,LMGGN1S52H1005652,2018-03-29 00:00:00.039000+08:00,20180329,5953.0,,0.0,120.156647,30.264588


In [46]:
# Latest records after sorting by TDATE.
df_drive.tail()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84
444297,LMGGN1S52G1001485,2018-03-29 23:59:59.894000+08:00,20180329,108852.0,,0.0,113.289205,23.094191
444284,LMGGN1S5XG1002190,2018-03-29 23:59:59.910000+08:00,20180329,58597.0,,0.0,121.738838,31.209236
933792,LMGGN1S56G1001473,2018-03-29 23:59:59.925000+08:00,20180329,87690.0,,0.0,121.471391,31.272191
1909218,LMGGN1S52G1001728,2018-03-29 23:59:59.971000+08:00,20180329,69525.0,,0.0,121.448441,31.141469
1908936,LMGGN1S52G1004600,2018-03-29 23:59:59.979000+08:00,20180329,9220.0,,0.0,113.286225,22.910636


In [47]:
# Work on one vehicle only; copy so column edits stay local to df_single.
is_target_vin = df_drive['VIN'].eq('LMGGN1S51F1000326')
df_single = df_drive.loc[is_target_vin].copy()

In [49]:
# Chronological order is required by the diff() computed further down.
df_single = df_single.sort_values('TDATE')

In [59]:
# Discard records that lack either coordinate.
position_cols = ['LAT84', 'LON84']
df_single = df_single.dropna(axis=0, how='any', subset=position_cols)

In [60]:
# Renumber rows 0..n-1 (old labels discarded); later cells use the index
# labels as row positions.
df_single = df_single.reset_index(level=None, drop=True)

In [61]:
# Last rows of the single-vehicle frame.
# NOTE(review): the saved output already shows a TDIFF column, which is only
# created in a later cell -- this cell appears to have been run out of order.
df_single.tail()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84,TDIFF
725,LMGGN1S51F1000326,2018-03-29 19:25:51.253000+08:00,20180329,67238.0,,0.0,113.200405,23.151766,10
726,LMGGN1S51F1000326,2018-03-29 19:26:01.253000+08:00,20180329,67238.0,,0.0,113.200394,23.15185,10
727,LMGGN1S51F1000326,2018-03-29 19:26:02.021000+08:00,20180329,67238.0,,0.0,113.199861,23.151977,0
728,LMGGN1S51F1000326,2018-03-29 19:26:12.021000+08:00,20180329,67238.0,,0.0,113.19923,23.151825,10
729,LMGGN1S51F1000326,2018-03-29 19:26:22.021000+08:00,20180329,67238.0,,0.0,113.199358,23.151494,10


In [62]:
from datetime import timedelta

In [63]:
# Time elapsed since the previous record; the first row has no predecessor
# and is given a 10 second gap.
gap_to_prev = df_single['TDATE'].diff()
df_single['TDIFF'] = gap_to_prev.fillna(timedelta(seconds=10))

In [64]:
# Preview with TDIFF still stored as a timedelta.
df_single.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84,TDIFF
0,LMGGN1S51F1000326,2018-03-29 07:21:55.726000+08:00,20180329,67170.0,,0.0,113.202744,23.151836,00:00:10
1,LMGGN1S51F1000326,2018-03-29 07:22:05.726000+08:00,20180329,67170.0,,0.0,113.202763,23.151841,00:00:10
2,LMGGN1S51F1000326,2018-03-29 07:22:15.726000+08:00,20180329,67170.0,,0.0,113.202775,23.151841,00:00:10
3,LMGGN1S51F1000326,2018-03-29 07:22:25.950000+08:00,20180329,67170.0,,0.0,113.202783,23.151844,00:00:10.224000
4,LMGGN1S51F1000326,2018-03-29 07:22:35.950000+08:00,20180329,67170.0,,0.0,113.202788,23.151855,00:00:10


In [65]:
# Convert the gap to whole seconds (fraction truncated).
# total_seconds() is used instead of Timedelta.seconds because .seconds
# drops the days component, so a gap of one day or more would wrap around
# and look like a short gap.
df_single['TDIFF'] = df_single['TDIFF'].dt.total_seconds().astype(int)
df_single.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84,TDIFF
0,LMGGN1S51F1000326,2018-03-29 07:21:55.726000+08:00,20180329,67170.0,,0.0,113.202744,23.151836,10
1,LMGGN1S51F1000326,2018-03-29 07:22:05.726000+08:00,20180329,67170.0,,0.0,113.202763,23.151841,10
2,LMGGN1S51F1000326,2018-03-29 07:22:15.726000+08:00,20180329,67170.0,,0.0,113.202775,23.151841,10
3,LMGGN1S51F1000326,2018-03-29 07:22:25.950000+08:00,20180329,67170.0,,0.0,113.202783,23.151844,10
4,LMGGN1S51F1000326,2018-03-29 07:22:35.950000+08:00,20180329,67170.0,,0.0,113.202788,23.151855,10


In [66]:
# Rows preceded by a gap longer than ten minutes (TDIFF is in seconds).
max_gap_seconds = int(10 * 60)
long_gap = df_single['TDIFF'] > max_gap_seconds
indices = df_single.index[long_gap].tolist()

In [67]:
# Row labels where the gap to the previous record exceeds the threshold.
indices

[264, 273, 382, 725]

In [68]:
# Turn the gap rows into half-open [lo, hi) segment boundaries.
n_rows = df_single.shape[0]
# Guarded so that re-running the cell does not insert a second leading 0.
if not indices or indices[0] != 0:
    indices.insert(0, 0)
# Always close the last segment at n_rows. The previous `< n_rows - 1` test
# skipped this when the final row itself started a new trip (or the frame
# had a single row), which dropped that row from the result.
if indices[-1] < n_rows:
    indices.append(n_rows)

In [69]:
# Segment boundaries: starts at 0 and ends at the row count.
indices

[0, 264, 273, 382, 725, 730]

In [72]:
# Replace the timestamps with their string form. This must run after the
# TDIFF cell, because diff() there needs real datetime values.
df_single['TDATE'] = df_single['TDATE'].astype(str)

In [75]:
# Check the Python type of the first TDATE value after the conversion.
type(df_single['TDATE'].iloc[0])

str

In [70]:
import json

In [79]:
# Preview the single-vehicle frame before trip labelling.
df_single.head()

Unnamed: 0,VIN,TDATE,SDATE,ICM_TOTALODOMETER,CCS_CHARGECUR,HCU_BATCHRGDSP,LON84,LAT84,TDIFF
0,LMGGN1S51F1000326,2018-03-29 07:21:55.726000+08:00,20180329,67170.0,,0.0,113.202744,23.151836,10
1,LMGGN1S51F1000326,2018-03-29 07:22:05.726000+08:00,20180329,67170.0,,0.0,113.202763,23.151841,10
2,LMGGN1S51F1000326,2018-03-29 07:22:15.726000+08:00,20180329,67170.0,,0.0,113.202775,23.151841,10
3,LMGGN1S51F1000326,2018-03-29 07:22:25.950000+08:00,20180329,67170.0,,0.0,113.202783,23.151844,10
4,LMGGN1S51F1000326,2018-03-29 07:22:35.950000+08:00,20180329,67170.0,,0.0,113.202788,23.151855,10


In [91]:
# Label every row with the number of the segment it falls in.
# Consecutive entries of `indices` are half-open [lo, hi) row ranges.
# All segments are built first and concatenated once, rather than growing
# `res` with a concat per iteration starting from an empty frame.
trip_segments = [
    df_single.iloc[lo:hi, :].assign(trip=trip_no)
    for trip_no, (lo, hi) in enumerate(zip(indices[:-1], indices[1:]))
]
# With fewer than two boundaries there is nothing to label; keep the
# original empty-frame result in that case.
res = pd.concat(trip_segments) if trip_segments else pd.DataFrame()

In [92]:
# Distinct trip numbers that were assigned.
res['trip'].unique()

array([0, 1, 2, 3, 4])

In [93]:
# Keep only the columns that are exported, in export order.
export_cols = ['VIN', 'TDATE', 'LAT84', 'LON84', 'trip']
res = res.loc[:, export_cols].copy()
res.head()

Unnamed: 0,VIN,TDATE,LAT84,LON84,trip
0,LMGGN1S51F1000326,2018-03-29 07:21:55.726000+08:00,23.151836,113.202744,0
1,LMGGN1S51F1000326,2018-03-29 07:22:05.726000+08:00,23.151841,113.202763,0
2,LMGGN1S51F1000326,2018-03-29 07:22:15.726000+08:00,23.151841,113.202775,0
3,LMGGN1S51F1000326,2018-03-29 07:22:25.950000+08:00,23.151844,113.202783,0
4,LMGGN1S51F1000326,2018-03-29 07:22:35.950000+08:00,23.151855,113.202788,0


In [94]:
# Drop everything from the first '+' onwards, i.e. the trailing UTC offset
# of the string timestamps.
res['TDATE'] = res['TDATE'].map(lambda stamp: stamp.partition('+')[0])
res.head()

Unnamed: 0,VIN,TDATE,LAT84,LON84,trip
0,LMGGN1S51F1000326,2018-03-29 07:21:55.726000,23.151836,113.202744,0
1,LMGGN1S51F1000326,2018-03-29 07:22:05.726000,23.151841,113.202763,0
2,LMGGN1S51F1000326,2018-03-29 07:22:15.726000,23.151841,113.202775,0
3,LMGGN1S51F1000326,2018-03-29 07:22:25.950000,23.151844,113.202783,0
4,LMGGN1S51F1000326,2018-03-29 07:22:35.950000,23.151855,113.202788,0


In [95]:
# (rows, columns) of the table about to be exported.
res.shape

(730, 5)

In [96]:
# Write the labelled trips to res.csv in the working directory, without the row index.
res.to_csv(path_or_buf='res.csv', index=False)