In [1]:
import pandas as pd
import numpy as np

In [2]:
car_df = pd.read_csv("./archive/v2.csv")
car_df.shape

  car_df = pd.read_csv("./archive/v2.csv")


(3120272, 17)

In [3]:
car_df.head()

Unnamed: 0,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
0,1,0.0,2017-12-22 18:43:05,10c0f8e00448fa18c80515d30000000000000000000000...,24.2612,0.0,66.0,0.0,28.6275,40.0,97.0,0.0,0.0,1010.75,23.0,0.0,0.0
1,1,0.0,2017-12-22 18:43:06,1138f8c804780a1ebdf718bcf919d10617c8e301b31017...,23.15,0.0,66.0,0.0,33.7255,40.0,98.0,0.0,0.0,815.5,21.0,0.0,0.0
2,1,0.0,2017-12-22 18:43:07,10f0f89804480612c30010c30714ce0520b7f41dbdf118...,18.7052,0.0,66.0,0.0,43.1373,40.0,98.0,0.0,0.0,862.25,17.0,0.0,0.0
3,1,0.0,2017-12-22 18:43:08,10d0f84804480d15bd0210c9f822c80017caf81ccd0517...,16.4828,0.0,66.0,0.0,41.5686,40.0,97.0,0.0,0.0,817.0,17.0,0.0,0.0
4,1,0.0,2017-12-22 18:43:09,1090f8c80480041dc9081cc50815c60511c60112c40514...,17.4088,0.0,66.0,0.0,43.1373,40.0,97.0,0.0,0.0,804.25,15.0,0.0,0.0


### Rename raw columns to improve readability

In [4]:
column_rename_dict = {
    'tripID': 'trip_id',
    'deviceID': 'device_id',
    'timeStamp': 'timestamp',
    'accData': 'accelerometer',
    'battery': 'battery_volt',
    'cTemp': 'coolant_temp',
    'dtc': 'diagnostic_trouble_code',
    'eLoad': 'engine_load',
    'iat': 'intake_air_temp',
    'maf': 'air_mass_rate',
    'rpm': 'revolutions_per_min',
    'tAdv': 'timing_advance',
    'tPos': 'throttle_position',
}

car_df = car_df.rename(columns=column_rename_dict)

### Data Dictionary

**Basic Information**

| Column | Description |
| :--- | :----------- |
| device_id | A device id is a car. |
| trip_id | One id is 1 trip. A trip begins when car engine is switched on and ends when the engine is switched off. |
| timestamp | Data collection time. |
| battery_volt | The battery voltage corresponds to voltage of the battery installed in Car, which supplies electrical energy to a motor vehicle. |
| diagnostic_trouble_code | Number of diagnostic trouble codes. DTC's, or Diagnostic Trouble Codes, are used by automobile manufacturers to diagnose problems related to the vehicle. |
| accelerometer | Accelerometer and Magnetometer sensor data. The data is collected from the OBD device. Values are in terms of G-force. The data is across X, Y, Z axis where X-axis is horizontal, Y-axis is vertical, and Z-axis is the direction of movement of the car. |


**Speed related sensors**

| Column | Description |
| :--- | :----------- |
| gps_speed | The speed in kmph (kilometers per hour) as noted from GPS sensor. |
| speed | Speed data as collected from OBD device mounted in the car. |
| kpl | KMPL is mileage in kilometres per litre. It is a derived metric derived from speed and fuel to air mass flow ratio. This ratio is constant in case of Petrol cars while changes for other Fuel types. Hence, the KMPL value is accurate for petrol cars, and contain some error in case of other fuel types. |
| revolutions_per_min | engine RPM. The number of turns in one minute. |


**Temperature related sensors**

| Column | Description |
| :--- | :----------- |
| coolant_temp | The Temperature of the engine coolant of an internal combustion engine. The normal operating temperature for most engines is in a range of 90 to 104 degree Celsius (195 to 220 degrees Fahrenheit). |
| intake_air_temp | The Intake Air Temperature sensor (IAT) has been utilised as an Engine Control Unit (ECU) input signal, as a requirement for calculating the Air Mass volume for the incoming air charge. This is, to assist in determining the correct engine fuel requirement to suit the operating air temperature. |
| timing_advance | Timing advance refers to the number of degrees before top dead center (BTDC) that the spark will ignite the air-fuel mixture in the combustion chamber during the compression stroke. |


**Air mass flow related sensors**

| Column | Description |
| :--- | :----------- |
| imap | The manifold absolute pressure sensor (MAP sensor) is one of the sensors used in an internal combustion engine's electronic control system. The MAP sensor sensor provides instantaneous manifold pressure information to the engine's electronic control unit (ECU). The data is used to calculate air density and determine the engine's air mass flow rate, which in turn determines the required fuel metering for optimum combustion (see stoichiometry) and influence the advance or retard of ignition timing. |
| engine_load | Engine load measures how much air (and fuel) you're sucking into the engine and then compares that value to the theoretical maximum. |
| throttle_position | A throttle position sensor (TPS) is a sensor used to monitor the air intake of an engine. |
| air_mass_rate | A mass (air) flow sensor (MAF) is used to find out the mass flow rate of air entering a fuel-injected internal combustion engine. The air mass information is necessary for the engine control unit (ECU) to balance and deliver the correct fuel mass to the engine. |

### Noisy points

In [5]:
noise_index = car_df["device_id"] == "deviceID"
car_df[noise_index].head()

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,imap,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position
106461,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
376682,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
630522,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
631504,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
676250,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos


In [6]:
# Remove noise
car_df = car_df[~noise_index]
print("Data Size after removing noise: {}".format(car_df.shape))

Data Size after removing noise: (3120240, 17)


### Data type casting

In [7]:
first_datatype_casting = {
    'device_id': np.float32,
    'trip_id': np.float32,
    'timestamp': "datetime64[ns]",
    'battery_volt': np.float32,
    'diagnostic_trouble_code': np.float32,
    'accelerometer': str,
    'gps_speed': np.float32,
    'speed': np.float32,
    'kpl': np.float32,
    'revolutions_per_min': np.float32,
    'coolant_temp': np.float32,
    'intake_air_temp': np.float32,
    'timing_advance': np.float32,
    'imap': np.float32,
    'engine_load': np.float32,
    'throttle_position': np.float32,
    'air_mass_rate': np.float32,
}

# Cast integers for memory efficiency
second_datatype_casting = {
    'device_id': np.int8,
    'trip_id': np.int8,
    'diagnostic_trouble_code': np.int8,
    'speed': np.int8,
    'coolant_temp': np.int8,
    'intake_air_temp': np.int8,
    'imap': np.int8,
}

car_df = car_df.astype(first_datatype_casting).astype(second_datatype_casting)
car_df.dtypes

trip_id                              int8
device_id                            int8
timestamp                  datetime64[ns]
accelerometer                      object
gps_speed                         float32
battery_volt                      float32
coolant_temp                         int8
diagnostic_trouble_code              int8
engine_load                       float32
intake_air_temp                      int8
imap                                 int8
kpl                               float32
air_mass_rate                     float32
revolutions_per_min               float32
speed                                int8
timing_advance                    float32
throttle_position                 float32
dtype: object

### Duplicates
Duplicate signals might be useful.

In [8]:
print("Duplicate Rate: {:,.3f}%".format(np.sum(car_df.duplicated()) / car_df.shape[0] * 100))

# Drop all duplicates
car_df = car_df.drop_duplicates()
print("Data Size after removing duplicates: {}".format(car_df.shape))

Duplicate Rate: 26.543%
Data Size after removing duplicates: (2292031, 17)


### Missing Values

In [9]:
car_df.isnull().values.any()

False

### Timestamp Range

In [10]:
car_df['timestamp'].min(), car_df['timestamp'].max()

(Timestamp('2017-11-18 16:23:30'), Timestamp('2018-01-31 23:18:50'))

### Add day driving / night drive labels 

In [11]:
car_df['timestamp'] = pd.to_datetime(car_df['timestamp'])

In [12]:
night_driving_index = (car_df['timestamp'].dt.hour >= 23) | (car_df['timestamp'].dt.hour <= 4)
day_driving_index = ~night_driving_index

print("night driving pct: {:,.3f}%\nday driving pct: {:,.3f}%".format(
    sum(night_driving_index) / car_df.shape[0] * 100,
    sum(day_driving_index) / car_df.shape[0] * 100))

car_df.loc[night_driving_index, 'night_driving'] = 1
car_df.loc[day_driving_index, 'night_driving'] = 0

night driving pct: 0.782%
day driving pct: 99.218%


In [13]:
car_df['device_id'].value_counts()

12    518906
10    484612
9     312916
5     267328
0     184242
7     172654
3     147706
16     83043
8      43185
2      27155
6      21224
1      10272
4       9878
11      7929
14       981
Name: device_id, dtype: int64

### Check signal frequency for each sensor
In over 99% of the time, each sensor had only 1 unique signal. It can be considered as 1 Hz.

This means 1 timestamp can be considered as 1 time step.
It requires signal preprocessing to only keep the last signal for each timestamp.

In [14]:
# Calculate the signal frequency of each sensor (number of unique values per timestamp)
for c in ['accelerometer', 'gps_speed',
       'battery_volt', 'coolant_temp', 'diagnostic_trouble_code',
       'engine_load', 'intake_air_temp', 'imap', 'kpl', 'air_mass_rate',
       'revolutions_per_min', 'speed', 'timing_advance', 'throttle_position',]:
    
    signal_frequency = car_df[['device_id', 'trip_id', 'timestamp', c]]\
        .drop_duplicates().groupby(['device_id', 'trip_id', 'timestamp']).count()

    print("{} in {:,.3f}% time had 1 signal per second (1 Hz)".format(
        c, 
        signal_frequency[signal_frequency[c] == 1].shape[0] / signal_frequency.shape[0] * 100))

accelerometer in 99.226% time had 1 signal per second (1 Hz)
gps_speed in 100.000% time had 1 signal per second (1 Hz)
battery_volt in 99.934% time had 1 signal per second (1 Hz)
coolant_temp in 99.960% time had 1 signal per second (1 Hz)
diagnostic_trouble_code in 100.000% time had 1 signal per second (1 Hz)
engine_load in 99.651% time had 1 signal per second (1 Hz)
intake_air_temp in 99.972% time had 1 signal per second (1 Hz)
imap in 99.757% time had 1 signal per second (1 Hz)
kpl in 99.697% time had 1 signal per second (1 Hz)
air_mass_rate in 99.691% time had 1 signal per second (1 Hz)
revolutions_per_min in 99.625% time had 1 signal per second (1 Hz)
speed in 99.745% time had 1 signal per second (1 Hz)
timing_advance in 99.934% time had 1 signal per second (1 Hz)
throttle_position in 99.879% time had 1 signal per second (1 Hz)


### Remove invalid accelerometer code
Accelerometer code should have 162 characters.  

In [15]:
car_df['accelerometer'].str.len().value_counts()

162    1964156
1       327875
Name: accelerometer, dtype: int64

In [16]:
car_df = car_df[car_df['accelerometer'].str.len() == 162]
car_df.shape

(1964156, 18)

### Decode Accelerometer Hex code

In [17]:
from decoding_acc import decode_acc_hex

decode_acc_hex_df = decode_acc_hex(car_df['accelerometer'], car_df['speed'])
car_df.loc[:, 'acc_x'] = decode_acc_hex_df['acc_x'].values
car_df.loc[:, 'acc_y'] = decode_acc_hex_df['acc_y'].values
car_df.loc[:, 'acc_z'] = decode_acc_hex_df['acc_z'].values

100%|██████████████████████████████████████████████████████████████████████████| 1964156/1964156 [02:50<00:00, 11509.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 638.73it/s]


In [18]:
car_df[car_df['acc_x'].isnull()]

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,...,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position,night_driving,acc_x,acc_y,acc_z


In [19]:
car_df

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,...,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position,night_driving,acc_x,acc_y,acc_z
0,1,0,2017-12-22 18:43:05,10c0f8e00448fa18c80515d30000000000000000000000...,24.2612,0.0,66,0,28.627501,40,...,0.0,0.0,1010.75,23,0.0,0.0,0.0,0.001720,0.005934,-0.068833
1,1,0,2017-12-22 18:43:06,1138f8c804780a1ebdf718bcf919d10617c8e301b31017...,23.1500,0.0,66,0,33.725498,40,...,0.0,0.0,815.50,21,0.0,0.0,0.0,-0.056164,-0.004535,-1.083196
2,1,0,2017-12-22 18:43:07,10f0f89804480612c30010c30714ce0520b7f41dbdf118...,18.7052,0.0,66,0,43.137299,40,...,0.0,0.0,862.25,17,0.0,0.0,0.0,0.005041,0.067211,-1.015935
3,1,0,2017-12-22 18:43:08,10d0f84804480d15bd0210c9f822c80017caf81ccd0517...,16.4828,0.0,66,0,41.568600,40,...,0.0,0.0,817.00,17,0.0,0.0,0.0,0.037502,-0.011481,-1.003691
4,1,0,2017-12-22 18:43:09,1090f8c80480041dc9081cc50815c60511c60112c40514...,17.4088,0.0,66,0,43.137299,40,...,0.0,0.0,804.25,15,0.0,0.0,0.0,0.073495,-0.047171,-1.000528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3120267,-101,9,2018-01-31 20:34:26,07b80478fe28d8dfdcd8deded7dddcd8dddcd8dedcd8de...,5.0004,0.0,0,0,0.000000,0,...,0.0,0.0,0.00,0,0.0,0.0,0.0,-0.800208,-0.446101,-0.358420
3120268,-101,9,2018-01-31 20:34:27,07700488fe30d9deddd9deddd9dedcd8deddd9dedcd8df...,5.0004,0.0,0,0,0.000000,0,...,0.0,0.0,0.00,0,0.0,0.0,0.0,-0.799571,-0.442405,-0.364975
3120269,-101,9,2018-01-31 20:34:28,08000488fdf8d9dedbd7dfdcd8dddcd9dddcd9dcddd9de...,5.0004,0.0,0,0,0.000000,0,...,0.0,0.0,0.00,0,0.0,0.0,0.0,-0.802094,-0.447758,-0.359002
3120270,-101,9,2018-01-31 20:34:29,07d004a0fe58d9deddd9dedbd8dedcd9dfdbd8dfdcdade...,5.0004,0.0,0,0,0.000000,0,...,0.0,0.0,0.00,0,0.0,0.0,0.0,-0.807179,-0.443288,-0.357273


In [20]:
car_df[car_df.duplicated()]

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,...,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position,night_driving,acc_x,acc_y,acc_z


# trip_id can be negative!!!!

In [23]:
trip_ids = car_df['trip_id'].unique()

for i in trip_ids:
    if i < 0 and (i*-1) not in trip_ids:
        print(i)

-128


In [24]:
car_df[car_df['trip_id'] < 0].shape[0] / car_df.shape[0]

0.27029930412859265

In [16]:
mappings = car_df[['device_id', 'trip_id']].reset_index().groupby(['device_id', 'trip_id']).count().reset_index()
mappings

Unnamed: 0,device_id,trip_id,index
0,0,1,3467
1,0,2,1630
2,0,3,1153
3,0,4,2054
4,0,5,5464
...,...,...,...
1475,16,39,498
1476,16,40,192
1477,16,41,379
1478,16,42,553


In [17]:
mappings[(mappings['trip_id'].isin([15, -15]))]

Unnamed: 0,device_id,trip_id,index
14,0,15,1864
114,2,15,192
189,3,15,664
425,5,-15,637
455,5,15,864
582,6,15,960
601,7,15,733
727,8,15,1561
797,9,15,3150
1023,10,-15,1646


In [18]:
trip_hours = car_df[(car_df['device_id'] == 5) & (car_df['trip_id'] == 15)]
trip_hours['hour'] = trip_hours['timestamp'].astype(str).str.slice(8, 10) + '_' + trip_hours['timestamp'].dt.hour.astype(str)
trip_hours['hour'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_hours['hour'] = trip_hours['timestamp'].astype(str).str.slice(8, 10) + '_' + trip_hours['timestamp'].dt.hour.astype(str)


19_17    657
02_10    206
02_8       1
Name: hour, dtype: int64

In [19]:
trip_hours[trip_hours["hour"] == '19_17']

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,imap,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position,night_driving,hour
2749006,15,5,2018-01-19 17:02:13,0b1000a80520fd113f0412440000000000000000000000...,20.742399,0.0,74,0,34.117599,47,115,3.1694,20.969999,1463.50,22,0.0,88.627502,0.0,19_17
2749007,15,5,2018-01-19 17:02:19,0b0800c005a0ff1247f70d4f00103109113806153b0714...,14.630800,0.0,75,0,1.176500,46,120,7.1250,12.720000,1138.75,30,0.0,88.627502,0.0,19_17
2749008,15,5,2018-01-19 17:02:20,0aa00080058007144004193bf61447020b3c021c310116...,11.297200,0.0,75,0,22.745100,46,110,7.1250,12.720000,1330.00,30,0.0,88.627502,0.0,19_17
2749009,15,5,2018-01-19 17:02:21,0ab800d80628fc1c37061a48070c41ff0c4dfe102c020a...,12.408400,0.0,75,0,0.000000,45,108,5.3914,16.809999,1238.00,30,0.0,88.627502,0.0,19_17
2749010,15,5,2018-01-19 17:02:22,0ae8015805f0000d44030c41021042030e46fe11440014...,10.926800,0.0,75,0,24.705900,45,105,5.8387,13.970000,1238.00,27,0.0,88.627502,0.0,19_17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2749662,15,5,2018-01-19 17:13:16,0d38fd8005e804133f03123e03154104143f04133e0313...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.000000,0.0,19_17
2749663,15,5,2018-01-19 17:13:17,0d40fd8805d803143f04133d04133f04133e04133f0413...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.000000,0.0,19_17
2749664,15,5,2018-01-19 17:13:18,0d48fdd805b804143f04143e03133e04133f03123f0314...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.000000,0.0,19_17
2749665,15,5,2018-01-19 17:13:19,0d18fdb805b804134002123e04143e04143e02123d0313...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.000000,0.0,19_17


In [20]:
trip_hours = car_df[(car_df['device_id'] == 5) & (car_df['trip_id'] == -15)]
trip_hours['hour'] = trip_hours['timestamp'].astype(str).str.slice(8, 10) + '_' + trip_hours['timestamp'].dt.hour.astype(str)
trip_hours

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_hours['hour'] = trip_hours['timestamp'].astype(str).str.slice(8, 10) + '_' + trip_hours['timestamp'].dt.hour.astype(str)


Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,imap,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position,night_driving,hour
2722485,-15,5,2018-01-17 12:10:14,0b8800e80580f7043801083b0000000000000000000000...,18.890400,0.0,46,0,42.745098,34,108,3.3363,18.110001,1331.75,20,0.0,89.0196,0.0,17_12
2722486,-15,5,2018-01-17 12:10:15,0b3000a80598031244fd1045010946fb0d42fa0d4b040e...,20.742399,0.0,46,0,61.568600,33,122,3.0387,23.860001,1595.50,24,0.0,89.0196,0.0,17_12
2722487,-15,5,2018-01-17 12:10:16,0b6000f00560fb1a4003124bfd0a45f90b3ef5063f060b...,21.853600,0.0,46,0,9.019600,33,106,6.8472,11.030000,1038.25,25,0.0,89.0196,0.0,17_12
2722488,-15,5,2018-01-17 12:10:17,0b8801080568060c41f91342fd1241000e49f71047ff0f...,25.002001,0.0,46,0,42.745098,33,103,5.7726,12.560000,1132.00,24,0.0,89.0196,0.0,17_12
2722489,-15,5,2018-01-17 12:10:18,0b2000f005a0020645fb194600123e02152d041a34ff16...,24.816799,0.0,46,0,40.392200,33,103,5.7259,13.190000,1141.75,25,0.0,89.0196,0.0,17_12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2723117,-15,5,2018-01-17 12:20:58,08d0fed00568fd103ffe1141fd0f3efd103efc113ffd11...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.0000,0.0,17_12
2723118,-15,5,2018-01-17 12:20:59,0898fed00578fd103ffd1240fc103ffd1041fd1140fd10...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.0000,0.0,17_12
2723119,-15,5,2018-01-17 12:21:00,08b8ff080570fc123dfd103ffd103ffd0f43fd1040fc0f...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.0000,0.0,17_12
2723120,-15,5,2018-01-17 12:21:01,08e0ff180588fc0f40fc133ffd1140fd103ffe1042fd11...,0.000000,0.0,0,0,0.000000,0,0,0.0000,0.000000,0.00,0,0.0,0.0000,0.0,17_12


In [21]:
car_df['diagnostic_trouble_code'].value_counts()

 0      2291960
-127         69
 67           2
Name: diagnostic_trouble_code, dtype: int64

# Check any timestamp missing for each trip?

#### Behavior Analysis
- trips completed per user
- total drive time per user
- drive time per trip
- trip distance
- trip history?


- day driving time vs. night driving time (night time can be 11pm to 4am)
- day driving is safter than night driving


- hardbreaks
- rash acceleration (hard accelerate)
- drive speed (km/hr), overspeeding
- idling with engine on (idling on a public road is dangerous. idling is not environmental friendly and burns the same amount of oil as driving.)

#### Customer Segmentation
Build driving behavior features for clustering, and analyse the commonality in each group. But customer profiling with more meaningful background information can make features more representative.

Beside, we don't have any customer information in the data.

#### Customer Safety Analysis
Some customers are more risky while driving. It causes a high chance to car accidents. It impacts car availability.

#### Car Usage Analysis
- what OBD data can tell?
- 