In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw readings. The file contains repeated header rows in the middle
# of the data (see the "Noisy points" section), so numeric columns also hold
# text. low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, giving every column one consistent dtype.
# NOTE(review): this presumably also silences the warning emitted by the
# original read — confirm on a fresh run.
car_df = pd.read_csv("./archive/v2.csv", low_memory=False)
car_df.shape

  car_df = pd.read_csv("./archive/v2.csv")


(3120272, 17)

In [3]:
# Preview the first five raw rows.
car_df.head(n=5)

Unnamed: 0,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
0,1,0.0,2017-12-22 18:43:05,10c0f8e00448fa18c80515d30000000000000000000000...,24.2612,0.0,66.0,0.0,28.6275,40.0,97.0,0.0,0.0,1010.75,23.0,0.0,0.0
1,1,0.0,2017-12-22 18:43:06,1138f8c804780a1ebdf718bcf919d10617c8e301b31017...,23.15,0.0,66.0,0.0,33.7255,40.0,98.0,0.0,0.0,815.5,21.0,0.0,0.0
2,1,0.0,2017-12-22 18:43:07,10f0f89804480612c30010c30714ce0520b7f41dbdf118...,18.7052,0.0,66.0,0.0,43.1373,40.0,98.0,0.0,0.0,862.25,17.0,0.0,0.0
3,1,0.0,2017-12-22 18:43:08,10d0f84804480d15bd0210c9f822c80017caf81ccd0517...,16.4828,0.0,66.0,0.0,41.5686,40.0,97.0,0.0,0.0,817.0,17.0,0.0,0.0
4,1,0.0,2017-12-22 18:43:09,1090f8c80480041dc9081cc50815c60511c60112c40514...,17.4088,0.0,66.0,0.0,43.1373,40.0,97.0,0.0,0.0,804.25,15.0,0.0,0.0


### Rename raw columns to improve readability

In [4]:
# Map the terse raw column names onto descriptive snake_case names.
# Columns not listed here keep their original name.
column_rename_dict = dict(
    tripID='trip_id',
    deviceID='device_id',
    timeStamp='timestamp',
    accData='accelerometer',
    battery='battery_volt',
    cTemp='coolant_temp',
    dtc='diagnostic_trouble_code',
    eLoad='engine_load',
    iat='intake_air_temp',
    maf='air_mass_rate',
    rpm='revolutions_per_min',
    tAdv='timing_advance',
    tPos='throttle_position',
)

car_df = car_df.rename(columns=column_rename_dict)

### Data Dictionary

**Basic Information**

| Column | Description |
| :--- | :----------- |
| device_id | Device identifier. Each device id corresponds to one car. |
| trip_id | Trip identifier. Each id corresponds to one trip. A trip begins when the car engine is switched on and ends when the engine is switched off. |
| timestamp | Data collection time. |
| battery_volt | The battery voltage corresponds to voltage of the battery installed in Car, which supplies electrical energy to a motor vehicle. |
| diagnostic_trouble_code | Number of diagnostic trouble codes. DTC's, or Diagnostic Trouble Codes, are used by automobile manufacturers to diagnose problems related to the vehicle. |
| accelerometer | Accelerometer and Magnetometer sensor data. The data is collected from the OBD device. Values are in terms of G-force. The data is across X, Y, Z axis where X-axis is horizontal, Y-axis is vertical, and Z-axis is the direction of movement of the car. |


**Speed related sensors**

| Column | Description |
| :--- | :----------- |
| gps_speed | The speed in kmph (kilometers per hour) as noted from GPS sensor. |
| speed | Speed data as collected from OBD device mounted in the car. |
| kpl | KMPL is mileage in kilometres per litre. It is a metric derived from speed and the fuel-to-air mass flow ratio. This ratio is constant for petrol cars but varies for other fuel types. Hence, the KMPL value is accurate for petrol cars and contains some error for other fuel types. |
| revolutions_per_min | engine RPM. The number of turns in one minute. |


**Temperature related sensors**

| Column | Description |
| :--- | :----------- |
| coolant_temp | The Temperature of the engine coolant of an internal combustion engine. The normal operating temperature for most engines is in a range of 90 to 104 degree Celsius (195 to 220 degrees Fahrenheit). |
| intake_air_temp | The Intake Air Temperature sensor (IAT) has been utilised as an Engine Control Unit (ECU) input signal, as a requirement for calculating the Air Mass volume for the incoming air charge. This is, to assist in determining the correct engine fuel requirement to suit the operating air temperature. |
| timing_advance | Timing advance refers to the number of degrees before top dead center (BTDC) that the spark will ignite the air-fuel mixture in the combustion chamber during the compression stroke. |


**Air mass flow related sensors**

| Column | Description |
| :--- | :----------- |
| imap | The manifold absolute pressure sensor (MAP sensor) is one of the sensors used in an internal combustion engine's electronic control system. The MAP sensor provides instantaneous manifold pressure information to the engine's electronic control unit (ECU). The data is used to calculate air density and determine the engine's air mass flow rate, which in turn determines the required fuel metering for optimum combustion (see stoichiometry) and influences the advance or retard of ignition timing. |
| engine_load | Engine load measures how much air (and fuel) you're sucking into the engine and then compares that value to the theoretical maximum. |
| throttle_position | A throttle position sensor (TPS) is a sensor used to monitor the air intake of an engine. |
| air_mass_rate | A mass (air) flow sensor (MAF) is used to find out the mass flow rate of air entering a fuel-injected internal combustion engine. The air mass information is necessary for the engine control unit (ECU) to balance and deliver the correct fuel mass to the engine. |

### Noisy points

In [5]:
# Rows whose device_id holds the literal header text "deviceID" are header
# lines repeated inside the data rather than real readings.
noise_index = car_df["device_id"].eq("deviceID")
car_df.loc[noise_index].head()

Unnamed: 0,trip_id,device_id,timestamp,accelerometer,gps_speed,battery_volt,coolant_temp,diagnostic_trouble_code,engine_load,intake_air_temp,imap,kpl,air_mass_rate,revolutions_per_min,speed,timing_advance,throttle_position
106461,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
376682,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
630522,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
631504,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos
676250,tripID,deviceID,timeStamp,accData,gps_speed,battery,cTemp,dtc,eLoad,iat,imap,kpl,maf,rpm,speed,tAdv,tPos


In [6]:
# Remove noise: keep only the rows that were not flagged by noise_index.
rows_to_keep = ~noise_index
car_df = car_df.loc[rows_to_keep]
print("Data Size after removing noise: {}".format(car_df.shape))

Data Size after removing noise: (3120240, 17)


### Data type casting

In [7]:
# First pass: parse every raw column into a float (or datetime) dtype.
# The 'accelerometer' column is not listed and keeps its loaded dtype.
first_datatype_casting = {
    'device_id': np.float32,
    'trip_id': np.float32,
    'timestamp': "datetime64[ns]",
    'battery_volt': np.float32,
    'diagnostic_trouble_code': np.float32,
    'gps_speed': np.float32,
    'speed': np.float32,
    'kpl': np.float32,
    'revolutions_per_min': np.float32,
    'coolant_temp': np.float32,
    'intake_air_temp': np.float32,
    'timing_advance': np.float32,
    'imap': np.float32,
    'engine_load': np.float32,
    'throttle_position': np.float32,
    'air_mass_rate': np.float32,
}

# Columns that are stored as integers after the float pass.
integer_columns = [
    'device_id',
    'trip_id',
    'diagnostic_trouble_code',
    'speed',
    'coolant_temp',
    'intake_air_temp',
    'imap',
]


def smallest_signed_int_dtype(values):
    """Return the narrowest signed NumPy integer dtype able to hold ``values``.

    A fixed ``np.int8`` cast silently wraps anything outside -128..127
    (for example a speed of 130 would become -126), so the dtype is chosen
    from the observed minimum and maximum instead.

    Parameters
    ----------
    values : pandas.Series
        Numeric column that is about to be cast to an integer dtype.

    Returns
    -------
    type
        One of ``np.int8``, ``np.int16``, ``np.int32`` or ``np.int64``.

    Raises
    ------
    OverflowError
        If the values do not fit even in ``np.int64``.
    """
    if len(values) == 0:
        # Nothing to overflow; fall back to the most compact dtype.
        return np.int8
    lowest, highest = values.min(), values.max()
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return candidate
    raise OverflowError(
        "values in range [{}, {}] do not fit in a signed 64-bit integer".format(lowest, highest)
    )


car_df = car_df.astype(first_datatype_casting)

# Cast integers for memory efficiency, using the smallest dtype that is
# wide enough for the values actually present in each column.
second_datatype_casting = {
    column: smallest_signed_int_dtype(car_df[column]) for column in integer_columns
}

car_df = car_df.astype(second_datatype_casting)
car_df.dtypes

trip_id                              int8
device_id                            int8
timestamp                  datetime64[ns]
accelerometer                      object
gps_speed                         float32
battery_volt                      float32
coolant_temp                         int8
diagnostic_trouble_code              int8
engine_load                       float32
intake_air_temp                      int8
imap                                 int8
kpl                               float32
air_mass_rate                     float32
revolutions_per_min               float32
speed                                int8
timing_advance                    float32
throttle_position                 float32
dtype: object

### Duplicates

In [8]:
# Flag every row that exactly repeats an earlier row (the first occurrence is
# not flagged), then report what share of the data those repeats make up.
duplicate_mask = car_df.duplicated()
duplicate_rate = duplicate_mask.sum() / car_df.shape[0] * 100
print("Duplicate Rate: {:,.3f}%".format(duplicate_rate))

# Drop all duplicates
car_df = car_df[~duplicate_mask]
print("Data Size after removing duplicates: {}".format(car_df.shape))

Duplicate Rate: 26.180%
Data Size after removing duplicates: (2303362, 17)


### Missing Values

In [9]:
# True if at least one cell anywhere in the frame is missing.
car_df.isna().to_numpy().any()

False

### Timestamp Range

In [10]:
# Earliest and latest reading in the data set.
timestamps = car_df['timestamp']
(timestamps.min(), timestamps.max())

(Timestamp('2017-11-18 16:23:30'), Timestamp('2018-01-31 23:18:50'))

### Add day driving / night driving labels

In [11]:
# Make sure 'timestamp' is a datetime column so the .dt accessor can be used.
car_df = car_df.assign(timestamp=pd.to_datetime(car_df['timestamp']))

In [12]:
# A reading counts as night driving when its hour is 23 or any of 0-4.
# NOTE(review): `<= 4` includes 04:00-04:59, while the notes in this notebook
# describe night time as "11pm to 4am" — confirm which boundary is intended.
reading_hour = car_df['timestamp'].dt.hour
night_driving_index = (reading_hour >= 23) | (reading_hour <= 4)
day_driving_index = ~night_driving_index

total_rows = car_df.shape[0]
night_rows = int(night_driving_index.sum())
day_rows = int(day_driving_index.sum())

print("night driving pct: {:,.3f}%\nday driving pct: {:,.3f}%".format(
    night_rows / total_rows * 100,
    day_rows / total_rows * 100))

# Label column: 1 for night-time readings, 0 for day-time readings.
car_df.loc[night_driving_index, 'night_driving'] = 1
car_df.loc[day_driving_index, 'night_driving'] = 0

night driving pct: 0.778%
day driving pct: 99.222%


In [13]:
# Number of readings recorded per device, most frequent first.
car_df.loc[:, 'device_id'].value_counts()

12    518906
10    495943
9     312916
5     267328
0     184242
7     172654
3     147706
16     83043
8      43185
2      27155
6      21224
1      10272
4       9878
11      7929
14       981
Name: device_id, dtype: int64

### Check sensor frequency to see if any signal is missing

#### Behavior Analysis
- trips completed per user
- total drive time per user
- drive time per trip
- trip distance
- trip history?


- day driving time vs. night driving time (night time can be 11pm to 4am)
- day driving is safer than night driving


- hard braking
- rash acceleration (hard accelerate)
- drive speed (km/hr), overspeeding
- idling with engine on (idling on a public road is dangerous. idling is not environmentally friendly and burns the same amount of oil as driving.)

#### Customer Segmentation
Build driving behavior features for clustering, and analyse the commonality in each group. But customer profiling with more meaningful background information can make features more representative.

Besides, we don't have any customer information in the data.

#### Customer Safety Analysis
Some customers drive more riskily than others. This raises the chance of car accidents, which in turn impacts car availability.

#### Car Usage Analysis
- what OBD data can tell?
- 