# Includes

In [30]:
import pandas as pd
from sklearn.cluster import KMeans

from matplotlib import pyplot as plt
%matplotlib inline

import os
import re
from datetime import datetime, timedelta
import numpy as np
from math import ceil, floor

# Load data

In [31]:
root_path = ".."
data_dir = os.path.join(root_path, "unpacked")

data_files = os.listdir(data_dir)  # target files
data_files = [x for x in data_files if x != 'readme.txt']  # remove readme


def file_to_df(file):
    """Open file and create pandas data frame"""
    full_path = os.path.join(data_dir, file)
    return pd.read_csv(full_path, sep=';', decimal=',')

In [8]:
print(data_files)  # ensure all files have been listed properly

['vehicle19_fuelLevel_public.csv', 'vehicle19_ingection_public.csv', 'vehicle19_refueling2_public.csv', 'vehicle19_speedAndHeight_public.csv', 'vehicle19_tachometer_public.csv', 'vehicle1_fuelLevel_public.csv', 'vehicle1_ingection_public.csv', 'vehicle1_refueling2_public.csv', 'vehicle1_speedAndHeight_public.csv', 'vehicle1_tachometer_public.csv', 'vehicle28_fuelLevel_public.csv', 'vehicle28_ingection_public.csv', 'vehicle28_refueling2_public.csv', 'vehicle28_speedAndHeight_public.csv', 'vehicle28_tachometer_public.csv', 'vehicle3_fuelLevel_public.csv', 'vehicle3_ingection_public.csv', 'vehicle3_refueling2_public.csv', 'vehicle3_speedAndHeight_public.csv', 'vehicle3_tachometer_public.csv', 'vehicle5_fuelLevel_public.csv', 'vehicle5_ingection_public.csv', 'vehicle5_refueling2_public.csv', 'vehicle5_speedAndHeight_public.csv', 'vehicle5_tachometer_public.csv']


## Get vehicle IDs

In [32]:
regex_pattern = r"vehicle(\d*)"  # ID is integer number comes right after vehicle word
compiled_pattern = re.compile(regex_pattern)
ids = compiled_pattern.findall(''.join(data_files))  # apply pattern to all file names
ids = set(ids)  # get a set of unique numbers
print(ids)  # show all ids

{'28', '19', '3', '1', '5'}


## Load info about a single vehicle

In [33]:
def open_file_id(v_id, key_lexem):
    """Key lexem determines file, for example, 'fuelLevel'"""
    regex_pattern = re.compile(f"(vehicle{v_id}_{key_lexem}" + r"_(\w|\d|_)*\.csv)")  # pattern to find appropriate name
    pattern_match = regex_pattern.search('|'.join(data_files))  # search on a whole file set
    needed_file = pattern_match.group(1)  # the file is found, now we can open it
    return file_to_df(needed_file)
    
possible_lexems = ['fuelLevel', 'ingection', 'refueling2', 'speedAndHeight', 'tachometer']

In [34]:
def load_all_files_id(v_id):
    """Generates a dictionary of all files describing a single vehicle"""
    return {lex: open_file_id(v_id, lex) for lex in possible_lexems}

In [35]:
vehicle_1_dfs = load_all_files_id(1)  # test how it works
for cur_df in vehicle_1_dfs.values():
    print(cur_df)

                     DTIME  BEVALUE
0      2020-01-08 21:16:02      0.0
1      2020-01-08 21:17:04      0.0
2      2020-01-08 21:18:04      0.0
3      2020-01-09 10:05:26     49.7
4      2020-01-09 10:06:27     49.9
...                    ...      ...
19027  2020-06-27 00:48:05     54.0
19028  2020-06-27 00:49:05     54.2
19029  2020-06-27 01:15:14     54.0
19030  2020-06-27 01:16:14     54.2
19031  2020-06-27 01:17:14     54.4

[19032 rows x 2 columns]
                     DTIME  BEVALUE
0      2020-01-08 21:16:02        0
1      2020-01-08 21:17:04        0
2      2020-01-08 21:18:04        0
3      2020-01-09 10:05:26        1
4      2020-01-09 10:06:27        1
...                    ...      ...
85923  2020-06-27 01:10:27        1
85924  2020-06-27 01:15:14        1
85925  2020-06-27 01:15:22        1
85926  2020-06-27 01:16:14        1
85927  2020-06-27 01:17:14        1

[85928 rows x 2 columns]
     TSID            STARTDATE              ENDDATE  STARTLEVEL  ENDLEVEL
0       1 

# Task 5 - drive style recognition

#### 3 classes: 
 - None: vehicle does not move
 - Calm: regular (calm) driving 
 - Aggresive: deviant (or aggressive) driving

In [36]:
raw_data_base = {v_id: load_all_files_id(v_id) for v_id in ids}  # load all data

### Aggregate work intervals - recognize the beginning and the end of each driving session having ingection binary mask

In [37]:
def aggregate_work_intervals(v_id):  # get the list of the time intervals, each interval corresponds to a single driving session 
    ingection_df = raw_data_base[str(v_id)]['ingection']  # get ingection df
    interval_idxs = []  # buffer for the indexes of the beginning and the end of the session
    beg_idx = 0  # the beginning of the session (index)
    beg_found = False  # flag: session is started and not finished
    for index, row in ingection_df.iterrows():
        if row['BEVALUE'] == 0 and beg_found:  # engine was swithced off at the moment 
            interval_idxs.append((beg_idx, index - 1))  # add to answer
            beg_found = False  # reinit counters
            beg_idx = index
        elif row['BEVALUE'] == 1 and not beg_found:  # ingection was switched on at the moment
            beg_idx = index  # remember the moment of the beggining of the session
            beg_found = True
    return [(ingection_df.at[t_idx[0], 'DTIME'], ingection_df.at[t_idx[1], 'DTIME']) for t_idx in interval_idxs]  # indexes -> time

In [38]:
res = aggregate_work_intervals(1)  # test how it works
print(res)

[('2020-01-09 10:05:26', '2020-01-09 10:23:27'), ('2020-01-14 09:34:09', '2020-01-14 09:47:40'), ('2020-01-14 09:55:57', '2020-01-14 10:39:03'), ('2020-01-14 10:40:45', '2020-01-14 11:33:11'), ('2020-01-14 14:31:01', '2020-01-14 14:51:01'), ('2020-01-16 11:01:40', '2020-01-16 13:21:40'), ('2020-01-16 13:25:46', '2020-01-16 13:38:46'), ('2020-01-16 14:56:20', '2020-01-16 14:56:20'), ('2020-01-16 15:05:08', '2020-01-16 15:11:34'), ('2020-01-16 15:18:11', '2020-01-16 15:18:11'), ('2020-01-17 09:05:49', '2020-01-17 12:14:10'), ('2020-01-17 14:45:48', '2020-01-17 15:28:48'), ('2020-01-17 15:35:27', '2020-01-17 16:12:27'), ('2020-01-20 09:44:03', '2020-01-21 17:21:36'), ('2020-01-21 18:01:24', '2020-01-21 18:51:24'), ('2020-01-22 08:06:08', '2020-01-22 16:21:07'), ('2020-01-22 16:24:53', '2020-01-23 10:17:04'), ('2020-01-23 10:22:38', '2020-01-23 12:18:29'), ('2020-01-23 12:24:19', '2020-01-31 15:35:00'), ('2020-01-31 15:46:35', '2020-01-31 15:46:35'), ('2020-02-03 09:28:29', '2020-02-03 11:

### Calculate acceleration at each possible time quantum

#### Acceleration shows the measure of deviance of driving style for some reasons:

* Big absolute acceleration tells driver is aggresive in a part of cases
* Just speed can't always tell whether driving style is aggresive as we don't know speed restrictions for each time quantum
* Acceleration tell more than revs as revs can't tell about hard braking

Acceleration is counted here as km/h per second as drivers measure velocity in km/h and it's representative to count velocity increment per one second

In [70]:
def get_velocity_acceleration(v_id, time_intervals):  # get velocity and acceleration for each moment of each interval
    speed_df = raw_data_base[str(v_id)]['speedAndHeight']  # get df with velocity
    strp_pattern = '%Y-%m-%d %H:%M:%S'  # time format
    
    def cur_beg_end(cur_idx):
        """Get the beggining and the end of the interval in datetime format"""
        cur_beg_s, cur_end_s = time_intervals[cur_idx]
        cur_beg = datetime.strptime(cur_beg_s, strp_pattern)
        cur_end = datetime.strptime(cur_end_s, strp_pattern)
        return cur_beg, cur_end
    
    cur_interval_idx = 0  # init
    cur_beg, cur_end = cur_beg_end(cur_interval_idx)
    time_intervals_len = len(time_intervals)
    ans = []  # 1st dim - session number, 2nd dim - sample number; 2-sized lists [velocity, acceleration]
    cur_session = []
    prev_d_time = 0
    for index, row in speed_df.iterrows():
        d_time = datetime.strptime(row['DTIME'], strp_pattern)  # current row time
        while d_time > cur_end and cur_interval_idx < time_intervals_len:  # ensure current time is earlier than cur session begins
            ans.append(cur_session)  # finish current session
            cur_session = []  # start a new one
            cur_interval_idx += 1  # the next session number
            if cur_interval_idx < time_intervals_len:
                cur_beg, cur_end = cur_beg_end(cur_interval_idx)
        if cur_interval_idx >= time_intervals_len:  # all sessions finished  
            return ans
        if d_time >= cur_beg and d_time <= cur_end:  # current row is in the session
            speed = row['SPEED']
            if prev_d_time == 0:
                duration = 10000  # can't calculate duration so diminish acceleration
            else:
                duration = (d_time - prev_d_time).total_seconds()  # in seconds
            acceleration = (speed - speed_df.at[index - 1, 'SPEED'] if index > 0 else speed) / duration  # km/(h * s)
            # it may be interpret as km/h per second (on how many km/h the velocity increases per a single second)
            cur_session.append([speed, acceleration])  # add to answer
        prev_d_time = d_time
    ans.append(cur_session)  # add last session
    return ans

In [40]:
res2 = get_velocity_acceleration(1, res)  # test how it works

In [41]:
print(res2[2])

[[0, 0.0], [0, 0.0], [0, 0.0], [0, 0.0], [0, 0.0], [0, 0.0], [16, 0.26666666666666666], [34, 1.0], [37, 1.5], [15, -3.142857142857143], [23, 4.0], [29, 1.5], [20, -3.0], [8, -1.5], [23, 3.0], [14, -1.5], [3, -0.15492957746478872], [0, -0.05], [11, 1.0], [14, 0.5], [11, -0.3], [26, 3.0], [41, 5.0], [41, 0.0], [26, -0.7142857142857143], [46, 0.625], [51, 0.7142857142857143], [34, -0.2833333333333333], [55, 1.9090909090909092], [58, 0.6], [36, -1.2222222222222223], [5, -6.2], [20, 0.7142857142857143], [47, 3.0], [56, 1.8], [28, -3.5], [32, 2.0], [22, -1.4285714285714286], [27, 0.4166666666666667], [48, 1.2352941176470589], [33, -0.8823529411764706], [39, 2.0], [15, -0.7058823529411765], [34, 3.1666666666666665], [35, 0.14285714285714285], [14, -0.7], [30, 0.7272727272727273], [30, 0.0], [1, -0.48333333333333334], [14, 0.30952380952380953], [6, -0.4444444444444444], [6, 0.0], [6, 0.0], [11, 0.45454545454545453], [19, 0.6666666666666666], [10, -0.5294117647058824], [5, -0.25], [6, 0.0256410

# Hardcoded decision tree

### Decision tree logic is described below
#### Some weak points of this classifier:

* Velocity is counted rarely, periods are so continuous so positive and negative accelerations smoothes themselves so we need to diminish thresholds
* All this values can vary so much from the optimal thresholds as there are no labels in dataset and "true" thresholds for this situation are not commonly known, values of the thresholds got in "heuristic" way

In [80]:
# tree logic:
# a, v \approx 0 => label 0 -> vehicle is not driven
# a > 6 => label Aggressive -> too big acceleration
# a < -7 => label Aggressive -> braking to hard
# v > 132 => label Aggressive -> too high speed
# v > 40 & v < 80 & |a| > 3 => label Aggressive -> to high acceleration in the city traffic
# otherwise, label Calm

# hardcoded thresholds
v_epsilon = 1
a_epsilon = 0.1
too_big_a = 6
too_least_a = -7
too_big_v = 132
v_middle_left = 40
v_middle_right = 80
a_middle_too_big = 3

# just tree
def decision_tree_classify(velocity, acceleration):
    """Class 0: None, Class 1: Calm, Class 2: Aggressive"""
    __a = acceleration
    __v = velocity
    none_label = 0
    calm_label = 1
    aggressive_label = 2
    if __v <= v_epsilon and __a <= a_epsilon:
        return none_label
    if __a > too_big_a:
        return aggressive_label
    if __a < too_least_a:
        return aggressive_label
    if __v > too_big_v:
        return aggressive_label
    if __v >= v_middle_left and __v <= v_middle_right and abs(__a) >= a_middle_too_big:
        return aggressive_label
    return calm_label

# session classifier
def session_classify(session_array):  # classify the whole session with a single class
    """session_array is the list of np arrays with a single dim sized 2 (velocity, acceleration)"""
    raw_pred = np.zeros(shape=(len(session_array)), dtype=np.uint8)
    for i, dt in enumerate(session_array):
        raw_pred[i] = decision_tree_classify(dt[0], dt[1])
    percentile_threshold = 80  # look at the 20% of data with the highest class value
    return floor(np.percentile(raw_pred, percentile_threshold))

## Decision tree results

In [81]:
def collect_data_sessions():  # collect data to test the model
    all_sessions = []
    cur_idx = 0
    for current_id in ids:
        intervals = aggregate_work_intervals(current_id)
        cur_id_data = get_velocity_acceleration(current_id, intervals)
        for cur_session in cur_id_data:
            all_sessions.append(cur_session)
        cur_idx += 1
        print(f"Id {current_id} completed ({cur_idx / len(ids) * 100}%)")
    return all_sessions

In [82]:
X_test = collect_data_sessions()

Id 28 completed (20.0%)
Id 19 completed (40.0%)
Id 3 completed (60.0%)
Id 1 completed (80.0%)
Id 5 completed (100.0%)


In [83]:
preds = []
for session in X_test:  # classify all sessions and show summary
    pred = session_classify(session)
    preds.append(pred)

preds = np.array(preds)

print("Summary:")
for class_val, class_label in enumerate(["None", "Calm", "Aggressive"]):
    print(f"{class_label} class: {np.count_nonzero(preds == class_val)}")

Summary:
None class: 232
Calm class: 1120
Aggressive class: 1


In [75]:
aggr_id = np.argmax(preds == 2)
print(X_test[aggr_id])

[[227, 22.7], [152, -5.0]]


# End to end classification
## Tell driving style and time of each session given source files

In [84]:
df_seed = {'time': [], 'vehicle_id': [], 'class': []}  # data frame skeleton
for cur_vehicle in ids:
    time_intervals = aggregate_work_intervals(cur_vehicle)
    sessions = get_velocity_acceleration(cur_vehicle, time_intervals)
    predictions = []
    for i, current_interval in enumerate(time_intervals):  # get predictions
        predictions.append(session_classify(sessions[i]))
    df_seed['time'] = df_seed['time'] + time_intervals
    df_seed['vehicle_id'] = df_seed['vehicle_id'] + [cur_vehicle] * len(predictions)
    df_seed['class'] = df_seed['class'] + [["None", "Calm", "Aggresive"][pred_key] for pred_key in predictions]

prediction_df = pd.DataFrame.from_dict(df_seed)

# Now we have pandas data frame of all sessions with:
* The beginning and the end time of each session
* Vehicle ID
* Predicted driving style

In [88]:
print(prediction_df.head())  # show answer data frame format

                                         time vehicle_id class
0  (2020-01-09 08:19:00, 2020-01-09 08:20:06)         28  None
1  (2020-01-09 10:09:15, 2020-01-09 10:30:36)         28  Calm
2  (2020-01-09 10:49:35, 2020-01-09 10:56:36)         28  Calm
3  (2020-01-09 15:41:38, 2020-01-09 16:26:33)         28  Calm
4  (2020-01-09 16:28:52, 2020-01-09 17:22:33)         28  Calm


In [86]:
print(prediction_df[prediction_df['class'] == 'Aggresive'])  # show all aggresive sessions

                                           time vehicle_id      class
562  (2020-03-26 08:56:10, 2020-03-26 08:56:25)          3  Aggresive


In [87]:
print(len(prediction_df[prediction_df['class'] == 'None']))  # show count of sessions without actually driving

232
