# Imports

In [1]:
import pandas as pd
from sklearn.cluster import KMeans

from matplotlib import pyplot as plt
%matplotlib inline

import os
import re
from datetime import datetime, timedelta
import numpy as np
from math import ceil, floor

# Load data

In [2]:
root_path = ".."
data_dir = os.path.join(root_path, "unpacked")

# Every entry of the data directory except the readme is treated as a data file.
data_files = [name for name in os.listdir(data_dir) if name != 'readme.txt']


def file_to_df(file):
    """Read one file from ``data_dir`` into a pandas data frame.

    The file is parsed as semicolon-separated text that uses a comma as the
    decimal mark.
    """
    return pd.read_csv(os.path.join(data_dir, file), sep=';', decimal=',')

# Get IDs

In [4]:
# A vehicle ID is the run of digits that comes right after the word "vehicle".
# `\d+` (rather than `\d*`) guarantees at least one digit, so a name that
# contains "vehicle" without a number cannot produce an empty ID.
regex_pattern = r"vehicle(\d+)"
compiled_pattern = re.compile(regex_pattern)
# Scan every file name on its own so a match can never span two adjacent names;
# the set keeps each ID once.
ids = {v_id for file_name in data_files for v_id in compiled_pattern.findall(file_name)}
print(ids)

{'3', '1', '5', '28', '19'}


## Load info about a single vehicle

In [5]:
def open_file_id(v_id, key_lexem):
    """Load the data file of one vehicle, selected by a keyword in its name.

    Looks through ``data_files`` for a name of the form
    ``vehicle<v_id>_<key_lexem>_<suffix>.csv`` and loads the first one found
    with ``file_to_df``.

    Parameters:
        v_id: vehicle ID as it appears in the file name.
        key_lexem: keyword that determines the file, for example 'fuelLevel'.

    Returns:
        The result of ``file_to_df`` for the matching file.

    Raises:
        FileNotFoundError: if no name in ``data_files`` matches.
    """
    # Escape both parts so they are matched literally, never as regex syntax.
    name_pattern = re.compile(
        "vehicle" + re.escape(str(v_id)) + "_" + re.escape(str(key_lexem)) + r"_\w*\.csv"
    )
    # Match each name as a whole: the returned name is then always a real entry
    # of data_files, not a fragment cut out of a longer name.
    for file_name in data_files:
        if name_pattern.fullmatch(file_name):
            return file_to_df(file_name)
    raise FileNotFoundError(
        f"No data file matching 'vehicle{v_id}_{key_lexem}_*.csv' among the known data files"
    )
    
# Keywords that identify the per-vehicle data files. They are inserted verbatim
# into the file-name pattern, so the spelling (including 'ingection') must stay
# exactly as it is here.
possible_lexems = [
    'fuelLevel',
    'ingection',
    'refueling2',
    'speedAndHeight',
    'tachometer',
]

def load_all_files_id(v_id):
    """Collect every data file that describes a single vehicle.

    Returns a dictionary mapping each keyword of ``possible_lexems`` to the
    result of ``open_file_id`` for that vehicle and keyword.
    """
    vehicle_files = {}
    for lexem in possible_lexems:
        vehicle_files[lexem] = open_file_id(v_id, lexem)
    return vehicle_files

In [6]:
# Load all data up front: {vehicle ID: {file keyword: data frame}}
raw_data_base = dict((v_id, load_all_files_id(v_id)) for v_id in ids)

# Task 6

## Calculate main statistics

In [7]:
def velocity_data(v_id, df_holder=raw_data_base):
    """Speed statistics of one vehicle.

    Reads the 'SPEED' column of the vehicle's 'speedAndHeight' table. The mean,
    the median and the 99th percentile are taken over readings above zero only;
    the maximum is taken over all readings.

    Returns a dict with the keys 'mean_v', 'med_v', 'max_v' and 'max_trim_v'.
    """
    speed_table = df_holder[v_id]['speedAndHeight']
    # readings above zero, shared by three of the four statistics
    positive_speed = speed_table[speed_table['SPEED'] > 0]['SPEED']
    return {
        'mean_v': np.mean(positive_speed),
        'med_v': np.median(positive_speed),
        'max_v': np.max(speed_table['SPEED']),
        'max_trim_v': np.percentile(positive_speed, q=99),
    }

def tachometer_data(v_id, df_holder=raw_data_base):
    """Tachometer statistics of one vehicle.

    Reads the 'BEVALUE' column of the vehicle's 'tachometer' table. The mean and
    the percentiles are taken over readings above zero only; the maximum is
    taken over all readings.

    Returns a dict with the keys 'mean_t', 'max_t' and 'quant_t', where
    'quant_t' holds the 25th, 50th, 75th, 80th and 99th percentiles in that
    order.
    """
    rev_table = df_holder[v_id]['tachometer']
    # readings above zero, shared by the mean and the percentiles
    positive_revs = rev_table[rev_table['BEVALUE'] > 0]['BEVALUE']
    return {
        'mean_t': np.mean(positive_revs),
        'max_t': np.max(rev_table['BEVALUE']),
        'quant_t': np.percentile(positive_revs, q=[25, 50, 75, 80, 99]),
    }

def fuel_lvl_data(v_id, df_holder=raw_data_base):
    """Fuel level statistics of one vehicle.

    Reads the 'BEVALUE' column of the vehicle's 'fuelLevel' table and returns
    its mean, maximum and median as a dict with the keys 'mean_f', 'max_f' and
    'med_f'. All readings are used; nothing is filtered out.
    """
    fuel_level = df_holder[v_id]['fuelLevel']['BEVALUE']
    return {
        'mean_f': np.mean(fuel_level),
        'max_f': np.max(fuel_level),
        'med_f': np.median(fuel_level),
    }

## Apply to data

### Show all statistics to highlight classes and choose appropriate classification criteria

In [8]:
# Show all statistics for each vehicle. The IDs are digit strings kept in a set,
# whose iteration order can differ between kernel sessions, so sort them
# (shorter first, then lexicographically -- numeric order for IDs without
# leading zeros) to print in the same order on every run.
for i in sorted(ids, key=lambda v: (len(v), v)):
    print(f'Vehicle {i}')
    print(velocity_data(i))
    print(tachometer_data(i))
    print(fuel_lvl_data(i))

Vehile 3
{'mean_v': 36.843193566915566, 'med_v': 36.0, 'max_v': 227, 'max_trim_v': 91.0}
{'mean_t': 1207.001399906673, 'max_t': 2328, 'quant_t': array([ 896., 1191., 1465., 1519., 1999.])}
{'mean_f': 180.684451612903, 'max_f': 279.8, 'med_f': 187.85000000000002}
Vehile 1
{'mean_v': 56.243554189776646, 'med_v': 57.0, 'max_v': 129, 'max_trim_v': 119.0}
{'mean_t': 2344.0295926301155, 'max_t': 4608, 'quant_t': array([1696., 2432., 3008., 3168., 4288.])}
{'mean_f': 38.86536359814994, 'max_f': 60.0, 'med_f': 40.0}
Vehile 5
{'mean_v': 62.318233826237616, 'med_v': 65.0, 'max_v': 132, 'max_trim_v': 112.0}
{'mean_t': 1910.7419759016018, 'max_t': 4857, 'quant_t': array([1522., 1977., 2401., 2486., 3063.])}
{'mean_f': 40.34714247383985, 'max_f': 73.5, 'med_f': 41.1}
Vehile 28
{'mean_v': 57.01152110509744, 'med_v': 57.0, 'max_v': 122, 'max_trim_v': 108.0}
{'mean_t': 2457.522518800099, 'max_t': 4672, 'quant_t': array([1984., 2560., 3072., 3168., 3872.])}
{'mean_f': 16.823962580954827, 'max_f': 60.0,

## Create a decision tree based on the statistics

### Each vehicle is classified three times, once for each of the 3 class types:

* Slow or fast
* Cargo or 'passenger-oriented'
* Loaded or commonly-used

Loaded means the vehicle is operated above standard norms (its revs count is too high)

### The following statistics are used for classification:

* Maximum velocity (max_v)
* Maximum trimmed velocity (99th percentile of the non-zero speed readings, max_trim_v)
* Maximum fuel level (max_f)
* Mean fuel level (mean_f)
* 99th percentile of the non-zero revs readings (last element of quant_t)

In [9]:
def classify_vehicle(v_id, data_holder=raw_data_base):
    """Classify one vehicle with a small hand-written decision tree.

    Returns a dict of three boolean flags, in this order:
      'slow'   -- maximum speed below 80 and 99th-percentile speed below 50;
      'cargo'  -- maximum fuel level above 150 and mean fuel level above 100;
      'loaded' -- 99th percentile of the tachometer readings above 3900.
    A flag that is not set means the opposite class (fast / not cargo / normal).
    """
    # statistics the decision rules are based on
    speed_stats = velocity_data(v_id, data_holder)
    rev_stats = tachometer_data(v_id, data_holder)
    fuel_stats = fuel_lvl_data(v_id, data_holder)

    # Slow or fast
    is_slow = speed_stats['max_v'] < 80 and speed_stats['max_trim_v'] < 50
    # Cargo or not
    is_cargo = fuel_stats['max_f'] > 150 and fuel_stats['mean_f'] > 100
    # Normal or loaded; the last entry of 'quant_t' is the 99th percentile
    is_loaded = rev_stats['quant_t'][-1] > 3900

    # bool() keeps the flags plain True/False whatever type the comparisons give
    return {'slow': bool(is_slow), 'cargo': bool(is_cargo), 'loaded': bool(is_loaded)}

def pretty_v_class(vehicle_name, ans_dict):
    """Print the classification of one vehicle as a sentence.

    ``ans_dict`` maps the flag names 'slow', 'cargo' and 'loaded' to booleans.
    A set flag is printed under its own name, an unset one under the name of
    the opposite class. The labels appear in the iteration order of
    ``ans_dict`` and the first three are used.
    """
    # label to print when a flag is False
    negative_labels = {'slow': 'fast', 'cargo': 'light weight', 'loaded': 'normal'}

    labels = []
    for flag_name in ans_dict:
        if ans_dict[flag_name]:
            labels.append(flag_name)
        else:
            labels.append(negative_labels[flag_name])

    sentence = 'The vehicle {} is {}, {} and {}'.format(vehicle_name, labels[0], labels[1], labels[2])
    print(sentence)

In [10]:
def classify_all_vehicles():
    """Classify every vehicle in ``ids`` and print a summary.

    For each vehicle the labels are printed with ``pretty_v_class``. Afterwards
    the number and the percentage of vehicles carrying each label are printed.

    Vehicles are processed in sorted ID order so the output is the same on
    every run, and an empty ``ids`` prints zero percentages instead of raising
    ZeroDivisionError.
    """
    # label counted when a flag is False
    opposites_dict = {'slow': 'fast', 'cargo': 'light weight', 'loaded': 'normal'}
    # per-label vehicle counts; the key order is the order of the summary lines
    histograms = {
        'fast': 0,
        'slow': 0,
        'cargo': 0,
        'light weight': 0,
        'normal': 0,
        'loaded': 0
    }
    total = 0

    # IDs are digit strings: shorter first, then lexicographic, gives numeric
    # order for IDs without leading zeros.
    for cur_id in sorted(ids, key=lambda v: (len(v), v)):
        v_class = classify_vehicle(cur_id)
        pretty_v_class(cur_id, v_class)
        total += 1
        for cur_class, is_set in v_class.items():  # aggregate histograms
            label = cur_class if is_set else opposites_dict[cur_class]
            histograms[label] += 1

    print("\nSummary\n")
    for class_name, count in histograms.items():
        # no vehicles at all -> report 0.0% rather than dividing by zero
        share = count / total * 100 if total else 0.0
        print(f'{class_name}: {count} ({share}%)')

## Inference

In [11]:
# Classify every loaded vehicle and print the per-vehicle labels and the summary
classify_all_vehicles()

The vehicle 3 is fast, cargo and normal
The vehicle 1 is fast, light weight and loaded
The vehicle 5 is fast, light weight and normal
The vehicle 28 is fast, light weight and normal
The vehicle 19 is slow, light weight and normal

Summary

fast: 4 (80.0%)
slow: 1 (20.0%)
cargo: 1 (20.0%)
light weight: 4 (80.0%)
normal: 4 (80.0%)
loaded: 1 (20.0%)
