## Загружаем методы для поиска аномалий из anomalies.py, методы для слгаживания из smoothing.py, метода для оценки алгоритма из metrics.py, класс для препроцесса данных из  models.py и методы для доставания отрезков сливов и заправок drain_fuel.py

In [1]:
from models import PreprocessModels
from smoothing import exponential_smoothing, double_exponential_smoothing
from anomalies import segments, detect_anomalies, upper_anomalies, lower_anomalies
from metrics import segment_accuracy
from drain_fuel import segments_drain_fuel, detect_segment

## Методы для визуализации

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
sns.set()
from dateutil.parser import parse
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go

init_notebook_mode(connected=True)

## Методы для загрузки данных

In [3]:
import os
import re
from datetime import datetime, timedelta
import numpy as np
from math import ceil, floor
import umap
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

## Остальные методы

In [4]:
import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt                  # plots
import seaborn as sns                            # more plots

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook



%matplotlib inline

## Разные метрики для поиска аномалий

In [5]:
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Load data

In [18]:
root_path = ""
data_dir = os.path.join(root_path, "unpacked")

data_files = os.listdir(data_dir)  # target files
data_files = [x for x in data_files if x != 'readme.txt']  # remove readme


def file_to_df(file):
    """Open file and create pandas data frame"""
    full_path = os.path.join(data_dir, file)
    return pd.read_csv(full_path, sep=';')

## Get vehicle IDs

In [52]:
regex_pattern = r"vehicle(\d*)"  # ID is integer number comes right after vehicle word
compiled_pattern = re.compile(regex_pattern)
ids = compiled_pattern.findall(''.join(data_files))  # apply pattern to all file names
ids = set(ids)  # get a set of unique numbers
print(ids)

{'28', '1', '3', '5', '19'}


## Load info about a single vehicle

In [53]:
def open_file_id(v_id, key_lexem):
    """Key lexem determines file, for example, 'fuelLevel'"""
    regex_pattern = re.compile(f"(vehicle{v_id}_{key_lexem}" + r"_(\w|\d|_)*\.csv)")  # pattern to find appropriate name
    pattern_match = regex_pattern.search('|'.join(data_files))  # search on a whole file set
    needed_file = pattern_match.group(1)  # the file is found, now we can open it
    return file_to_df(needed_file)
    
possible_lexems = ['fuelLevel', 'ingection', 'refueling2', 'speedAndHeight', 'tachometer']

In [54]:
def load_all_files_id(v_id):
    """Generates a dictionary of all files describing a single vehicle"""
    return {lex: open_file_id(v_id, lex) for lex in possible_lexems}

# Task 1 Find refuelings with 5% accuracy

## Загрузга и препроцесс данных

In [76]:
raw_data_base = {v_id: load_all_files_id(v_id) for v_id in ids} 

In [77]:
prep_models = []
for i in ids:
    prep_models.append(PreprocessModels(raw_data_base[i]['fuelLevel'],raw_data_base[i]['ingection'],raw_data_base[i]['refueling2'],raw_data_base[i]['speedAndHeight'],raw_data_base[i]['tachometer']))

In [78]:
list(map(lambda x: x.preprocess_all_df(), prep_models))

[None, None, None, None, None]

## Отрисовка всех машин

In [73]:
def plotGraphics(axes, series, start_vec, end_vec, type_of_data, window=1):
 
    drain, refueling = segments_drain_fuel(series, start_vec, end_vec, type_of_data, window)
    
    axes.add_trace(go.Scatter(
        x=series.index,
        y=series.BEVALUE,
        mode='lines',
        name='oil'
    ))
    
    axes.add_trace(go.Scatter(
        x=drain.index,
        y=drain.BEVALUE,
        mode='lines',
        name='drain'
    ))
    
    axes.add_trace(go.Scatter(
        x=refueling.index,
        y=refueling.BEVALUE,
        mode='lines',
        name='refueling'
    ))

In [1]:
# for i in prep_models:
#     fig = go.Figure()
#     plotGraphics(fig, i.df1, i.df3['STARTDATE'], i.df3['ENDDATE'], i.df3['bay/drain'])
#     fig.show()

## Идея заключается в нахождение верхних и нижних аномалий соединять их и считать их за заправки автомобиля

## Отрисовка граффиков с найдеными аномалиями и с окном = 2(Окно n- это сумма первого и n-1 элемента деленные на n)

In [80]:
def plotMovingAverage(fig, series, check_series, window, plot_intervals=False, scale=1.96, plot_anomalies=False):

    """
        series - dataframe with timeseries
        window - rolling window size 
        plot_intervals - show confidence intervals
        plot_anomalies - show anomalies 

    """
    
    rolling_mean = series.rolling(window=window).mean().dropna()
    
    
    if plot_intervals:
        
        upper_lower_bond, upper_upper_bond = segments(series,scale,window)
        lower_lower_bond, lower_upper_bond = segments(series[::-1],scale,window)
        
        fig.add_trace(go.Scatter(x=upper_upper_bond.index, y=upper_upper_bond.BEVALUE, name='Upper Bond',mode='lines',
                         line = dict(color='firebrick', width=1, dash='dot')))
        fig.add_trace(go.Scatter(x=upper_lower_bond.index, y=upper_lower_bond.BEVALUE, name='Lower Bond',mode='lines',
                         line = dict(color='firebrick', width=1, dash='dot')))
        
        if plot_anomalies:
            
            upper_anomalies = detect_anomalies(series, upper_lower_bond, upper_upper_bond, window)
            lower_anomalies = detect_anomalies(series[::-1], lower_lower_bond, lower_upper_bond, window)

            fig.add_trace(go.Scatter(x=upper_anomalies.index, y=upper_anomalies.BEVALUE,
                    mode='markers', name='upper_markers', marker_size=10, marker_color='rgba(152, 0, 0, .8)'))
            fig.add_trace(go.Scatter(x=lower_anomalies.index, y=lower_anomalies.BEVALUE,
                    mode='markers', name='lower_markers', marker_size=10, marker_color='rgba(152, 0, 0, .8)'))
    
    
    plotGraphics(fig, series[window:], check_series['STARTDATE'], check_series['ENDDATE'], check_series['bay/drain'])
    

In [2]:
# for i in prep_models:
#     fig = go.Figure()
#     plotMovingAverage(fig, i.df1, i.df3, 2,plot_intervals=True,plot_anomalies=True)
#     fig.show()

## Как видно, для каких-то графиков находит точки аномалии хорошо, а для каких-то нет, давайте дабавим сглаживания y^t=α⋅yt+(1−α)⋅y^t−1

In [82]:
def plotExponentialSmoothing(fig, series, df3, alpha, window=1, scale=1.96):
    """
        Plots exponential smoothing with different alphas
        
        series - dataset with timestamps
        alphas - list of floats, smoothing parameters
        
    """
    plotMovingAverage(fig, exponential_smoothing(series, alpha), df3, window,plot_intervals=True,plot_anomalies=True)

In [3]:
# for i in prep_models:
#     fig = go.Figure()
#     plotExponentialSmoothing(fig, i.df1.BEVALUE, i.df3, alpha=0.8, window=3)
#     fig.show()

## К сожаленнию, но у саммого обычного сглаживание есть минус в том, что все данные смещаются в сторону, следовательно надо как-то потом определять время, не понятно как
## Давайте добавим возможность менять тренд графиков, чтобы аномалии можно было искать еще проще
## ℓx=αyx+(1−α)(ℓx−1+bx−1)
## bx=β(ℓx−ℓx−1)+(1−β)bx−1
## y^x+1=ℓx+bx

In [85]:
def plotDoubleExponentialSmoothing(fig, series, df3, alpha, beta, window=1, scale=1.96):
    """
        Plots double exponential smoothing with different alphas and betas
        
        series - dataset with timestamps
        alphas - list of floats, smoothing parameters for level
        betas - list of floats, smoothing parameters for trend
    """
    plotMovingAverage(fig, double_exponential_smoothing(series, alpha, beta), df3, window, plot_intervals=True, plot_anomalies=True, scale=scale)

In [4]:
# for i in prep_models:
#     fig = go.Figure()
#     plotDoubleExponentialSmoothing(fig, i.df1.BEVALUE, i.df3, alpha=1, beta=0.3, window=2, scale=8)
#     fig.show()

## Стало в некоторых местах получще

## Теперь можно заняться подборкой параметров и отрисовкой графиков, для понимания, что получилось

In [126]:
def plotMovingAverageWithPredict(fig, series, df3, predict_series):
    
    plotGraphics(fig, series, df3['STARTDATE'], df3['ENDDATE'], df3['bay/drain'])
    
    fig.add_trace(go.Scatter(
        x=predict_series.index,
        y=predict_series.BEVALUE,
        mode='lines',
        name='predict_fuel'
    ))
    

In [124]:
for i in prep_models:
    
    upp = upper_anomalies(i.df1.BEVALUE, 1, 0,  2, 6)
    low = lower_anomalies(i.df1.BEVALUE, 1, 0,  2, 6)
    drain, refueling = segments_drain_fuel(i.df1, i.df3['STARTDATE'], i.df3['ENDDATE'], i.df3['bay/drain'])
    try:
        vec = detect_segment(low,upp,i.df1)
        print(segment_accuracy(vec, refueling))
    except:
        pass

0.9583333333333334
0.41025641025641024
0.6
0.36363636363636365


## Давайте визуализируем, что получилось

In [5]:
# for i in prep_models:
#     fig = go.Figure()
#     upp = upper_anomalies(i.df1.BEVALUE, 1, 0,  2, 6)
#     low = lower_anomalies(i.df1.BEVALUE, 1, 0,  2, 6)
#     drain, refueling = segments_drain_fuel(i.df1, i.df3['STARTDATE'], i.df3['ENDDATE'], i.df3['bay/drain'])
#     try:
#         vec = detect_segment(low,upp,i.df1)
#         fig = go.Figure()
#         plotMovingAverageWithPredict(fig, i.df1, i.df3, vec)
#         fig.show()
#     except:
#         pass


## Как видно, получилось, что давольно хорошо определяет, если поиграться с параметрами можно получить намного более хороший результат