# Data Processing Pipeline for beautofuel

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
import geopy.distance

from lib.packages.envirocar import TrackAPI, BboxSelector, TimeSelector, ECConfig
from lib.packages.eda_quality import correction as correct
from lib.packages.eda_quality import manipulation as manipulate
from lib.packages.eda_quality import inspection as inspect
from lib.packages.vehicle_eco_balance import get_interval_time

## Pipeline configuration

In [None]:
import os

# How many days back from now to look for tracks
days_to_look_back = 120

# enviroCar user credentials.
# Environment variables take precedence so that real secrets do not have to be
# stored in the notebook; the literals are only fallbacks that preserve the
# previous behaviour when the variables are not set.
# NOTE(review): the fallback token is committed to the repository — consider
# rotating it and removing the literal once the environment is configured.
config = ECConfig(
    username=os.environ.get("ENVIROCAR_USERNAME", "samko"),
    password=os.environ.get("ENVIROCAR_PASSWORD", "someamazingtotallyrandomtoken"),
)

# NOTE(review): `config` is not passed to TrackAPI() here — confirm whether the
# API client picks it up implicitly or whether it has to be wired in explicitly.
track_api = TrackAPI()

## Tracks fetching

In [None]:
from datetime import timezone

# Bounding box for Slovakia, given as [min_x, min_y, max_x, max_y]
# (presumably longitude/latitude in degrees — TODO confirm against BboxSelector)
bbox = BboxSelector([
    16.76425013529685,   # min_x
    47.37325224412486,   # min_y
    22.594816079401987,  # max_x
    49.7297265173567,    # max_y
])

# Tracks time interval.
# The format string hard-codes a "+00:00" (UTC) offset, so the values that get
# formatted must really be UTC; a naive local `datetime.now()` would be
# mislabelled and shift the query window by the machine's UTC offset.
datetime_format = "%Y-%m-%dT%H:%M:%S+00:00"

# Take one reference instant so both ends of the window are derived from the
# same clock reading. tzinfo is stripped to keep the variables naive datetimes,
# as they were before (their values are now UTC instead of local time).
end_time = datetime.now(timezone.utc).replace(tzinfo=None)
start_time = end_time - timedelta(days=days_to_look_back)
time_interval = TimeSelector(
    start_time=start_time.strftime(datetime_format),
    end_time=end_time.strftime(datetime_format),
)

# Fetch from enviroCar API
tracks_df = track_api.get_tracks(bbox=bbox, time_interval=time_interval)

# Last expression of the cell: renders a quick plot of the fetched tracks
tracks_df.plot()

## Data cleaning and pre-processing

In [None]:
# Drop duplicated rows
tracks_df = correct.drop_duplicates(tracks_df)

# Each of the following helpers returns a 3-tuple; only the second element is
# kept and used as the filtered frame for the next step.

# Remove tracks that exceed 8 hours of duration
_, tracks_df, _ = correct.exceed_eight_hours(tracks_df, flag=False)

# Remove tracks that fall below x minutes of duration
_, tracks_df, _ = correct.below_x_min(tracks_df, x=3, flag=False)

# Remove tracks with an implausible maximum speed
# (250 km/h according to the original note; the threshold lives in the helper)
_, tracks_df, _ = correct.implausible_max_speed(tracks_df, flag=False)

# Drop unit columns since we are not interested in them.
# The returned frame is assigned back so the drop takes effect whether the
# helper mutates in place or returns a new frame; the former mid-cell
# `.head()` call produced no output and discarded the result.
tracks_df = manipulate.drop_unit_columns(tracks_df)

# Unique identifiers of the tracks that survived cleaning
track_ids = tracks_df['track.id'].unique()