# Manual process to clean data

### Imports

In [1]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd

#### <span style="color:green">Plot functions</span>

In [4]:
# Display label (Spanish) for each plottable column
variables_dict = {
    'speed': 'Velocidad',
    'latitude': 'Lat',
    'longitude': 'lng',
    'accPosition': 'Presión del acelerador',
    'accX': 'Aceleración en X',
    'accY': 'Aceleración en Y',
    'accZ': 'Aceleración en Z',
    'magX': 'Fuerza magnética en X',
    'magY': 'Fuerza magnética en Y',
    'magZ': 'Fuerza magnética en Z',
    'velAngX': 'Velocidad angular en X',
    'velAngY': 'Velocidad angular en Y',
    'velAngZ': 'Velocidad angular en Z',
}

# Unit label for each plottable column (same keys as variables_dict)
units_dict = {
    'speed': 'km/h',
    'latitude': '°',
    'longitude': '°',
    'accPosition': '% de presión',
    'accX': 'm/s\u00B2',
    'accY': 'm/s\u00B2',
    'accZ': 'm/s\u00B2',
    'magX': '\u03BC T',
    'magY': '\u03BC T',
    'magZ': '\u03BC T',
    'velAngX': 'rad/seg',
    'velAngY': 'rad/seg',
    'velAngZ': 'rad/seg',
}


def update_layout(fig, chart_title: str, **kwargs):
    """Apply the notebook's shared layout to a plotly figure.

    Args:
        fig: A plotly figure (any object exposing ``update_layout``)
        chart_title (str): Title of the chart, placed at x=0.5
        **kwargs: Optional settings. Recognised keys are ``xaxis_title``
            and ``yaxis_title`` (default ``None``) and ``height``
            (default ``450``). Any other key is ignored.

    Returns:
        fig: the same figure object, after its layout has been updated
    """
    # Built by implicit concatenation: a backslash continuation inside the
    # literal would embed the next line's indentation in the font list.
    font_family = (
        "BlinkMacSystemFont,-apple-system,Segoe UI,Roboto,Oxygen,Ubuntu,"
        "Cantarell,Fira Sans,Droid Sans,Helvetica Neue,Helvetica,Arial,sans-serif"
    )

    fig.update_layout(
        title={'text': chart_title, 'x': 0.5},
        legend_title='Tipo de evento',
        xaxis_title=kwargs.get('xaxis_title'),
        yaxis_title=kwargs.get('yaxis_title'),
        template='plotly_white',
        autosize=True,
        height=kwargs.get('height', 450),
        font=dict(
            family=font_family,
            size=14,
            color="#363636"
        )
    )

    return fig


def line_chart(df, variable):
    """Line chart of one variable over time, split by event class.

    Two traces share the ``timestamp`` axis. The 'Sin evento' trace has the
    samples with ``eventClass == 1`` blanked out, and the 'Near-Crash' trace
    has the samples with ``eventClass == 0`` blanked out. ``df`` itself is
    never modified.

    Args:
        df (DataFrame): The dataframe that contains the data to plot. It
            must have ``timestamp``, ``id`` and ``eventClass`` columns.
        variable (str): The column to plot. A column that is absent from
            ``df`` produces an empty chart. A column without an entry in
            ``variables_dict`` / ``units_dict`` is titled with its own name
            and gets no y-axis title.

    Returns:
        fig: A plotly figure
    """
    # Mask only the plotted column instead of copying the whole frame twice
    if variable in df.columns:
        values = df[variable]
    else:
        values = pd.Series(np.nan, index=df.index)

    # (legend name, colour, values with the *other* event class blanked out)
    traces = (
        ('Sin evento', '#00D1B1', values.mask(df['eventClass'] == 1)),
        ('Near-Crash', '#FF385F', values.mask(df['eventClass'] == 0)),
    )

    fig = go.Figure()
    for trace_name, trace_color, trace_values in traces:
        fig.add_trace(go.Scatter(
            x=df['timestamp'],
            y=trace_values,
            name=trace_name,
            marker_color=trace_color,
            text=df["id"],
            hovertemplate="X = %{x}<br>Y = %{y}<br>ID = %{text}"
        ))

    fig = update_layout(fig, f'Gráfico de linea de la {variables_dict.get(variable, variable)}',
                        xaxis_title='Marca de Tiempo', yaxis_title=units_dict.get(variable))
    fig.update_yaxes(nticks=12)
    fig.update_xaxes(nticks=12, tickangle=45)

    return fig


### Load raw data

In [8]:
# Path of the capture to clean, split only to keep the line short
csv_name = (
    "./data/"
    "smartphone_02-Feb-2022-20-07_frenada repentina_Data-Mv2GkS5qm4edjQ9DFUt.csv"
)
df = pd.read_csv(csv_name)
df.head()

Unnamed: 0,id,accX,accY,accZ,eventClass,idTrip,idVehicle,latitude,longitude,magX,magY,magZ,route,speed,timestamp,velAngX,velAngY,velAngZ
0,34214,-0.927219,2.800484,9.346664,0,10,Nissan March (Xiaomi Redmi Note 9S),0.833369,-77.650155,5.2125,27.093752,20.962502,Frenada repentina,15.384299,2022-02-02 20:07:06.490000-05:00,-0.021571,-0.035818,0.029827
1,34215,0.167198,2.723615,8.980562,0,10,Nissan March (Xiaomi Redmi Note 9S),0.833369,-77.650155,5.11875,26.906252,21.262501,Frenada repentina,15.384299,2022-02-02 20:07:06.538000-05:00,-0.028761,0.026764,0.018375
2,34216,0.227318,3.004472,9.037991,0,10,Nissan March (Xiaomi Redmi Note 9S),0.833369,-77.650155,5.2125,26.681252,21.356251,Frenada repentina,15.384299,2022-02-02 20:07:06.589000-05:00,-0.021305,-0.029827,0.025832
3,34217,-0.554537,2.754422,8.303095,0,10,Nissan March (Xiaomi Redmi Note 9S),0.833369,-77.650155,5.325,26.587502,21.300001,Frenada repentina,15.384299,2022-02-02 20:07:06.638000-05:00,-0.027563,-0.028362,-0.002264
4,34218,0.192323,2.383535,9.022138,0,10,Nissan March (Xiaomi Redmi Note 9S),0.833369,-77.650155,5.38125,26.793751,21.1875,Frenada repentina,15.384299,2022-02-02 20:07:06.689000-05:00,-0.020905,-0.046471,0.001065


## Clean data process

Several issues were identified in the raw data; each section below describes one issue and the step that solves it:

### Show raw data

In [9]:
variables = ["speed", "accPosition", "accX", "accY", "accZ", "velAngX", "velAngY", "velAngZ", "magX", "magY", "magZ"]

# Not every capture has every column (the header of this file has no
# accPosition); skip the missing ones instead of drawing an empty chart.
for var in variables:
    if var not in df.columns:
        continue
    fig = line_chart(df, var)
    fig.show()

RAW DATA


### Eliminar o agregar eventos de near-crash

*Problema:*
Etiquetado incorrecto de un evento de conducción por parte del operador, lo que provoca un desfase o errores en la marcación de un evento de near-crash.

*Solución:*
Primero necesitamos encajar los eventos de *near-crash* donde corresponden, eliminando o agregando en la columna ***eventClass*** un <span style="color:red">"1"</span> si corresponde a un evento o un <span style="color:cyan">"0"</span> si no corresponde a este.

In [None]:
min_ID = 7798  # first id (inclusive) of the near-crash event
max_ID = 7840  # last id (inclusive) of the near-crash event

# NOTE(review): the rows printed by df.head() start at id 34214, far above
# this range - confirm the ids match the loaded file before running.
# Relabel every row: 1 inside [min_ID, max_ID], 0 everywhere else.
df["eventClass"] = df["id"].between(min_ID, max_ID).astype("int64")
print("near crash event")
line_chart(df, "velAngZ")

### Correct offset of kinematic variables

*Issue:*
The acceleration data has an offset because the surface where the devices (smartphone or hybrid) were placed was inclined.

*Solution:*
Remove the offset by subtracting the mean of the variable over the standby id range from every captured value.

In [None]:
standby = [0, 7660]  # inclusive id range used to estimate the offset
var_with_offset = "accY"

in_standby = df["id"].between(standby[0], standby[1])
offset = df.loc[in_standby, var_with_offset].mean()

if pd.isna(offset):
    # The mean of an empty (or all-NaN) selection is NaN; subtracting it
    # would turn the whole column into NaN, so leave the data untouched.
    print(f"no valid samples with id in {standby}; {var_with_offset} left unchanged")
else:
    print("offset correction: ", offset)
    df[var_with_offset] = df[var_with_offset] - offset

line_chart(df, var_with_offset)

### Interpolate some variables

*Issue:*
The speed and accelerator-position features in the hybrid device's dataset are affected by disconnections of the ELM327 scanner.

*Solution:*
Obtain new points by means of the interpolation method.


<div class='alert alert-block alert-info'>
The interpolation allows us to have a general sampling of the data, obtaining for example more points in the speed variable (Only in the case of the Raspberry for speed).
</div>

In [None]:
# Not implemented for the smartphone device
max_zone = 5946
min_zone = 5944

var_to_interpolate = "speed"

# Blank the samples strictly between the two zone ids so they get re-estimated
gap_rows = df["id"].between(min_zone + 1, max_zone - 1)
df.loc[gap_rows, var_to_interpolate] = None

# Assign the result back: interpolate(inplace=True) on df[col] acts on a
# temporary Series under pandas Copy-on-Write and leaves df unchanged.
df[var_to_interpolate] = df[var_to_interpolate].interpolate(method="from_derivatives")
line_chart(df, var_to_interpolate)

### <span style="color:violet">Check final results</span>

In [14]:
variables = ["speed", "accPosition", "accX", "accY", "accZ", "velAngX", "velAngY", "velAngZ", "magX", "magY", "magZ"]

# Plot only the columns present in this capture; a missing column (this
# file's header has no accPosition) would otherwise give an empty chart.
for var in variables:
    if var not in df.columns:
        continue
    fig = line_chart(df, var)
    fig.show()

### Overwrite raw data

In [15]:
# The header shown for this file has neither 'active' nor 'breakPosition',
# so tolerate their absence instead of raising KeyError before the save.
df = df.drop(columns=['active', 'breakPosition'], errors='ignore')
df.to_csv(csv_name, index=False)