# Exploratory Data Analysis

In [None]:
# TODO: check if "Year" needs to be float or can be converted to int
# TODO: think about how to potentially group values like "Make", "State", etc.
# TODO: think of what columns to drop
# TODO: drop "SERO" for label or maybe (???) transform to citation
# TODO: think of how to sample the data (ATTENTION: year is the year of the car) - not more than 20k observations
# TODO: think of what to do with missing values - if any left
# TODO: think of feature engineering
# TODO: correlations plot
# TODO: distributions plot

In [None]:
from pathlib import Path
from typing import Tuple

import pandas as pd
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import MetaData

In [None]:
# Location of the .arff dataset; the path is relative, so it is resolved
# against the current working directory when the file is opened.
DATA_PATH = Path("../data/file65ef3a759daf.arff")

## Data Preprocessing Functions

In [None]:
def load_data(path_: Path) -> Tuple[pd.DataFrame, MetaData]:
    """Load an .arff file (incl. metadata) and decode its byte strings to utf-8.

    Parameters
    ----------
    path_ : Path
            Path of the data.

    Returns
    -------
    data : pd.DataFrame
            Data as a dataframe. The decoded object (text) columns come
            first, followed by all remaining columns.
    meta : scipy.io.arff._arffread.MetaData
            Metadata of the dataset.
    """
    # load df and metadata from .arff
    records, meta = loadarff(path_)
    data = pd.DataFrame(records)

    # split into the object columns (the ones that need decoding) and the rest
    text_part = data.select_dtypes(include=[object])
    other_part = data.select_dtypes(exclude=[object])

    # remove b string from data, one column at a time; the guard skips the
    # .str accessor entirely when there is nothing to decode, because calling
    # it on an empty, non-string selection raises (e.g. a file that only has
    # numeric attributes)
    if text_part.shape[1] > 0:
        text_part = text_part.apply(lambda column: column.str.decode("utf-8"))

    # same column layout as before: decoded text columns, then the others
    data = pd.concat([text_part, other_part], axis=1)

    return data, meta


def change_to_bool(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every column that only contains "Yes"/"No" into a bool column.

    The dataframe is modified in place and the same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
            Data to transform.

    Returns
    -------
    df : pd.DataFrame
            Transformed data.
    """
    yes_no_to_bool = {"Yes": True, "No": False}

    for name in df.columns:
        observed = set(df[name].unique().tolist())
        # any value other than "Yes"/"No" (including a missing value)
        # leaves the column untouched
        if not observed.issubset(yes_no_to_bool):
            continue
        df[name] = df[name].map(yes_no_to_bool)

    return df

## Preprocess Data

In [None]:
# Read the .arff file at DATA_PATH into a dataframe plus its metadata object.
data, meta = load_data(DATA_PATH)

In [None]:
# Display the metadata object returned by load_data.
meta

In [None]:
# Report the (rows, columns) shape of the loaded dataframe.
print("Dimensions of the dataset:", data.shape)

In [None]:
# Preview the first rows of the loaded dataframe.
data.head()

In [None]:
# Count how often each distinct value occurs in the "Violation.Type" column.
data["Violation.Type"].value_counts()

In [None]:
# Count how often each distinct value occurs in the "Race" column.
data["Race"].value_counts()

In [None]:
# NOTE(review): this is the same expression as an earlier cell in this
# notebook; possibly a different column was intended here - confirm.
data["Violation.Type"].value_counts()

In [None]:
# Convert the columns that only hold "Yes"/"No" into booleans, then preview
# the first rows of the result.
data = change_to_bool(data)
data.head()