# Exploratory Data Analysis

In [None]:
from pathlib import Path
from typing import Tuple

import pandas as pd
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import MetaData

In [None]:
# Location of the .arff dataset. The path is relative, so it is resolved
# against the current working directory when the file is opened.
DATA_PATH = Path("../data/file65ef3a759daf.arff")

In [None]:
def load_data(path_: Path) -> Tuple[pd.DataFrame, MetaData]:
    """Load an .arff file and decode its byte-string columns to ``str``.

    Parameters
    ----------
    path_ : Path
        Path of the .arff file.

    Returns
    -------
    data : pd.DataFrame
        Data as a dataframe. The decoded object-dtype columns come first
        (in their original relative order), followed by all remaining
        columns.
    meta : scipy.io.arff.MetaData
        Metadata of the dataset, as returned by ``loadarff``.
    """
    # load records and metadata from .arff
    records, meta = loadarff(path_)
    data = pd.DataFrame(records)

    # Object-dtype columns hold the raw byte strings (b"...") that need decoding.
    byte_cols = data.select_dtypes([object])
    if byte_cols.shape[1] == 0:
        # Nothing to decode. Returning early avoids calling the .str accessor
        # on an empty selection, which pandas rejects.
        return data, meta

    # Decode column by column rather than reshaping the whole frame with
    # stack()/unstack(); the per-column result is the same.
    decoded = byte_cols.apply(lambda col: col.str.decode("utf-8"))
    data = pd.concat([decoded, data.select_dtypes(exclude=[object])], axis=1)

    return data, meta

In [None]:
# Read the dataset at DATA_PATH into a dataframe plus its ARFF metadata.
data, meta = load_data(DATA_PATH)

In [None]:
# Show the metadata object returned by load_data (bare expression, so the
# notebook renders its repr as the cell output).
meta

In [None]:
# data.shape is a (rows, columns) tuple.
print("Dimensions of the dataset:", data.shape)

In [None]:
# Preview the first rows of the dataframe.
data.head()

In [None]:
# Number of rows per distinct value of the "Violation.Type" column.
data["Violation.Type"].value_counts()

In [None]:
# Number of rows per distinct value of the "Race" column.
data["Race"].value_counts()

In [None]:
# Number of rows per distinct value of the "Violation.Type" column.
# NOTE(review): this is the same expression as an earlier cell in this
# notebook; confirm whether a different column was intended here.
data["Violation.Type"].value_counts()