# Exploratory Data Analysis

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Location of the dataset CSV, relative to the notebook's working directory.
OUT_PATH = Path("../data").joinpath("data.csv")
# Names of the categorical feature columns.
CAT_COLS = [
    "Color",
    "Gender",
    "Make",
    "Race",
    "VehicleType",
]

In [None]:
# Load the dataset into memory; every later cell works on this dataframe.
df = pd.read_csv(filepath_or_buffer=OUT_PATH)

In [None]:
# Check the dimensions of the loaded data: (number of rows, number of columns).
df.shape

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Restrict the correlation analysis to the non-object (numeric-like) columns.
df_numeric = df.select_dtypes(exclude=object)
# Pairwise Pearson correlation between those columns; the bare name on the
# last line renders the matrix as the cell output.
corr = df_numeric.corr(method="pearson")
corr

In [None]:
# Draw the numeric-feature correlation matrix as a heatmap and title the axes
# it was drawn on (the trailing semicolon suppresses the text repr in the output).
sns.heatmap(corr).set_title("Correlation plot of numerical features");

In [None]:
# Create one bar plot per column (except "Year") showing which share of all
# rows falls into each distinct value of that column.
for col in df.columns:
    if col == "Year":
        # "Year" gets its own chart, ordered by year instead of by share.
        continue

    # Share of all rows per distinct value, rounded to two decimals and
    # ordered from the rarest to the most frequent value.
    shares = (df[col].value_counts() / df.shape[0]).round(2).sort_values()

    plt.figure()
    ax = shares.plot(kind="bar")
    # Set the title once per figure and name the column, so the otherwise
    # identical-looking charts can be told apart.
    ax.set_title(f"Share of groups: {col}")

    # Annotate every bar with its share.
    for container in ax.containers:
        ax.bar_label(container)

In [None]:
# Bar plot of the number of rows per year, ordered chronologically.
# The bars show raw counts (not shares), so the title says so.
df["Year"].value_counts().sort_index().plot(kind="bar")
plt.title("Number of values per Year");

In [None]:
def dummy_encoder(
    df: pd.DataFrame = None,
    cols: list = None,
) -> pd.DataFrame:
    """One-hot encode the categorical variables within the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe whose categorical variables should be encoded.
    cols : list, optional
        Names of the columns to dummy encode. If None, pandas encodes every
        column with an object, string or category dtype.

    Returns
    -------
    pd.DataFrame
        New dataframe in which each encoded column is replaced by integer
        0/1 indicator columns named ``<column>_<value>``; all other columns
        are kept unchanged.
    """
    # ``columns`` has to be passed by keyword: the second positional
    # parameter of ``pd.get_dummies`` is ``prefix``, so passing ``cols``
    # positionally would rename the dummies instead of selecting columns.
    return pd.get_dummies(df, columns=cols, dtype=int)

In [None]:
# One-hot encode the dataframe; no explicit column list is passed here.
data_dummy = dummy_encoder(df=df)

In [None]:
# Reorder the columns so that "Citation" comes first, followed by all other
# columns in their existing order.
feature_cols = data_dummy.columns.tolist()
feature_cols.remove("Citation")
data_dummy = data_dummy.loc[:, ["Citation", *feature_cols]]

In [None]:
# Dimensions of the encoded data: (number of rows, number of columns).
data_dummy.shape

In [None]:
# Preview the first five rows of the encoded data.
data_dummy.head(n=5)

In [None]:
# Pairwise Pearson correlation across all columns of the encoded data.
# NOTE: this rebinds `corr`, replacing the numeric-only matrix computed earlier.
corr = data_dummy.corr(method="pearson")

In [None]:
# Draw the correlation matrix of all features as a heatmap and title the axes
# it was drawn on (the trailing semicolon suppresses the text repr in the output).
sns.heatmap(corr).set_title("Correlation plot of all features");