# Exploratory Data Analysis

In [None]:
# TODO: check if "Year" needs to be float or can be converted to int --> Done
# TODO: think about how to potentially group values like "Make", "State", etc.
# TODO: think of what columns to drop
# TODO: drop "SERO" for label or maybe (???) transform to citation
# TODO: think of how to sample the data (ATTENTION: year is the year of the car) - not more than 20k observations
# TODO: think of what to do with missing values - if any left
# TODO: think of feature engineering
# TODO: correlations plot
# TODO: distributions plot

In [None]:
from pathlib import Path
from typing import Tuple

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import re, yaml
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import MetaData

In [None]:
# Location of the raw .arff dataset; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = Path("../data/file65ef3a759daf.arff")

## Data Preprocessing Functions

In [None]:
def load_data(path_: Path) -> Tuple[pd.DataFrame, MetaData]:
    """Loads the .arff file (incl. metadata) and decodes byte strings to utf-8 text.

    Parameters
    -------
    path_ : Path
            Path of the .arff file.

    Returns
    -------
    data : pd.DataFrame
            Data as a dataframe. The decoded object-dtype columns come first,
            followed by all remaining columns; within each group the columns
            keep the order they have in the file.
    meta : scipy.io.arff._arffread.MetaData
            Metadata of the dataset.
    """
    # load records and metadata from .arff
    records, meta = loadarff(path_)
    data = pd.DataFrame(records)

    # Object-dtype columns hold byte strings (b"..."); decode them one column
    # at a time. This avoids reshaping the whole frame with stack()/unstack()
    # and also works when the file contains no such column at all.
    text_columns = data.select_dtypes([object]).columns.tolist()
    other_columns = data.select_dtypes(exclude=[object]).columns.tolist()
    for column in text_columns:
        data[column] = data[column].str.decode("utf-8")

    # keep the established layout: decoded columns first, then the rest
    return data[text_columns + other_columns], meta


def change_to_bool(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every column that only holds "Yes"/"No" into a boolean column.

    The dataframe is modified in place and returned.

    Parameters
    -------
    df : pd.DataFrame
            Data to transform.

    Returns
    -------
    df : pd.DataFrame
            The same dataframe with its Yes/No columns mapped to True/False.
    """
    yes_no = {"No", "Yes"}
    as_bool = {"Yes": True, "No": False}

    for name in df.columns:
        observed = set(df[name].unique().tolist())
        # only columns whose distinct values are all "Yes"/"No" qualify
        if observed.issubset(yes_no):
            df[name] = df[name].map(as_bool)

    return df

## Preprocess Data

In [None]:
# Read the raw dataset; `meta` is the metadata object returned next to the frame.
data, meta = load_data(DATA_PATH)

In [None]:
meta

In [None]:
print("Dimensions of the dataset:", data.shape)

In [None]:
data.head()

In [None]:
data["Violation.Type"].value_counts()

In [None]:
data["Race"].value_counts()

In [None]:
# Map pure Yes/No columns to booleans (change_to_bool also modifies `data` in place).
data = change_to_bool(data)
data.head()

Check whether `Year` can be converted to int.

In [None]:
def convert_float_to_int(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Cast a numeric column to int if all of its non-missing values are whole numbers.

    Missing values are replaced by the sentinel ``-1`` before the cast, because a
    plain int column cannot hold NaN. The column is changed in place on ``df``.
    If any non-missing value has a fractional part, a message is printed and the
    column is left untouched.

    Args:
        df (pd.DataFrame): df of which column should be converted
        column_name (str): column name that should be converted to int

    Returns:
        pd.DataFrame: the same ``df`` object that was passed in
    """
    non_missing = df[column_name].dropna()

    # Vectorised whole-number test. Unlike calling ``x.is_integer()`` on every
    # element it also works when the column already holds ints, so the
    # conversion can be re-run safely.
    if (non_missing % 1 == 0).all():
        df[column_name] = df[column_name].fillna(-1).astype(int)
    else:
        print("Can't be converted to int")

    return df

In [None]:
# Assign the result instead of leaving the call as the cell's last expression,
# which would render the whole dataframe as cell output.
data = convert_float_to_int(data, "Year")

In [None]:
data["Year"]

#### Merging categories for feature "Make"

We observe that in the feature "Make" many of the categories (car brands / car manufacturers) are misspelled, or different spellings are used for the same brand. We therefore need to merge these variants so that each brand is represented by a single category.


In [None]:
# Canonical make name -> spellings observed for that make in the raw "Make" column.
# Every make appears exactly once as a key and every spelling exactly once overall,
# so the dictionary can also be inverted into a spelling -> make lookup.
car_variations = {
    "Toyota": ["TOYOTA", "TOYTA", "TOYT", "TOYOYA", "TOTY", "TOYO"],
    "Honda": ["HONDA", "HONNDA", "HNDA", "HON", "HOND", "HONDA 4D"],
    "Hyundai": [
        "HYUNDAI",
        "HUNDAI",
        "HYANDAI",
        "HUNDAII",
        "HYUND",
        "HYUN",
        "HYUNDIA",
        "HYUNVA",
        "HYUANDAI",
        "HYNDIA",
        "HYINDAI",
    ],
    "Nissan": [
        "NISSAN",
        "NISSON",
        "NISAN",
        "NISS",
        "NISSAB",
        "NIIS",
        "NISSAM",
        "NISSN",
        "NISSS",
        "NSSIAN",
        "NISSVAL1996",
        "NISSVAL1999",
    ],
    "Ford": ["FORD"],
    "Freightliner": [
        "FRGHT",
        "FRHT",
        "FRIGHTLINER",
        "FREI",
        "FREIGHT",
        "FREIGHTTK",
        "FREIGHTLINRT",
        "FREIGHTLINER",
    ],
    "Mazda": [
        "MAZDA",
        "MAZAD",
        "MAZA",
        "MADZA",
        "MAD",
        "MADZ",
        "MADZDA",
        "MAZVA",
        "MAZS",
        "MAZDVAL2010",
        "MAZDVA",
        "MAZSA",
        "MAZVAL2010",
    ],
    "Chrysler": [
        "CHRYSLER",
        "CHRY",
        "CHRSYLER",
        "CHRYS",
        "CHYRSLER",
        "CHRUSLER",
        "CHY",
        "CHYR",
        "CHRYSLTER",
    ],
    "Volkswagen": [
        "VOLKSWAGEN",
        "VOLKS",
        "VW",
        "VOLK",
        "VOLKSWAGON",
        "VOLKVAL2012",
        # NOTE(review): VOLVAL2012 could also be a Volvo spelling - confirm
        "VOLVAL2012",
        "VOLKE",
        "VOLKW",
        "VOLKSW",
        "VOLKSWAG",
        "VOLKSWA",
        "VOLKSWAGGON",
        "VOLKSWAGONQ",
        "VWAGON",
        "VOLVSWAGEN",
        "VOLKSWAGOM",
        "VOLKA",
        "VWOLKS",
        "VOLKSWAGO",
        "VOKS",
    ],
    # NOTE(review): KARA, GENEVAL2003 and KYMCO are not obviously spellings of
    # Kia - confirm against the raw data
    "Kia": ["KIA", "KARA", "GENEVAL2003", "KYMCO"],
    # "VOLKS" used to be listed here as well; it is kept under Volkswagen only,
    # so that no spelling is claimed by two makes.
    "Volvo": ["VOLV0", "VOLVO", "VOLVO TK", "VOLVOT", "VOLVO SW"],
    "Chevrolet": [
        "CHEVROLET",
        "CHEVEROLET",
        "CHEV",
        "CHEVY",
        "CEHVY",
        "CHECROLET",
        "CHE V",
        "CHEVOLRET",
        "CHEVRLET",
        "CHEVROLETE",
        "CHEVR",
        "CHEVROLT",
        "CHEVROLER",
        "CHEVROLERT",
        "CHECEVY",
        "CHECHY",
        "CHECY",
        "CEVEROLET",
        "CEVY",
        "CHEVORLET",
        "CHEVTOLET",
        "CHEVY GEO",
    ],
    "Infiniti": ["INFINITI", "INFI", "INFINIT", "INFIITI", "INFINIITI"],
    "Lexus": ["LEXUS", "LEXS", "LEXUS4D", "LEXU", "LEXSUS", "LEVUS", "LRXUS"],
    # NOTE(review): ACCORD looks like a Honda model name rather than an Acura
    # spelling - confirm
    "Acura": ["ACURA", "ACUR", "ACRUA", "ACRURA", "ACCORD"],
    "Dodge": ["DODGE", "DODGEI", "DOSGE", "DDGE"],
    "BMW": ["BMW", "BWM", "BOMW"],
    "Lincoln": ["LINCOLN", "LINOLN", "LINC", "LINCLN"],
    "Jeep": ["JEEP", "JEEK", "JEEF", "JEEPQ", "JEE"],
    "Jaguar": ["JAGUAR", "JAG", "JAGU"],
    "Porsche": ["PORSCHE"],
    "Audi": ["AUDI", "AUD"],
    "Mitsubishi": [
        "MITSUBISHI",
        "MITSUBIHI",
        "MITS",
        "MITSH",
        "MITSUH",
        "MITSUBASHI",
        "MITSU",
        "MITSB",
        "MISTUBISHI",
        "MITSUBISH",
        "MIST",
        "MITSUBISHU",
        "MITSUBISI",
    ],
    "Mercedes": [
        "MERCEDES",
        "MERZ BENZ",
        "MERCEDEZ",
        "MERCEDESBENZ",
        "MERC BENZ",
        "MERCEDS BENZ",
        "MERCADES",
        "MECEDES",
        "MERCVAL2013",
        # NOTE(review): MERCURY (and possibly MERCERY / MERK) look like a
        # different make - confirm they should be folded into Mercedes
        "MERCURY",
        "MERCER",
        "MERCZ",
        "MERZ",
        "MERK",
        "MERECEDES",
        "MERCERY",
    ],
    "SAAB": ["SAAB", "SAA"],
    "Cadillac": ["CADILLAC", "CADI", "CADDI", "CADIALLAC", "CADDILLAC"],
    "Lobo": ["LOBO"],
    "Lamborghini": ["LAMBORGHINI", "LAMBO"],
    "Subaru": ["SUBARU", "SUBA", "SUBUARU"],
    "Buick": ["BUICK", "BRUICK", "BUIK", "BUK"],
    "Lotus": ["LOTUS", "LOTU"],
    "Rolls Royce": ["ROLLS ROYCE"],
    "Tesla": ["TESLA", "TESCA"],
    "Range Rover": ["RANGE ROVER", "RANG ROVER", "RANGEROVER"],
    "Mini": ["MINI", "MINN", "MNNI", "MINICOOP", "MINI COOPER"],
    "Land Rover": ["LNDR"],
    "Plymouth": ["PLYMOUTH", "PLYM"],
}

In [None]:
def create_yaml(dictionary: dict, yaml_name: str, config_dir: str = "../config") -> None:
    """Writes a dictionary to a YAML file inside the config directory.

    Args:
        dictionary (dict): the dictionary that is converted
        yaml_name (str): the filename of the written yaml file
        config_dir (str, optional): directory the file is written to; it is
        created if it does not exist yet. Defaults to "../config".

    Returns:
        None: the file is written as a side effect, nothing is returned
    """
    target_dir = Path(config_dir)
    # make sure the directory exists so that open() below cannot fail on it
    target_dir.mkdir(parents=True, exist_ok=True)

    # Dump straight into the file handle; block style as before
    with open(target_dir / yaml_name, "w", encoding="utf-8") as yaml_file:
        yaml.dump(dictionary, yaml_file, default_flow_style=False)

In [None]:
create_yaml(car_variations, "make_dict.yaml")

In [None]:
def read_yaml(path: Path) -> dict:
    """Parses the yaml file at the given path and returns its content.

    Args:
        path (Path): the path of the respective yaml file

    Returns:
        dict: the parsed content of the yaml file
    """
    # NOTE(review): FullLoader can build more than plain data types; only use
    # this on files you trust (yaml.safe_load would be the stricter choice).
    with open(path, "r") as handle:
        return yaml.load(handle, Loader=yaml.FullLoader)

In [None]:
# Start the cleaned column from the raw "Make" values; "Make" itself stays untouched.
data["Make_clean"] = data["Make"]

In [None]:
# Reset Make_clean from the raw column so the matching at the end of this cell
# starts from the same state every time the cell is run.
data["Make_clean"] = data["Make"]


def clean_string(s: str) -> str:
    """Normalises a string for matching.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, and the remainder is converted to lowercase.

    Args:
        s (string): the string that should be normalised

    Returns:
        string: the normalised string
    """
    return re.sub(r"[^a-zA-Z0-9\s]", "", s).lower()


# Function to find the best match
def replace_with_best_match(
    df: pd.DataFrame, column_name: str, choices: dict, threshold: int = 50
) -> pd.DataFrame:
    """Replaces the values of a column by their best fuzzy match among the keys of ``choices``.

    Each value is normalised with ``clean_string`` and compared against the
    normalised keys of ``choices`` using ``fuzz.token_set_ratio``. Only the keys
    are used as candidates; the lists stored as dictionary values are not
    consulted. The column is changed in place on ``df``.

    Args:
        df (pd.DataFrame): The input DataFrame where one column should
        be replaced with its best match from the choices dictionary.

        column_name (str): The name of column that should be replaced with its best match
        choices (dict): A dictionary whose keys are the candidate names to match against
        threshold (int, optional): Minimum similarity score a candidate must reach
        to replace the value. Defaults to 50.

    Returns:
        pd.DataFrame: the same ``df`` object. Values of ``column_name`` are the
        normalised (lowercased) matching key, or the normalised original value if
        no candidate reaches the threshold. Non-string values and strings shorter
        than three characters are kept exactly as they are.
    """
    # The candidates are the same for every row, so normalise them only once.
    cleaned_choices = [clean_string(choice) for choice in choices]
    # Result per distinct raw value, so repeated spellings are matched only once.
    match_cache = {}

    def get_best_match(value: str) -> str:
        """Finds the best match for a given value within the choices.

        Args:
            value (str): The value to find the best match for.

        Returns:
            str: The best match for the input value.
        """
        # Skip missing values, empty strings and very short strings. Always
        # return the bare value here so the column never mixes in other types.
        if not isinstance(value, str) or len(value) < 3:
            return value

        if value not in match_cache:
            cleaned_value = clean_string(value)
            # Use fuzz.token_set_ratio for better token matching; extractOne
            # yields None when there is nothing to choose from.
            found = (
                process.extractOne(cleaned_value, cleaned_choices, scorer=fuzz.token_set_ratio)
                if cleaned_choices
                else None
            )
            if found is not None and found[1] >= threshold:
                match_cache[value] = found[0]
            else:
                match_cache[value] = cleaned_value

        return match_cache[value]

    df[column_name] = df[column_name].apply(get_best_match)

    return df


# Match every Make_clean value against the make names (the keys of car_variations);
# the column is updated in place.
replace_with_best_match(data, "Make_clean", car_variations)

print(data["Make_clean"])

In [None]:
# Hand-written overrides for abbreviations: raw spelling -> [canonical make].
# NOTE(review): by the time this mapping is applied to Make_clean,
# replace_with_best_match has already lowercased every value of three or more
# characters, so the upper-case keys of that length (ISU, ISUZ, MERZ BENZ, BENZ,
# MBENZ) look like they can never match - confirm whether the mapping should be
# applied to the raw "Make" values first.
make_mapping_hard_coded = {
    "VW": ["volkswagen"],
    "ISU": ["isuzu"],
    "ISUZ": ["isuzu"],
    "MERZ BENZ": ["mercedes"],
    "BENZ": ["mercedes"],
    "MBENZ": ["mercedes"],
    "WV": ["volkswagen"],
}

In [None]:
create_yaml(make_mapping_hard_coded, "make_dict_hard.yaml")

In [None]:
# Apply the hand-written overrides on top of the fuzzy-matched column.
data["Make_clean"] = data["Make_clean"].replace(make_mapping_hard_coded)

In [None]:
def replace_with_hard(data: pd.DataFrame, dict_hard: dict, column: str = "Make") -> pd.DataFrame:
    """Replaces values of a column using a hard-coded mapping.

    Args:
        data (pd.DataFrame): the dataframe whose column should be changed (modified in place)
        dict_hard (dict): mapping from the value to look for to its replacement
        column (str, optional): name of the column to apply the mapping to. Defaults to "Make".

    Returns:
        pd.DataFrame: the same ``data`` object with the mapping applied to ``column``
    """
    # use the mapping that was passed in, not a notebook-level global
    data[column] = data[column].replace(dict_hard)

    return data

In [None]:
def categorize_top_n(
    df: pd.DataFrame,
    column_name: str,
    n: int,
    missing_label: str = "?",
    other_label: str = "Other",
) -> pd.DataFrame:
    """Keep top n classes & missing values and set rest of categories as other.

    The column is changed in place on ``df``.

    Args:
        df (pd.DataFrame): The df used
        column_name (str): The name of the column that should be recategorized
        n (int): Number of most frequent categories to keep
        missing_label (str, optional): Value marking missing entries; it is always
        kept, whatever its frequency. Defaults to "?".
        other_label (str, optional): Label given to every value that is not kept.
        Defaults to "Other".

    Returns:
        pd.DataFrame: returns dataframe with limited number of classes
    """
    column = df[column_name]

    # value_counts() is sorted by frequency, so its first n labels are the top n
    kept_categories = column.value_counts().index[:n].tolist()
    kept_categories.append(missing_label)

    # replace categories that are not kept with the "other" label in one
    # vectorised step
    df[column_name] = column.where(column.isin(kept_categories), other_label)

    return df

In [None]:
# Keep the 20 most frequent makes (plus the "?" placeholder); everything else becomes "Other".
data = categorize_top_n(data, "Make_clean", 20)

In [None]:
# Raises pandas' display limit for the rest of the session (global option).
# NOTE(review): after categorize_top_n(..., 20) the column should hold at most
# 22 distinct values, so the 200-row limits look larger than needed - confirm.
pd.set_option("display.max_rows", 200)

# Frequency of each remaining make category.
data["Make_clean"].value_counts().head(200)