# Exploratory Data Analysis

In [None]:
from pathlib import Path
from typing import Tuple

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import re, yaml
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import MetaData

In [None]:
# TODO: move all functions on top
# TODO: remove all unnecessary parts that are meant for EDA

In [None]:
# Relative path of the raw .arff dataset that load_data() reads.
DATA_PATH = Path("../data/file65ef3a759daf.arff")

## Data Preprocessing Functions

In [None]:
def load_data(path_: Path) -> Tuple[pd.DataFrame, MetaData]:
    """Load an .arff file (incl. metadata) and decode its byte strings to utf-8.

    The object-dtype columns produced by ``loadarff`` hold ``bytes`` values;
    they are decoded to ``str`` so the values no longer carry a ``b'...'``
    prefix.

    Parameters
    -------
    path_ : Path
            Path of the data.

    Returns
    -------
    data : pd.DataFrame
            Data as a dataframe. The decoded object-dtype columns come first,
            followed by all remaining columns.
    meta : scipy.io.arff._arffread.MetaData
            Metadata of the dataset.
    """
    # load the records and metadata from .arff
    records, meta = loadarff(path_)
    data = pd.DataFrame(records)

    text_cols = data.select_dtypes([object])
    other_cols = data.select_dtypes(exclude=[object])

    # Decode column by column rather than through a stack()/unstack() round
    # trip, so a file without string columns (or without rows) still yields a
    # well-formed frame.
    decoded = pd.DataFrame(
        {name: text_cols[name].str.decode("utf-8") for name in text_cols.columns},
        index=data.index,
    )

    # Same layout as before: decoded text columns first, then the rest.
    data = pd.concat([decoded, other_cols], axis=1)

    return data, meta

## Preprocess Data

In [None]:
# Load the dataset and its ARFF metadata from DATA_PATH.
data, meta = load_data(DATA_PATH)

In [None]:
# Display the ARFF metadata object returned by load_data().
meta

In [None]:
# data.shape is (number of rows, number of columns).
print("Dimensions of the dataset:", data.shape)

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Number of rows per distinct "Violation.Type" value.
data["Violation.Type"].value_counts()

In [None]:
# Number of rows per distinct "Race" value.
data["Race"].value_counts()

In [None]:
# Number of rows per distinct "Color" value (before any grouping).
data["Color"].value_counts()

In [None]:
# Number of rows per distinct "VehicleType" value (before any grouping).
data["VehicleType"].value_counts()

#### Merging categories for feature "Make"

We observe that many categories of the feature "Make" (car brands/manufacturers) are misspelled, or that different spellings are used for the same brand. Therefore, we need to merge these similar spellings so that they map to the same category.


In [None]:
# Canonical make -> spellings, abbreviations and typos to be merged into it.
# The literal used to repeat the keys "Porsche" and "Audi" (only the last
# occurrence of a repeated key takes effect) and listed many spellings more than
# once. Each key and each spelling now appears once; key order and the effective
# values are unchanged.
car_variations = {
    "Toyota": ["TOYOTA", "TOYTA", "TOYT", "TOYOYA", "TOTY", "TOYO"],
    "Honda": ["HONDA", "HONNDA", "HNDA", "HON", "HOND", "HONDA 4D"],
    "Hyundai": [
        "HYUNDAI",
        "HUNDAI",
        "HYANDAI",
        "HUNDAII",
        "HYUND",
        "HYUN",
        "HYUNDIA",
        "HYUNVA",
        "HYUANDAI",
        "HYNDIA",
        "HYINDAI",
    ],
    "Nissan": [
        "NISSAN",
        "NISSON",
        "NISAN",
        "NISS",
        "NISSAB",
        "NIIS",
        "NISSAM",
        "NISSN",
        "NISSS",
        "NSSIAN",
        "NISSVAL1996",
        "NISSVAL1999",
    ],
    "Ford": ["FORD"],
    "Freightliner": [
        "FRGHT",
        "FRHT",
        "FRIGHTLINER",
        "FREI",
        "FREIGHT",
        "FREIGHTTK",
        "FREIGHTLINRT",
        "FREIGHTLINER",
    ],
    "Mazda": [
        "MAZDA",
        "MAZAD",
        "MAZA",
        "MADZA",
        "MAD",
        "MADZ",
        "MADZDA",
        "MAZVA",
        "MAZS",
        "MAZDVAL2010",
        "MAZDVA",
        "MAZSA",
        "MAZVAL2010",
    ],
    "Chrysler": [
        "CHRYSLER",
        "CHRY",
        "CHRSYLER",
        "CHRYS",
        "CHYRSLER",
        "CHRUSLER",
        "CHY",
        "CHYR",
        "CHRYSLTER",
    ],
    "Volkswagen": [
        "VOLKSWAGEN",
        "VOLKS",
        "VW",
        "VOLK",
        "VOLKSWAGON",
        "VOLKVAL2012",
        "VOLVAL2012",
        "VOLKE",
        "VOLKW",
        "VOLKSW",
        "VOLKSWAG",
        "VOLKSWA",
        "VOLKSWAGGON",
        "VOLKSWAGONQ",
        "VWAGON",
        "VOLVSWAGEN",
        "VOLKSWAGOM",
        "VOLKA",
        "VWOLKS",
        "VOLKSWAGO",
        "VOKS",
    ],
    "Kia": ["KIA", "KARA", "GENEVAL2003", "KYMCO"],
    # NOTE(review): "VOLKS" is also listed under "Volkswagen" - confirm which
    # make it should belong to.
    "Volvo": ["VOLV0", "VOLKS", "VOLVO", "VOLVO TK", "VOLVOT", "VOLVO SW"],
    "Chevrolet": [
        "CHEVROLET",
        "CHEVEROLET",
        "CHEV",
        "CHEVY",
        "CEHVY",
        "CHECROLET",
        "CHE V",
        "CHEVOLRET",
        "CHEVRLET",
        "CHEVROLETE",
        "CHEVR",
        "CHEVROLT",
        "CHEVROLER",
        "CHEVROLERT",
        "CHECEVY",
        "CHECHY",
        "CHECY",
        "CEVEROLET",
        "CEVY",
        "CHEVORLET",
        "CHEVTOLET",
        "CHEVY GEO",
    ],
    "Infiniti": ["INFINITI", "INFI", "INFINIT", "INFIITI", "INFINIITI"],
    "Lexus": ["LEXUS", "LEXS", "LEXUS4D", "LEXU", "LEXSUS", "LEVUS", "LRXUS"],
    # NOTE(review): "ACCORD" looks like a model name rather than a spelling of
    # Acura - confirm.
    "Acura": ["ACURA", "ACUR", "ACRUA", "ACRURA", "ACCORD"],
    "Dodge": ["DODGE", "DODGEI", "DOSGE", "DDGE"],
    "BMW": ["BMW", "BWM", "BOMW"],
    "Lincoln": ["LINCOLN", "LINOLN", "LINC", "LINCLN"],
    "Jeep": ["JEEP", "JEEK", "JEEF", "JEEPQ", "JEE"],
    "Jaguar": ["JAGUAR", "JAG", "JAGU"],
    "Porsche": ["PORSCHE"],
    "Audi": ["AUDI", "AUD"],
    "Mitsubishi": [
        "MITSUBISHI",
        "MITSUBIHI",
        "MITS",
        "MITSH",
        "MITSUH",
        "MITSUBASHI",
        "MITSU",
        "MITSB",
        "MISTUBISHI",
        "MITSUBISH",
        "MIST",
        "MITSUBISHU",
        "MITSUBISI",
    ],
    # NOTE(review): "MERCURY" may be a separate make rather than a misspelling
    # of Mercedes - confirm.
    "Mercedes": [
        "MERCEDES",
        "MERZ BENZ",
        "MERCEDEZ",
        "MERCEDESBENZ",
        "MERC BENZ",
        "MERCEDS BENZ",
        "MERCADES",
        "MECEDES",
        "MERCVAL2013",
        "MERCURY",
        "MERCER",
        "MERCZ",
        "MERZ",
        "MERK",
        "MERECEDES",
        "MERCERY",
    ],
    "SAAB": ["SAAB", "SAA"],
    "Cadillac": ["CADILLAC", "CADI", "CADDI", "CADIALLAC", "CADDILLAC"],
    "Lobo": ["LOBO"],
    "Lamborghini": ["LAMBORGHINI", "LAMBO"],
    "Subaru": ["SUBARU", "SUBA", "SUBUARU"],
    "Buick": ["BUICK", "BRUICK", "BUIK", "BUK"],
    "Lotus": ["LOTUS", "LOTU"],
    "Rolls Royce": ["ROLLS ROYCE"],
    "Tesla": ["TESLA", "TESCA"],
    "Range Rover": ["RANGE ROVER", "RANG ROVER", "RANGEROVER"],
    "Mini": ["MINI", "MINN", "MNNI", "MINICOOP", "MINI COOPER"],
    "Land Rover": ["LNDR"],
    "Plymouth": ["PLYMOUTH", "PLYM"],
}

In [None]:
def create_yaml(dictionary: dict, yaml_name: str, config_dir: Path = Path("../config")) -> None:
    """Write a dictionary to a YAML file inside ``config_dir``.

    Args:
        dictionary (dict): The dictionary that is converted.
        yaml_name (str): The filename of the YAML file to write.
        config_dir (Path, optional): Directory the file is written to.
            Defaults to "../config", the location used so far.

    Returns:
        None: The file is written as a side effect; nothing is returned.

    Raises:
        OSError: If the target file cannot be opened for writing, e.g. because
            ``config_dir`` does not exist.
    """
    target = Path(config_dir) / yaml_name
    # Dump straight into the file in block style (default_flow_style=False).
    with open(target, "w", encoding="utf-8") as yaml_file:
        yaml.dump(dictionary, yaml_file, default_flow_style=False)

In [None]:
# Persist the make-variation mapping as a YAML config file named "make_dict.yaml".
create_yaml(car_variations, "make_dict.yaml")

In [None]:
def read_yaml(path: Path) -> dict:
    """Load a YAML file and return its parsed content.

    Parameters
    -------
    path : Path
            Location of the YAML file to read.

    Returns
    -------
    loaded : dict
            Content of the YAML file as parsed by ``yaml.load``.
    """
    with open(path, "r") as handle:
        # NOTE(review): yaml.load with FullLoader is kept as is; if these files
        # can ever come from an untrusted source, yaml.safe_load is the safer choice.
        return yaml.load(handle, Loader=yaml.FullLoader)

In [None]:
# Keep the raw "Make" column untouched and do all cleaning on "Make_clean".
data["Make_clean"] = data["Make"]

In [None]:
# (Re)initialise "Make_clean" from the raw "Make" column before fuzzy matching.
data["Make_clean"] = data["Make"]


def clean_string(s: str) -> str:
    """Normalise a string before it is compared with other strings.

    Args:
        s (str): The string to normalise.

    Returns:
        str: ``s`` in lowercase, with every character that is not an ASCII
        letter, a digit or whitespace removed.
    """
    # Drop everything except letters, digits and whitespace, then lowercase.
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", s)
    return stripped.lower()


# Function to find the best match
def replace_with_best_match(
    df: pd.DataFrame, column_name: str, choices: dict, threshold: int = 50
) -> pd.DataFrame:
    """Replace each value of a column with its best fuzzy match among ``choices``.

    Args:
        df (pd.DataFrame): The input DataFrame where one column should
        be replaced with its best match from the choices dictionary.
        The column is overwritten in place.

        column_name (str): The name of the column that should be replaced with its best match
        choices (dict): A dictionary whose keys are the candidates a value is matched against.
        threshold (int, optional): The minimum similarity score a candidate
        needs in order to replace the value. Defaults to 50.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``column_name`` replaced.
    """
    # The candidates do not depend on the row, so clean them once up front.
    # NOTE(review): iterating ``choices`` yields only its keys; the variation
    # lists stored as values are never compared against - confirm this is intended.
    cleaned_choices = [clean_string(choice) for choice in choices]

    def get_best_match(value: object) -> object:
        """Find the best match for a given value within the choices.

        Args:
            value: The value to find the best match for.

        Returns:
            The cleaned best-matching candidate if its score reaches the
            threshold, otherwise the cleaned input value. Non-string, empty
            and very short (< 3 characters) values are returned unchanged.
        """
        # Return the bare value (not a ``(value, score)`` tuple) so the column
        # keeps holding scalars; non-strings such as NaN are passed through.
        if not isinstance(value, str) or len(value) < 3:
            return value

        # Clean the input value
        value = clean_string(value)
        if not cleaned_choices:
            # Nothing to match against: treat it like a below-threshold match.
            return value

        # Use fuzz.token_set_ratio for better token matching
        best_match, score = process.extractOne(value, cleaned_choices, scorer=fuzz.token_set_ratio)

        return best_match if score >= threshold else value

    df[column_name] = df[column_name].apply(get_best_match)

    return df


# Fuzzy-match every "Make_clean" value against the makes in car_variations.
# The function overwrites the column on `data` itself, so the returned frame
# does not need to be reassigned.
replace_with_best_match(data, "Make_clean", car_variations)

print(data["Make_clean"])

In [None]:
# Hand-written overrides for make abbreviations.
# NOTE(review): every value is a one-element list, whereas the other replace()
# mappings in this notebook (color_groups, vehicle_type_groups) map to plain
# strings - confirm that Series.replace() yields the intended scalar here.
# NOTE(review): replace_with_best_match lowercases values of 3+ characters before
# this mapping is applied, so upper-case keys such as "ISUZ" or "BENZ" may never
# match - TODO confirm.
make_mapping_hard_coded = {
    "VW": ["volkswagen"],
    "ISU": ["isuzu"],
    "ISUZ": ["isuzu"],
    "MERZ BENZ": ["mercedes"],
    "BENZ": ["mercedes"],
    "MBENZ": ["mercedes"],
    "WV": ["volkswagen"],
}

In [None]:
# Persist the hard-coded overrides as a YAML config file named "make_dict_hard.yaml".
create_yaml(make_mapping_hard_coded, "make_dict_hard.yaml")

In [None]:
# Apply the hard-coded overrides: values equal to a key are replaced, all others are kept.
data["Make_clean"] = data["Make_clean"].replace(make_mapping_hard_coded)

In [None]:
def replace_with_hard(data: pd.DataFrame, dict_hard: dict, column: str = "Make") -> pd.DataFrame:
    """Recode one column of a dataframe using a fixed lookup dictionary.

    Args:
        data (pd.DataFrame): The dataframe to recode; the column is overwritten on this object.
        dict_hard (dict): Mapping passed to ``Series.replace`` for the column.
        column (str, optional): The column that is recoded. Defaults to "Make".

    Returns:
        pd.DataFrame: The same dataframe object with the recoded column.
    """
    recoded = data[column].replace(dict_hard)
    data[column] = recoded
    return data

In [None]:
def categorize_top_n(
    df: pd.DataFrame,
    column_name: str,
    n: int,
    *,
    missing_label: str = "?",
    other_label: str = "Other",
) -> pd.DataFrame:
    """Keep top n classes & missing values and set rest of categories as other.

    Args:
        df (pd.DataFrame): The df used; the column is overwritten on this object.
        column_name (str): The name of the column that should be recategorized
        n (int): Number of most frequent categories to keep.
        missing_label (str, optional): Marker for missing values, which is
            always kept. Defaults to "?".
        other_label (str, optional): Label given to every category that is
            not kept. Defaults to "Other".

    Returns:
        pd.DataFrame: returns dataframe with limited number of classes
    """
    # value_counts() sorts by descending frequency, so the first n index
    # entries are the n most common categories.
    value_counts = df[column_name].value_counts()

    # A set keeps the per-row membership test cheap, whatever n is.
    keep = set(value_counts.index[:n])
    keep.add(missing_label)

    # replace categories not kept with the "other" label
    df[column_name] = df[column_name].apply(lambda x: x if x in keep else other_label)

    return df

In [None]:
# Keep the 20 most frequent makes (plus the "?" marker) and label everything else "Other".
data = categorize_top_n(data, "Make_clean", 20)

In [None]:
# Raise the pandas row display limit so long count tables are not truncated.
pd.set_option("display.max_rows", 200)

# Show up to 200 of the most frequent "Make_clean" values.
data["Make_clean"].value_counts().head(200)

### Grouping color column

In [None]:
# Maps each raw "Color" value to a coarser colour family.
# The "?" marker is mapped to itself.
color_groups = {
    "BLACK": "DARK COLORS",
    "SILVER": "LIGHT COLORS",
    "WHITE": "LIGHT COLORS",
    "GRAY": "DARK COLORS",
    "RED": "BOLD COLORS",
    "BLUE": "LIGHT COLORS",
    "GREEN": "BOLD COLORS",
    "GOLD": "BOLD COLORS",
    "BLUE DARK": "DARK COLORS",
    "TAN": "NEUTRAL COLORS",
    "MAROON": "BOLD COLORS",
    "BLUE LIGHT": "LIGHT COLORS",
    "?": "?",
    "BEIGE": "NEUTRAL COLORS",
    "GREEN DK": "BOLD COLORS",
    "GREEN LGT": "BOLD COLORS",
    "BROWN": "NEUTRAL COLORS",
    "YELLOW": "BOLD COLORS",
    "ORANGE": "BOLD COLORS",
    "BRONZE": "BOLD COLORS",
    "PURPLE": "BOLD COLORS",
    "MULTICOLOR": "UNIQUE",
    "CREAM": "UNIQUE",
    "COPPER": "UNIQUE",
    "PINK": "BOLD COLORS",
    "CHROME": "UNIQUE",
    "CAMOUFLAGE": "UNIQUE",
}

In [None]:
# data["Color"] = data["Color"].replace(color_groups)

In [None]:
# need to fix
# Recode "Color" into the colour families defined in color_groups.
data = replace_with_hard(data, dict_hard=color_groups, column="Color")

In [None]:
# Number of rows per "Color" value after grouping.
data["Color"].value_counts()

### Grouping VehicleType column

In [None]:
# Maps each raw "VehicleType" code to a coarser vehicle category.
# "29 - Unknown" is mapped to the "?" marker.
vehicle_type_groups = {
    "02 - Automobile": "Standard Vehicles",
    "03 - Station Wagon": "Standard Vehicles",
    "04 - Limousine": "Standard Vehicles",
    "01 - Motorcycle": "Motorcycles and Bikes",
    "19 - Moped": "Motorcycles and Bikes",
    # NOTE(review): the other trailer types are grouped elsewhere
    # ("25 - Utility Trailer" -> "Other", "21 - Tandem Trailer" -> special
    # purpose) - confirm that a boat trailer belongs with motorcycles.
    "26 - Boat Trailer": "Motorcycles and Bikes",
    "05 - Light Duty Truck": "Light and Heavy Duty Trucks",
    "06 - Heavy Duty Truck": "Light and Heavy Duty Trucks",
    "07 - Truck/Road Tractor": "Light and Heavy Duty Trucks",
    "08 - Recreational Vehicle": "Special Purpose and Recreational Vehicles",
    "10 - Transit Bus": "Special Purpose and Recreational Vehicles",
    "12 - School Bus": "Special Purpose and Recreational Vehicles",
    "14 - Ambulance(Non-Emerg)": "Special Purpose and Recreational Vehicles",
    "20 - Commercial Rig": "Special Purpose and Recreational Vehicles",
    "21 - Tandem Trailer": "Special Purpose and Recreational Vehicles",
    "11 - Cross Country Bus": "Special Purpose and Recreational Vehicles",
    "09 - Farm Vehicle": "Special Purpose and Recreational Vehicles",
    "28 - Other": "Other",
    "29 - Unknown": "?",
    "27 - Farm Equipment": "Other",
    "25 - Utility Trailer": "Other",
    "24 - Camper": "Other",
}

In [None]:
# Recode "VehicleType" into the categories defined in vehicle_type_groups.
data = replace_with_hard(data, dict_hard=vehicle_type_groups, column="VehicleType")

In [None]:
# Number of rows per "VehicleType" value after grouping.
data["VehicleType"].value_counts()