### Imports

In [None]:
import pandas as pd

**NOTE:** The first line of each solution code cell below contains the magic command `%%writefile`, which saves the content of the cell into the specified file. Right now, all such commands are commented out. Please make sure to do the following steps after you have completed all the subtasks:

1) Uncomment **all** `%%writefile` commands
2) Re-run the whole notebook

This is important, as we will check the generated `.py` files.

### Task 1: Parse and cast a given attribute

Your task is to implement a function that parses a specific column within a given dataframe. Please use the template `parse_numeric_column` for the implementation.

In [None]:
%%writefile ./../solutions/parsing.py
import math
import numbers
import re
import string

import pandas as pd


# First well-formed, optionally signed decimal number inside a string.
_FIRST_NUMBER = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')


def _extract_number(value) -> float:
    """Convert a single raw cell value to a float.

    Helper for ``parse_numeric_column``; see that function for the rules.
    """
    # Non-integer reals (Python / NumPy floats) are used directly. Sending them
    # through ``str`` is lossy for scientific notation: ``str(1e-05)`` is
    # '1e-05', which the character filter below would turn into '1-05'.
    if isinstance(value, numbers.Real) and not isinstance(value, numbers.Integral):
        number = float(value)
        # NaN / infinity carry no digits, so they follow the "no number -> 0" rule.
        return number if math.isfinite(number) else 0.0

    # Keep only the characters that can be part of a plain decimal number.
    cleaned = ''.join(c for c in str(value) if c.isdigit() or c in '.-')
    if not cleaned:
        return 0.0
    try:
        return float(cleaned)
    except ValueError:
        # Leftovers such as '1.2.3', '12-3' or a lone '-' are not valid floats:
        # fall back to the first well-formed number, or 0 if there is none.
        match = _FIRST_NUMBER.search(cleaned)
        return float(match.group()) if match else 0.0


def parse_numeric_column(df: pd.DataFrame, target_columns: str) -> pd.DataFrame:
    """Strip non-numeric noise from one column and cast it to ``float32``.

    Every value is reduced to its digits, dots and minus signs (for example
    ``"dav1.4143"`` becomes ``1.4143``). Values without any digits become ``0``.
    Float values are taken as-is, and strings whose remaining characters do not
    form a valid number fall back to the first well-formed number they contain
    instead of raising.

    Args:
        df: Frame containing the column to parse. It is modified in place.
        target_columns: Name of the column to parse.

    Returns:
        The same ``df`` object, with ``target_columns`` stored as ``float32``.

    Raises:
        KeyError: If ``target_columns`` is not a column of ``df``.
    """
    df[target_columns] = df[target_columns].apply(_extract_number).astype('float32')
    return df

In [None]:
# Tests

# Test: noisy string values are reduced to their numeric part and cast to float32
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": [1, 2, "aSf23qf", "dav1.4143", "23.23dVw", "2.42", 3.1, "f233f"],
})

result_df = parse_numeric_column(df, "b")
expected_df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": [1, 2, 23, 1.4143, 23.23, 2.42, 3.1, 233],
}).astype({"b": "float32"})

pd.testing.assert_frame_equal(result_df, expected_df)

### Task 2: Encoding categorical attributes

Your task is to implement a function that encodes categorical attributes in a given dataframe according to given parameters. Please use the template `encode_categorical_attributes` for the implementation.

In [None]:
%%writefile ./../solutions/encoding.py
from typing import Dict, Union
from dataclasses import dataclass
import pandas as pd

@dataclass
class LabelEncodingParameters:
    """Settings for label-encoding one categorical column."""
    # Defines a mapping that should be used for label encoding
    # (category value -> integer label).
    mapping: Dict[str, int]
    # When True, the original column is dropped after the encoded column is added.
    remove_original_attribute: bool = False


@dataclass
class OneHotEncodingParameters:
    """Settings for one-hot encoding one categorical column."""
    # The prefix that should be used for one-hot encoding so that resulting column names are in the following format:
    # {prefix}_{value}
    prefix: str
    # When True, the original column is dropped after the one-hot columns are added.
    remove_original_attribute: bool = False


def encode_categorical_attributes(df: pd.DataFrame, parameters: Dict[str, Union[LabelEncodingParameters, OneHotEncodingParameters]]) -> pd.DataFrame:
    """Encode categorical columns of ``df`` as described by ``parameters``.

    For each ``column -> settings`` entry:

    * ``LabelEncodingParameters`` adds a ``{column}_label_encoding`` column
      obtained by mapping the values through ``settings.mapping``.
    * ``OneHotEncodingParameters`` adds one integer 0/1 column per distinct
      value, named ``{settings.prefix}_{value}``.

    In both cases the original column is dropped when
    ``settings.remove_original_attribute`` is set. Entries of any other type
    are ignored.

    Args:
        df: Input frame. It is left untouched; all changes are made on a copy.
        parameters: Encoding settings keyed by the name of the column to encode.

    Returns:
        A new frame with the encoded columns added (and originals removed
        where requested).

    Raises:
        KeyError: If a key of ``parameters`` is not a column of ``df``.
    """
    # Work on a copy: assigning the label-encoded column directly on ``df``
    # would add a column to the caller's frame, whereas the one-hot and drop
    # steps return new objects. Copying makes the function side-effect free
    # for every parameter combination.
    encoded = df.copy()
    for column, param in parameters.items():
        if isinstance(param, LabelEncodingParameters):
            encoded[column + '_label_encoding'] = encoded[column].map(param.mapping)
        elif isinstance(param, OneHotEncodingParameters):
            # Cast to int so the indicator columns hold 0/1 rather than booleans.
            one_hot = pd.get_dummies(encoded[column], prefix=param.prefix).astype(int)
            encoded = pd.concat([encoded, one_hot], axis=1)
        else:
            # Unsupported settings objects are skipped.
            continue
        if param.remove_original_attribute:
            encoded = encoded.drop(column, axis=1)
    return encoded

In [None]:
# Tests

def check(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    """Assert that two frames are equal, ignoring the order of their columns.

    Args:
        df1: First frame (typically the actual result).
        df2: Second frame (typically the expected result).

    Raises:
        AssertionError: If the column sets differ, or if the frames differ
            once both have their columns sorted by name.
    """
    assert set(df1.columns) == set(df2.columns), (
        f"column mismatch: {sorted(df1.columns)} != {sorted(df2.columns)}"
    )
    # Sort the columns on both sides so only names and contents are compared.
    pd.testing.assert_frame_equal(df1[sorted(df1.columns)], df2[sorted(df2.columns)])


# Test 1: label encoding, original column kept
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
})

check(
    encode_categorical_attributes(
        df,
        {"b": LabelEncodingParameters(mapping={"a": 3, "b": 2, "c": 1}, remove_original_attribute=False)},
    ),
    pd.DataFrame({
        "a": list(range(1, 9)),
        "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
        "b_label_encoding": [3, 2, 1, 3, 2, 1, 3, 2],
    }),
)

# Test 2: label encoding, original column removed
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
})

check(
    encode_categorical_attributes(
        df,
        {"b": LabelEncodingParameters(mapping={"a": 3, "b": 2, "c": 1}, remove_original_attribute=True)},
    ),
    pd.DataFrame({
        "a": list(range(1, 9)),
        "b_label_encoding": [3, 2, 1, 3, 2, 1, 3, 2],
    }),
)

# Test 3: one-hot encoding, original column kept
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
})

check(
    encode_categorical_attributes(
        df,
        {"b": OneHotEncodingParameters(prefix="b_one_hot", remove_original_attribute=False)},
    ),
    pd.DataFrame({
        "a": list(range(1, 9)),
        "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
        "b_one_hot_a": [1, 0, 0, 1, 0, 0, 1, 0],
        "b_one_hot_b": [0, 1, 0, 0, 1, 0, 0, 1],
        "b_one_hot_c": [0, 0, 1, 0, 0, 1, 0, 0],
    }),
)

# Test 4: one-hot encoding, original column removed
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
})

check(
    encode_categorical_attributes(
        df,
        {"b": OneHotEncodingParameters(prefix="b_one_hot", remove_original_attribute=True)},
    ),
    pd.DataFrame({
        "a": list(range(1, 9)),
        "b_one_hot_a": [1, 0, 0, 1, 0, 0, 1, 0],
        "b_one_hot_b": [0, 1, 0, 0, 1, 0, 0, 1],
        "b_one_hot_c": [0, 0, 1, 0, 0, 1, 0, 0],
    }),
)

# Test 5: label encoding and one-hot encoding combined, both originals removed
df = pd.DataFrame({
    "a": list(range(1, 9)),
    "b": ["a", "b", "c", "a", "b", "c", "a", "b"],
    "c": ["a", "b", "c", "a", "b", "c", "a", "b"],
})

check(
    encode_categorical_attributes(
        df,
        {
            "b": LabelEncodingParameters(mapping={"a": 3, "b": 2, "c": 1}, remove_original_attribute=True),
            "c": OneHotEncodingParameters(prefix="c_one_hot", remove_original_attribute=True),
        },
    ),
    pd.DataFrame({
        "a": list(range(1, 9)),
        "b_label_encoding": [3, 2, 1, 3, 2, 1, 3, 2],
        "c_one_hot_a": [1, 0, 0, 1, 0, 0, 1, 0],
        "c_one_hot_b": [0, 1, 0, 0, 1, 0, 0, 1],
        "c_one_hot_c": [0, 0, 1, 0, 0, 1, 0, 0],
    }),
)