### Imports

In [None]:
import numpy as np
import pandas as pd

**NOTE:** Some of the cells below contain the magic command `%%writefile`, which saves the contents of a cell into a specified file. In the notebook provided, all such commands are commented out. Please make sure to follow the steps below after you have completed the subtasks:

1. Uncomment **all** `%%writefile` commands
2. Rerun the entire notebook

This is important because we will check the generated `.py` files.

### Task 1: Missing value analysis

Your task is to implement a function that analyzes missing values in a given dataframe and imputes missing values if necessary. Please use the template `handle_missing_values` for the implementation.

In [None]:
%%writefile ./../solutions/missing.py
import numpy as np
import pandas as pd


def handle_missing_values(
    df: pd.DataFrame, 
    percent_missing_per_row: float = 0.5,
    percent_missing_per_column: float = 0.5,
) -> pd.DataFrame:
    """Handles missing values in a given dataframe according to the instructions below.

    Instructions for handling missing values (the steps below are executed sequentially, one by one):
    0) Remove rows where the percentage of missing values is greater than `percent_missing_per_row`.
    1) Remove columns where the percentage of missing values is greater than `percent_missing_per_column`
       (measured on the rows that survived step 0).
    2) For each categorical (object dtype) column impute missing values with the string "missing".
    3) For each numerical column impute missing values with the corresponding column-wise means
       (computed on the rows that survived step 0).

    NOTE: The indices of the result dataframe are never changed: surviving rows keep their original
    index labels and their original order. The input dataframe itself is not modified.

    Args:
        df: pd.DataFrame, a dataframe for handling missing values.
        percent_missing_per_row: float, the threshold (fraction in [0, 1]) for removing rows.
        percent_missing_per_column: float, the threshold (fraction in [0, 1]) for removing columns.
    Returns:
        pd.DataFrame, the resulting dataframe.
    """
    # 0) Keep only rows whose share of missing cells does not exceed the threshold.
    #    The index is deliberately NOT reset, so surviving rows keep their labels.
    row_missing_share = df.isnull().mean(axis=1)
    result = df.loc[row_missing_share <= percent_missing_per_row]

    # 1) Same rule for columns, evaluated on the rows that are left.
    column_missing_share = result.isnull().mean(axis=0)
    # Explicit copy: the imputation below must never write through to the caller's frame.
    result = result.loc[:, column_missing_share <= percent_missing_per_column].copy()

    # 2) Impute missing values in categorical columns
    for col in result.select_dtypes(include='object'):
        result[col] = result[col].fillna("missing")

    # 3) Impute missing values in numerical columns
    for col in result.select_dtypes(include='number'):
        result[col] = result[col].fillna(result[col].mean())

    return result


In [None]:
# Tests

# Test 1: nothing is missing, so the frame must come back unchanged
df = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7],
    "b": [2, 3, 4, 5, 6, 7, 8],
    "c": ["1", "2", "3", "4", "5", "6", "7"],
    "d": ["2", "3", "4", "5", "6", "7", "8"],
})
expected = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7],
    "b": [2, 3, 4, 5, 6, 7, 8],
    "c": ["1", "2", "3", "4", "5", "6", "7"],
    "d": ["2", "3", "4", "5", "6", "7", "8"],
})
pd.testing.assert_frame_equal(handle_missing_values(df), expected)

# Test 2: the last row has 3 of 4 values missing and is dropped;
# the remaining gaps are imputed (mean for "a", "missing" for "c")
df = pd.DataFrame({
    "a": [np.nan, 2, 3, 4, 5, 6, np.nan],
    "b": [2, 3, 4, 5, 6, 7, np.nan],
    "c": [np.nan, "2", "3", "4", "5", "6", "7"],
    "d": ["2", "3", "4", "5", "6", "7", np.nan],
})
expected = pd.DataFrame({
    "a": [4.0, 2, 3, 4, 5, 6],
    "b": [2.0, 3, 4, 5, 6, 7],
    "c": ["missing", "2", "3", "4", "5", "6"],
    "d": ["2", "3", "4", "5", "6", "7"],
})
pd.testing.assert_frame_equal(handle_missing_values(df), expected)

# Test 3: column "a" has 4 of 7 values missing and is dropped;
# column "c" (3 of 7 missing) stays and is filled with "missing"
df = pd.DataFrame({
    "a": [1, np.nan, 3, np.nan, np.nan, 6, np.nan],
    "b": [2, 3, 4, 5, 6, 7, 8],
    "c": [np.nan, "2", np.nan, "4", np.nan, "6", "7"],
    "d": ["2", "3", "4", "5", "6", "7", "8"],
})
expected = pd.DataFrame({
    "b": [2, 3, 4, 5, 6, 7, 8],
    "c": ["missing", "2", "missing", "4", "missing", "6", "7"],
    "d": ["2", "3", "4", "5", "6", "7", "8"],
})
pd.testing.assert_frame_equal(handle_missing_values(df), expected)

### Task 2: Identifying outliers

Your task is to implement a function that uses Tukey's fences to identify outliers in a given dataframe. Please use the template `identify_outliers` for the implementation.

In [None]:
%%writefile ./../solutions/outliers.py
import numpy as np
import pandas as pd


def identify_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Counts, for every row, in how many numerical columns its value is an outlier.

    A value is an outlier by Tukey's fences when it lies strictly below
    `Q1 - 1.5 * IQR` or strictly above `Q3 + 1.5 * IQR`, where `Q1`/`Q3` are the
    25th/75th percentiles of its column and `IQR = Q3 - Q1`.
    Non-numerical columns are ignored. Missing values are never counted as outliers,
    because comparisons with NaN evaluate to False.

    Args:
        df: pd.DataFrame, a dataframe to analyze. It is not modified.
    Returns:
        pd.DataFrame, a copy of `df` with an additional integer column `outliers_count`
        (an existing column with that name is overwritten).
    """
    result = df.copy()

    # Resolve the columns to scan from the input, BEFORE the counter column exists.
    # Otherwise the counter itself would be treated as a numerical feature and rows
    # with an unusually high count would be counted once more.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    numeric_columns = numeric_columns.drop("outliers_count", errors="ignore")

    counts = np.zeros(len(df), dtype=np.int64)
    for col in numeric_columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (values < lower_bound) | (values > upper_bound)
        counts += is_outlier.to_numpy().astype(np.int64)

    result['outliers_count'] = counts
    return result

In [None]:
# Tests

# Test 1: "a" has two extreme values in its last rows, "b" in its first rows;
# the string column "d" must be ignored
df = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7, 8, 90000, 100000],
    "b": [-100000, -90000, 1, 2, 3, 4, 5, 6, 7, 8],
    "d": list(map(str, range(10))),
})
expected = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7, 8, 90000, 100000],
    "b": [-100000, -90000, 1, 2, 3, 4, 5, 6, 7, 8],
    "d": list(map(str, range(10))),
    "outliers_count": [1, 1, 0, 0, 0, 0, 0, 0, 1, 1]
})
pd.testing.assert_frame_equal(identify_outliers(df), expected)

# Test 2: an extra column "c" adds one more outlier to the first and the last row
df = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7, 8, 90000, 100000],
    "b": [-100000, -90000, 1, 2, 3, 4, 5, 6, 7, 8],
    "c": [-100000, 1, 2, 3, 4, 5, 6, 7, 8, 90000],
    "d": list(map(str, range(10))),
})
expected = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6, 7, 8, 90000, 100000],
    "b": [-100000, -90000, 1, 2, 3, 4, 5, 6, 7, 8],
    "c": [-100000, 1, 2, 3, 4, 5, 6, 7, 8, 90000],
    "d": list(map(str, range(10))),
    "outliers_count": [2, 1, 0, 0, 0, 0, 0, 0, 1, 2]
})
pd.testing.assert_frame_equal(identify_outliers(df), expected)

### Task 3: Remove duplicated observations

Your task is to implement a function that removes duplicated observations from a given dataframe. Please use the template `remove_duplicates` for the implementation.

In [None]:
%%writefile ./../solutions/duplicates.py
import uuid
import numpy as np
import pandas as pd


def remove_duplicates(df: pd.DataFrame, tol: float=1e-5) -> pd.DataFrame:
    """Removes duplicated rows from a given dataframe.

    Two rows are `equal` when every column matches:
      * numerical columns match when |a - b| < tol, when the values are exactly equal,
        or when both values are missing (`NaN == NaN`);
      * all other columns match on exact equality, again with `NaN == NaN`.

    Equality is treated as transitive: if `a == b` and `b == c`, then all three rows
    belong to one group even when `a` and `c` differ by more than `tol`. From every
    group only the row that appears first in the dataframe is kept.

    The surviving rows keep their original values, order and index labels; the input
    dataframe is not modified.

    NOTE: Rows are compared pairwise, so the running time grows quadratically with
    the number of rows.

    Args:
        df: pd.DataFrame, a dataframe for deduplication.
        tol: float, the precision with which numerical values should be compared.
    Returns:
        pd.DataFrame, the resulting dataframe after deduplication.
    """
    n_rows = len(df)
    if n_rows < 2:
        return df.copy()

    # Split columns by position (robust to repeated column names). Booleans are
    # compared exactly, like any other non-numerical column.
    numeric_pos, other_pos = [], []
    for pos, dtype in enumerate(df.dtypes):
        is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
        (numeric_pos if is_number else other_pos).append(pos)

    if numeric_pos:
        numeric = df.iloc[:, numeric_pos].to_numpy(dtype=float, na_value=np.nan)
    else:
        numeric = np.empty((n_rows, 0), dtype=float)
    missing = np.isnan(numeric)

    # Encode every non-numerical column as integer codes; missing values all get
    # the code -1, which gives the required `NaN == NaN` behaviour for free.
    if other_pos:
        codes = np.column_stack([pd.factorize(df.iloc[:, pos])[0] for pos in other_pos])
    else:
        codes = np.empty((n_rows, 0), dtype=np.intp)

    # Union-find over row positions. A root is always the smallest position of its
    # group, so the roots are exactly the rows that have to be kept.
    parent = list(range(n_rows))

    def find(pos: int) -> int:
        while parent[pos] != pos:
            parent[pos] = parent[parent[pos]]  # path halving
            pos = parent[pos]
        return pos

    for i in range(n_rows - 1):
        later = numeric[i + 1:]
        # inf - inf yields NaN; silence the warning, exact equality covers that case.
        with np.errstate(invalid="ignore"):
            within_tol = np.abs(later - numeric[i]) < tol
        same_number = within_tol | (later == numeric[i]) | (missing[i + 1:] & missing[i])
        matches = same_number.all(axis=1) & (codes[i + 1:] == codes[i]).all(axis=1)

        for j in np.flatnonzero(matches) + i + 1:
            root_a, root_b = find(i), find(int(j))
            if root_a != root_b:
                parent[max(root_a, root_b)] = min(root_a, root_b)

    keep = np.fromiter((parent[pos] == pos for pos in range(n_rows)), dtype=bool, count=n_rows)
    return df.loc[keep].copy()

In [None]:
# Tests

# Test 1: an exact duplicate (row 1) is removed, original indices are preserved
df = pd.DataFrame({
    "a": [1, 1, 2, 3],
    "b": [1, 1, 2, 3],
    "c": ["1", "1", "2", "3"],
})
expected = pd.DataFrame({
    "a": [1, 2, 3],
    "b": [1, 2, 3],
    "c": ["1", "2", "3"],
}, index=[0, 2, 3])
pd.testing.assert_frame_equal(remove_duplicates(df), expected)

# Test 2: missing values compare equal to each other
df = pd.DataFrame({
    "a": [np.nan, np.nan, 2, 3],
    "b": [np.nan, np.nan, 2, 3],
    "c": ["1", "1", "2", "3"],
})
expected = pd.DataFrame({
    "a": [np.nan, 2, 3],
    "b": [np.nan, 2, 3],
    "c": ["1", "2", "3"],
}, index=[0, 2, 3])
pd.testing.assert_frame_equal(remove_duplicates(df), expected)

# Test 3: rows 0-2 lie within the tolerance of each other; the kept row retains its exact values
df = pd.DataFrame({
    "a": [1 + 1e-4, 1 - 1e-5, 1, 2],
    "b": [2, 2 + 1e-4, 2 - 1e-5, 3],
    "c": ["1", "1", "1", "3"],
})
expected = pd.DataFrame({
    "a": [1 + 1e-4, 2],
    "b": [2.0, 3.0],
    "c": ["1", "3"],
}, index=[0, 3])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-3), expected)

# Test 4: only rows 1 and 2 are within the tolerance; row 0 is too far away from both
df = pd.DataFrame({
    "a": [1 + 1e-1, 1 + 1e-2 - 1e-5, 1, 2],
    "b": [2 + 1e-1, 2 + 1e-2 - 1e-5, 2, 3],
    "c": ["1", "1", "1", "3"],
})
expected = pd.DataFrame({
    "a": [1 + 1e-1, 1 + 1e-2 - 1e-5, 2],
    "b": [2 + 1e-1, 2 + 1e-2 - 1e-5, 3],
    "c": ["1", "1", "3"],
}, index=[0, 1, 3])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-2), expected)

# Test 5: rows 1 and 2 are both within the tolerance of row 0
df = pd.DataFrame({
    "a": [1, 1 + 1e-3 - 1e-5, 1 - 1e-3 + 1e-5, 2],
    "b": [2, 2 + 1e-3 - 1e-5, 2 - 1e-3 + 1e-5, 3],
    "c": ["1", "1", "1", "1"],
})
expected = pd.DataFrame({
    "a": [1.0, 2.0],
    "b": [2.0, 3.0],
    "c": ["1", "1"],
}, index=[0, 3])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-3), expected)

# Test 6: row 2 is close to row 1 only, but row 1 is close to row 0 (chained equality)
df = pd.DataFrame({
    "a": [1 + 1e-3 - 1e-5, 1, 1 - 1e-3 + 1e-5, 2],
    "b": [2 + 1e-3 - 1e-5, 2, 2 - 1e-3 + 1e-5, 3],
    "c": ["1", "1", "1", "1"],
})
expected = pd.DataFrame({
    "a": [1 + 1e-3 - 1e-5, 2],
    "b": [2 + 1e-3 - 1e-5, 3],
    "c": ["1", "1"],
}, index=[0, 3])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-3), expected)

# Test 7: chained equality with a tolerance that is not a power of ten
df = pd.DataFrame({
    "a": [1e-2, 2e-2, 3e-2],
    "b": [3e-2, 2e-2, 1e-2],
    "c": ["1", "1", "1"],
})
expected = pd.DataFrame({
    "a": [1e-2],
    "b": [3e-2],
    "c": ["1"],
}, index=[0])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-2 + 1e-6), expected)

# Test 8: same as Test 7, but without any non-numerical column
df = pd.DataFrame({
    "a": [1e-2, 2e-2, 3e-2],
    "b": [3e-2, 2e-2, 1e-2],
})
expected = pd.DataFrame({
    "a": [1e-2],
    "b": [3e-2],
}, index=[0])
pd.testing.assert_frame_equal(remove_duplicates(df, tol=1e-2 + 1e-6), expected)

# Test 9: no numerical columns at all
df = pd.DataFrame({
    "a": ["a", "b", "a"],
    "b": ["a", "b", "a"],
})
expected = pd.DataFrame({
    "a": ["a", "b"],
    "b": ["a", "b"],
}, index=[0, 1])
pd.testing.assert_frame_equal(remove_duplicates(df), expected)