In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.experimental import enable_iterative_imputer

In [3]:
def read_data(train_path, test_path):
    """Load the train and test CSV files of a kaggle competition.

    Parameters
    ----------
    train_path : str
        Location of the training CSV; handed to ``pd.read_csv`` unchanged.
    test_path : str
        Location of the test CSV; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    train_df, test_df : pandas DataFrames
        The parsed training and test tables, in that order.
    """
    # Train is parsed first, then test, both with pandas' default CSV options.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return tuple(frames)

In [4]:
def get_duplicates(df, drop_cols=None):
    """Return the rows of a dataframe that repeat an earlier row.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to inspect.
    drop_cols : str or list of str, optional
        Columns left out of the comparison (e.g. an id column). They are
        only ignored while looking for duplicates; the returned rows still
        carry every column of ``df``.

    Returns
    -------
    pandas DataFrame
        The rows of ``df`` flagged by ``DataFrame.duplicated()``, keeping
        their original index labels.
    """
    # Pick the frame whose rows are compared: everything, or everything
    # except the excluded columns.
    compared = df if drop_cols is None else df.drop(columns=drop_cols)
    is_repeat = compared.duplicated()
    return df[is_repeat]

In [5]:
def remove_duplicates(df, drop_cols=None):
    """Return a dataframe with repeated rows filtered out.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to clean. It is not modified.
    drop_cols : str or list of str, optional
        Columns left out of the comparison when deciding whether two rows
        are duplicates. They are still present in the result.

    Returns
    -------
    pandas DataFrame
        A new dataframe holding only the rows not flagged by
        ``DataFrame.duplicated()``, with a fresh 0..n-1 index.
    """
    # Rows are compared on all columns, or on all but the excluded ones.
    compared = df if drop_cols is None else df.drop(columns=drop_cols)
    keep = ~compared.duplicated()

    # Boolean selection builds a new frame, so the caller's df stays intact.
    unique_rows = df[keep]
    return unique_rows.reset_index(drop=True)

In [6]:
def count_missing(df):
    """Summarise the missing values of each column in a dataframe.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to inspect.

    Returns
    -------
    pandas DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name and ordered from most to fewest missing.
        ``count`` is the number of missing entries; ``percent`` is that
        count divided by the number of rows (a 0-1 fraction, not scaled
        by 100).
    """
    # Per-column NaN tally, largest first.
    na_counts = df.isna().sum().sort_values(ascending=False)

    summary = na_counts.to_frame(name="count")
    summary["percent"] = summary["count"] / len(df)

    # Columns with no missing data are left out of the report.
    has_missing = summary["count"] != 0
    return summary[has_missing]