In [None]:
# stepid = standardize_columns

# Normalise the column labels: lower-case them, then turn spaces into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

In [None]:
# stepid = rename_cols

# Rename columns according to a mapping.
# input_mapper -> dict of {"old_name": "new_name", ...}
df = df.rename(input_mapper, axis="columns")

In [None]:
# stepid = drop_duplicates

# Discard rows that fully repeat an earlier row; the first occurrence is kept.
df = df.drop_duplicates(keep="first")

In [None]:
# stepid = drop_cols

# Remove the listed columns.
# input_list -> list of column labels, e.g. [col1, col2, ...]
df = df.drop(labels=input_list, axis="columns")

In [None]:
# stepid = drop_cols_high_perc_missing

def drop_columns_high_perc_missing(df, input_threshold=0.8):  # input_threshold -> float

    """
    Drop the columns of a DataFrame whose share of missing values is too high.

    The columns are removed from ``df`` in place and the same object is then
    returned, so both ``drop_columns_high_perc_missing(df)`` and
    ``df = drop_columns_high_perc_missing(df)`` work.

    Parameters
    ----------
    df : Pandas dataframe
    input_threshold : float, optional
        Fraction of missing values (between 0 and 1) at or above which a
        column is dropped. Defaults to 0.8.

    Returns
    -------
    df : Pandas dataframe
        The input dataframe, without the columns whose fraction of missing
        values is greater than or equal to the threshold.

    """

    # The mean of the boolean NaN mask is the per-column fraction of missing values.
    nan_ratio = df.isna().mean()
    columns_to_drop = nan_ratio[nan_ratio >= input_threshold].index

    # ``drop(..., inplace=True)`` returns None, so mutate first and return ``df``
    # explicitly rather than returning the result of the call.
    df.drop(columns=columns_to_drop, inplace=True)

    return df

In [None]:
# stepid = convert_cols_type

# Cast the mapped columns to the requested data types.
# input_mapper -> dict of {"col": "data_type", ...}
df = df.astype(dtype=input_mapper)


In [None]:
# stepid = filter_rows_by_cond
# Keep only the rows whose value in `column` equals `input_condition`.
# column -> string / input_condition -> int, string, datetime, etc.
df = df.loc[df[column] == input_condition]

In [None]:
# stepid = filter_rows_by_index
# Keep a positional slice of rows; all columns are retained.
# input_n0 -> int, left index of the slice / input_nf -> int, right index of the slice
df = df.iloc[input_n0:input_nf]

In [None]:
# stepid = set_cols_values_by_cond
# Overwrite, in place, the rows where `column` equals `input_condition` with `input_value`.
# column -> string / input_condition -> int, string, datetime, etc. / input_value -> int, string, datetime, etc.
# There is deliberately no `df = ...` on the left: a chained `df = df[...] = input_value`
# would rebind `df` to `input_value` before the indexed assignment runs and lose the DataFrame.
# NOTE(review): this writes `input_value` into every column of the matching rows, as the
# original indexing expression did; if only `column` should change, use
# `df.loc[df[column] == input_condition, column] = input_value` instead - confirm intent.
df.loc[df[column] == input_condition, :] = input_value

In [None]:
# stepid = fill_missing_vals

def fill_missing_vals(df, mapper_input):

    """
    Fill the missing values of the target columns using the given strategies.

    For every ``strategy -> columns`` entry, a scikit-learn ``SimpleImputer``
    configured with that strategy is fitted on the listed columns and the
    imputed values are written back into ``df`` (the passed dataframe is
    modified in place).

    Parameters
    ----------
    df : Pandas dataframe
    mapper_input : dictionary
        Maps each imputation strategy (passed as ``strategy`` to
        ``SimpleImputer``) to the list of columns it should be applied to.

    Returns
    -------
    df : Pandas dataframe
        Target dataframe with missing values filled

    """

    for strategy, column_list in mapper_input.items():

        # One imputer per strategy; np.nan is the marker treated as "missing".
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)

        # Fit on the target columns and replace them with their imputed version.
        df[column_list] = imputer.fit_transform(df[column_list])

    return df

In [None]:
# stepid = remove_outliers

def remove_outliers(df):

    """
    Remove outliers of the target data using the Box-Plot (1.5 * IQR) approach.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are treated as
    outliers, where Q1/Q3 are the 25th/75th percentiles.

    NOTE(review): ``pct`` and ``iqr`` are called without an axis argument, so
    the bounds are presumably computed over all values of ``df`` pooled
    together rather than per column - confirm against the SciPy docs.
    NOTE(review): the final boolean-mask selection drops entries for a Series;
    for a multi-column DataFrame pandas presumably keeps the shape and replaces
    out-of-range cells with NaN instead - confirm with the callers.

    Parameters
    ----------
    df : Pandas dataframe

    Returns
    -------
    df : Pandas dataframe
        Target data with outliers filtered out

    """

    # Whisker length; computed once and shared by both bounds.
    whisker = 1.5 * iqr(df)  # iqr -> SciPy function to calculate the Interquartile Range

    lower_bound = pct(df, 25) - whisker  # pct -> SciPy function scoreatpercentile
    upper_bound = pct(df, 75) + whisker

    # Keep only the values lying inside [lower_bound, upper_bound].
    return df[(df <= upper_bound) & (df >= lower_bound)]