In [1]:
"""
Module used to drop rows that contain null values or to fill those values with a per-column mean, median or mode.
"""
# pylint: disable=R0913
# pylint: disable=W0612
import pandas as pd
import numpy as np


def __find_average(data_frame: pd.DataFrame) -> "tuple[dict, dict, dict]":
    """
    A helper method to find per-column averages: mean, median and mode.
    This method is a helper for the main No-Null-dataset (NND) method. It receives a
    DataFrame and computes the mean and median of every numeric column and the mode of
    every column.
    :param data_frame: DataFrame whose columns are summarised
    :return: a tuple ``(mean, median, mode)`` of dicts keyed by column name. ``mean`` and
    ``median`` only contain the numeric columns, with values rounded to 2 decimals.
    ``mode`` contains every column and maps it to the Series returned by
    ``Series.mode()``, which is empty when the column has no non-null value.
    """

    mean = {}
    median = {}
    mode = {}

    for column in data_frame.columns:
        values = data_frame[column]
        dtype = values.dtype

        # pandas' dtype helpers also understand extension dtypes (category, string,
        # nullable Int64, ...), on which np.issubdtype raises TypeError. Booleans are
        # excluded explicitly because np.number never covered them.
        is_number = (pd.api.types.is_numeric_dtype(dtype)
                     and not pd.api.types.is_bool_dtype(dtype))

        if is_number:
            mean[column] = round(values.mean(), 2)
            median[column] = round(values.median(), 2)

        mode[column] = values.mode()

    return mean, median, mode


def nnd(data_frame, strategy='median', keep_rows=None, remove_rows=None,
        reindex=True, drop=False) -> pd.DataFrame:
    """A method to deal with missing values in the dataset.
    No-null-dataset (NND) first removes the requested rows and then fills every
    remaining null with a per-column statistic computed on the rows that are left.
    The input frame is not modified; a new DataFrame is returned.
        :param data_frame: The data to be preprocessed
        :param strategy: Statistic used to fill nulls in numeric columns: 'mean', 'mode'
        or 'median'. Any other value falls back to the median. Non-numeric columns are
        always filled with their first mode.
        :param keep_rows: Index labels that must never be removed. keep_rows has
        priority over both remove_rows and drop.
        :param remove_rows: Index labels of the rows to remove before filling. Can be
        combined with drop.
        :param reindex: A new dataset will create new indexes (0..n-1) if True.
        :param drop: Also removes all the rows that contain null values, except for
        those in keep_rows
        :return: The processed dataset. A column that has no non-null value left has
        nothing to be filled with and keeps its nulls.
    """

    protected = set(keep_rows) if keep_rows is not None else set()
    requested = list(remove_rows) if remove_rows is not None else []

    if drop:
        # Add the null rows to the caller's selection instead of overwriting it, so
        # remove_rows keeps working when drop is enabled.
        has_null_row = data_frame.isnull().any(axis=1).to_numpy()
        requested.extend(data_frame.index[has_null_row])

    # dict.fromkeys removes duplicate labels while preserving their order.
    labels_to_drop = [label for label in dict.fromkeys(requested)
                      if label not in protected]
    data_after_drop = data_frame.drop(labels_to_drop, axis=0)

    mean, median, mode = __find_average(data_after_drop)

    # Pick one fill value per column that still contains nulls.
    fill_values = {}
    for column, has_null in data_after_drop.isnull().any().items():
        if not has_null:
            continue

        # The helper only records a median for numeric columns.
        numeric = column in median

        if numeric and strategy == 'mean':
            candidate = mean[column]
        elif numeric and strategy != 'mode':
            candidate = median[column]
        else:
            modes = mode[column]
            if len(modes) == 0:
                # No non-null value in this column: there is nothing to fill with.
                continue
            candidate = modes.iloc[0]

        if not pd.isnull(candidate):
            fill_values[column] = candidate

    if fill_values:
        # fillna with a dict fills each listed column with its own value.
        data_after_drop = data_after_drop.fillna(value=fill_values)

    no_null_dataset = data_after_drop.reset_index(drop=True) if reindex else data_after_drop

    return no_null_dataset

In [4]:
import pandas as pd 

def load_data(path):
    """Load a CSV file into a DataFrame.

    :param path: Location of the CSV file; passed straight to ``pandas.read_csv``.
    :return: DataFrame that uses the file's first column as its index.
    """
    return pd.read_csv(path, index_col=0)

# Load the house-prices dataset; load_data uses the CSV's first column as the index.
data = load_data("house-prices.csv")
# Print the frame as loaded.
print (data)

# Clean with nnd's defaults (strategy='median', reindex=True, drop=False). The result is
# not assigned to a name, so it is only shown as the cell's output.
nnd(data)

      MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                      
1242          20       RL         83.0     9849   Pave   NaN      Reg   
1233          90       RL         70.0     9842   Pave   NaN      Reg   
1401          50       RM         50.0     6000   Pave   NaN      Reg   
1377          30       RL         52.0     6292   Pave   NaN      Reg   
208           20       RL          NaN    12493   Pave   NaN      IR1   
...          ...      ...          ...      ...    ...   ...      ...   
1190          60       RL         60.0     7500   Pave   NaN      Reg   
192           60       RL          NaN     7472   Pave   NaN      IR1   
990           60       FV         65.0     8125   Pave   NaN      Reg   
982           60       RL         98.0    12203   Pave   NaN      IR1   
862          190       RL         75.0    11625   Pave   NaN      Reg   

     LandContour Utilities LotConfig  ... PoolArea

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,20,RL,83.0,9849,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,0,6,2007,New,Partial,248328
1,90,RL,70.0,9842,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,,MnPrv,Shed,0,3,2007,WD,Normal,101800
2,50,RM,50.0,6000,Pave,Grvl,Reg,Lvl,AllPub,Corner,...,0,,MnPrv,Shed,0,7,2008,WD,Normal,120000
3,30,RL,52.0,6292,Pave,Grvl,Reg,Bnk,AllPub,Inside,...,0,,MnPrv,Shed,0,4,2008,WD,Normal,91000
4,20,RL,68.0,12493,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,,GdWo,Shed,0,4,2008,WD,Normal,141000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,60,RL,60.0,7500,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,0,6,2010,WD,Normal,189000
996,60,RL,68.0,7472,Pave,Grvl,IR1,Lvl,AllPub,CulDSac,...,0,,MnPrv,Shed,0,6,2007,WD,Normal,184000
997,60,FV,65.0,8125,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,0,8,2006,New,Partial,197000
998,60,RL,98.0,12203,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,,MnPrv,Shed,0,7,2009,WD,Normal,336000
