In [None]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Default figure size (width, height in inches) and resolution for every plot in this notebook
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 120

# Install a global "ignore" filter: no warning is shown for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Let's start: Idea and Exercise

During a machine learning course, the teacher assigned an exercise to review Python classes. Since I initially resorted to Pandas and was then told to use NumPy instead, I turned it into a small competition between the two.

## Exercise

    1 - Code a class "Imputer" containing:      
    methods: fit and transform      
    attributes: strategy ("median": replace NaN with calculated median, "mult": multiply first column with a factor)

    2 - Make up a matrix consisting of 5x2 values.      
    Simulate the absence of values by substituting NaN in some places.
    
    3 - Write a "group imputer" class.    
    The class should have categorical target variables.    
    The class is to split the dataset into several subgroups, within each of which y has the same value.    
    Missing values are to be replaced by the mean value of the respective group. 

# Change the task to make it a little more interesting

Instead of an imaginary matrix I use the "housing" data set:

In [None]:
df = pd.read_csv("../input/california-housing-prices/housing.csv")

# Keep only the first four rows of every category (easier to check results).
# DataFrame.append was removed in pandas 2.0, so the per-category slices are
# collected first and stacked with a single pd.concat call.
df1 = pd.concat(
    [
        df.loc[df["ocean_proximity"] == location][:4]
        for location in df["ocean_proximity"].unique()
    ]
)

# Replace the original row labels with a fresh 0..n-1 index; reset_index keeps
# the old labels in an "index" column, which is dropped again right away.
df1 = df1.reset_index()
df = df1.drop(columns="index")
df

In [None]:
# Punch NaNs along two diagonals: for the k-th feature column, the k-th row and
# the row (number of columns - 1) positions further down are both blanked.
offset = len(df.columns) - 1
for row, column in zip(df.index, df.columns[:-1]):
    df.at[row, column] = np.nan
    df.at[row + offset, column] = np.nan

df

That worked out quite well. Now we can start with the imputer class.

## Pandas

In [None]:
class Imputer_p:
    """Simple pandas-based imputer with a "median" and a "mult" strategy."""

    def __init__(self, strategy_="median", f=1):
        """
        Imputer_p (Pandas):

        strategy_: {"median", "mult"}, default="median"
        "median" replaces NaNs with the column median learned in fit();
        "mult" multiplies the first column by the factor f.
        Any other value makes fit() a no-op and transform() return a plain copy.

        f: number, default=1
        Factor to multiply the first column with (only used by "mult")
        """
        self.strategy_ = strategy_
        self.factor = f

    def fit(self, df):
        """
        Learn the per-column medians of df (only for the "median" strategy).

        Non-numeric columns are skipped: numeric_only=True keeps fit() working
        on frames that still contain e.g. a categorical string column, which
        would otherwise raise on recent pandas versions.

        Returns the imputer itself so calls can be chained.
        """
        if self.strategy_ == "median":
            self.statistics_ = df.median(numeric_only=True)
        return self

    def transform(self, df_):
        """
        Return a transformed copy of df_; the input frame is never modified.

        "median": NaNs in every column for which fit() learned a median are
        replaced by that median; columns without a statistic are left as is.
        "mult": the first column is multiplied by the factor.

        Raises AttributeError if the "median" strategy is used before fit().
        """
        # we don't want to mutate the DataFrame itself but return a mutated copy
        df = df_.copy()

        if self.strategy_ == "median":
            known_columns = self.statistics_.index
            for column_ in df.columns:
                # no median available (e.g. a string column) -> nothing to fill in
                if column_ not in known_columns:
                    continue
                df.loc[df[column_].isna(), column_] = self.statistics_[column_]

        elif self.strategy_ == "mult":
            df[df.columns[0]] *= self.factor

        return df

In [None]:
# instantiate an pandas imputer
imput_p = Imputer_p()
imput_p.fit(df)

p1 = imput_p.transform(df[df.columns[:-1]])
p1

The class just written calculates the median based on the whole column, but we want the median to be calculated within a category (last column).    
For this we require the Group_Imputer:

In [None]:
class Group_Imputer_p(Imputer_p):
    """Pandas imputer that applies the parent strategy separately per category."""

    def __init__(self, strategy_="median", f=None):
        # Pass the caller's arguments through so a non-default strategy or
        # factor is actually honoured by the parent class.
        super().__init__(strategy_=strategy_, f=f)

    def fit(self, df_, y=-1):
        """
        Split df_ into one sub-frame per category.

        y: positional index of the column holding the dependent (categorical)
        variable, default -1 (last one). The columns before position y are
        used as features.

        The sub-frames are stored in self.stack in order of first appearance
        of their category. Returns the imputer itself.
        """
        df = df_[df_.columns[:y]].copy()
        self.categories = df_[df_.columns[y]].copy()
        self.stack = [
            df.loc[self.categories == category]
            for category in self.categories.unique()
        ]
        return self

    def transform(self):
        """
        Fit and apply the parent imputer to every stored group.

        Returns the transformed groups stacked in group order; the original
        row labels are kept. With no groups an empty DataFrame is returned.
        """
        parts = []
        for astack in self.stack:
            super().fit(astack)
            parts.append(super().transform(astack))

        # pd.concat rejects an empty list, so handle "no groups" explicitly
        if not parts:
            return pd.DataFrame()
        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        return pd.concat(parts)

In [None]:
# Group-wise pandas imputation: fit splits df by its last column, transform imputes each group
group_imput_p = Group_Imputer_p("median")
group_imput_p.fit(df)

p2 = group_imput_p.transform()

In [None]:
# Element-wise difference between the group-wise (p2) and the global (p1) imputation
p2 - p1

When comparing the two results, it is noticeable that median_house_value shows very large differences. This makes sense, because it depends very much on the location, i.e. the independent value, which we have taken into account in the second calculation.

# NumPy:

In [None]:
# Features: every column except the last; target: the last column
X = df.loc[:, df.columns[:-1]].to_numpy()
y = df.loc[:, df.columns[-1]].to_numpy()

In [None]:
class Imputer_n:
    """NumPy counterpart of the pandas imputer ("median" and "mult" strategies)."""

    def __init__(self, stategy_="median", f=1):
        """Store the strategy name and the multiplication factor."""
        self.factor = f
        self.stategy_ = stategy_

    def fit(self, X):
        """Learn the NaN-ignoring column medians of X ("median" strategy only)."""
        if self.stategy_ != "median":
            return
        self.statistics_ = np.nanmedian(X, axis=0)

    def transform(self, X):
        """Return a transformed copy of X; the input array is left untouched."""
        working = X.copy()

        if self.stategy_ == "median":
            # every NaN is replaced by the median of its column
            missing = np.isnan(working)
            return np.where(missing, self.statistics_, working)

        if self.stategy_ == "mult":
            # scale the first column in place on the copy
            working[:, 0] *= self.factor

        return working

In [None]:
# NumPy imputer with the "median" strategy (the factor is not used by that strategy)
imput_n = Imputer_n(stategy_="median", f=90)
imput_n.fit(X)

n1 = imput_n.transform(X)
n1 - p1.to_numpy()  # Both provide the same result

In [None]:
class Group_Imputer_n(Imputer_n):
    """NumPy imputer that applies the parent strategy separately per category."""

    def __init__(self, stategy_="median", f=None):
        # Pass the caller's arguments through so a non-default strategy or
        # factor is actually honoured by the parent class.
        super().__init__(stategy_=stategy_, f=f)

    def fit(self, X, y):
        """
        Split the rows of X into one array per distinct value of y.

        X: 2-D feature array; y: 1-D array of category labels, one per row.
        The groups are stored in self.stacks in order of first appearance of
        their category (the same order pandas' unique() yields).
        Returns the imputer itself.
        """
        # np.unique sorts the labels; sorting the first-occurrence positions
        # restores the order of appearance
        _, idx = np.unique(y, return_index=True)

        # boolean-mask indexing already returns a copy of the selected rows,
        # so the inputs never need to be copied beforehand
        self.stacks = [X[y == category] for category in y[np.sort(idx)]]
        return self

    def transform(self):
        """
        Fit and apply the parent imputer to every stored group.

        Returns the transformed groups concatenated in group order.
        Raises ValueError (from np.concatenate) when no group was stored.
        """
        X = []
        for astack in self.stacks:
            super().fit(astack)
            X.append(super().transform(astack))

        return np.concatenate(X)

In [None]:
group_imput_n = Group_Imputer_n(stategy_="median")

group_imput_n.fit(X, y)
n2 = group_imput_n.transform()

In [None]:
n2 - p2.to_numpy()

# Which one is faster? Pandas vs. NumPy
Now we would like to compare both classes. For this we use a decorator.

In [None]:
from functools import wraps
from time import perf_counter, time

# Runtimes in seconds, one entry per call of a function decorated with timer_func
comp = []


# decorator for runtime measurement
def timer_func(func):
    """
    Wrap func so that the duration of every call is appended to comp.

    The wrapper forwards all arguments and returns func's result unchanged.
    perf_counter is used instead of time because it is a monotonic,
    high-resolution clock meant for measuring short intervals; wraps keeps
    the name and docstring of the decorated function.
    A call that raises is not recorded.
    """
    @wraps(func)
    def wrap_func(*args, **kwargs):
        t1 = perf_counter()
        result = func(*args, **kwargs)
        t2 = perf_counter()
        comp.append(t2 - t1)
        return result
    return wrap_func

In [None]:
class Group_Imputer_p(Imputer_p):
    """Pandas group imputer whose transform() runtime is recorded by timer_func."""

    def __init__(self, strategy_="median", f=None):
        # Pass the caller's arguments through so a non-default strategy or
        # factor is actually honoured by the parent class.
        super().__init__(strategy_=strategy_, f=f)

    def fit(self, df_, y=-1):
        """
        Split df_ into one sub-frame per category.

        y: positional index of the column holding the dependent (categorical)
        variable, default -1 (last one). The columns before position y are
        used as features.

        The sub-frames are stored in self.stack in order of first appearance
        of their category. Returns the imputer itself.
        """
        df = df_[df_.columns[:y]].copy()
        self.categories = df_[df_.columns[y]].copy()
        self.stack = [
            df.loc[self.categories == category]
            for category in self.categories.unique()
        ]
        return self

    @timer_func
    def transform(self):
        """
        Fit and apply the parent imputer to every stored group (timed).

        Returns the transformed groups stacked in group order; the original
        row labels are kept. With no groups an empty DataFrame is returned.
        """
        parts = []
        for astack in self.stack:
            super().fit(astack)
            parts.append(super().transform(astack))

        # pd.concat rejects an empty list, so handle "no groups" explicitly
        if not parts:
            return pd.DataFrame()
        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        return pd.concat(parts)

    
class Group_Imputer_n(Imputer_n):
    """NumPy group imputer whose transform() runtime is recorded by timer_func."""

    def __init__(self, stategy_="median", f=None):
        # Pass the caller's arguments through so a non-default strategy or
        # factor is actually honoured by the parent class.
        super().__init__(stategy_=stategy_, f=f)

    def fit(self, X, y):
        """
        Split the rows of X into one array per distinct value of y.

        X: 2-D feature array; y: 1-D array of category labels, one per row.
        The groups are stored in self.stacks in order of first appearance of
        their category (the same order pandas' unique() yields).
        Returns the imputer itself.
        """
        # np.unique sorts the labels; sorting the first-occurrence positions
        # restores the order of appearance
        _, idx = np.unique(y, return_index=True)

        # boolean-mask indexing already returns a copy of the selected rows,
        # so the inputs never need to be copied beforehand
        self.stacks = [X[y == category] for category in y[np.sort(idx)]]
        return self

    @timer_func
    def transform(self):
        """
        Fit and apply the parent imputer to every stored group (timed).

        Returns the transformed groups concatenated in group order.
        Raises ValueError (from np.concatenate) when no group was stored.
        """
        X = []
        for astack in self.stacks:
            super().fit(astack)
            X.append(super().transform(astack))
        return np.concatenate(X)

In [None]:
# Inflate data set to 1.3 million rows
df = pd.read_csv("../input/california-housing-prices/housing.csv")

# Doubling the frame six times equals stacking 2**6 = 64 copies of it.
# DataFrame.append was removed in pandas 2.0, so the copies are stacked with
# one pd.concat call; ignore_index=True gives the result a fresh 0..n-1 index.
df = pd.concat([df] * 2**6, ignore_index=True)
len(df)

In [None]:
# fill the diagonal with NaNs
# NOTE: df1 is an alias of df, not a copy, so df itself is modified
df1 = df
feature_columns = df1.columns[:-1]
n_features = len(feature_columns)
# number of complete blocks of n_features rows that fit into the frame
n_blocks = len(df1) // n_features if n_features else 0

for first_row, column in zip(df1.index, feature_columns):
    # In every block, the row at the same offset as the column gets a NaN.
    # One vectorised assignment per column replaces more than a million
    # scalar .at writes.
    rows = first_row + n_features * np.arange(n_blocks)
    df1.loc[rows, column] = np.nan

In [None]:
# Result table: one row per sample size, one column per implementation
compare = pd.DataFrame(data={'Pandas': [], 'NumPy': []})

group_imput_p = Group_Imputer_p()
group_imput_n = Group_Imputer_n()
imputers = [group_imput_p, group_imput_n]

# 30 runs
step = int(len(df1) / 30)
for i in range(1000, len(df1), step):
    # benchmark on the first i rows only
    df = df1[:i]

    X = df[df.columns[:-1]].to_numpy()
    y = df[df.columns[-1]].to_numpy()

    comp.clear()

    # fitting (splitting into groups) is not part of the measurement
    group_imput_p.fit(df)
    group_imput_n.fit(X, y)

    for name, group_imputer in zip(compare.columns, imputers):
        comp.clear()
        # the transform method is executed 5 times and then the average value is saved
        for _ in range(5):
            group_imputer.transform()
        compare.loc[i, name] = np.mean(np.array(comp))

In [None]:
# One curve per implementation: mean transform() runtime against the number of rows
for label in ("Pandas", "NumPy"):
    plt.plot(compare.index, compare[label], label=label, marker='o')
plt.xlabel("Number of lines")
# axis label spelling corrected ("Secounds" -> "Seconds")
plt.ylabel("Time in Seconds")
plt.legend(loc='upper left')

We get what many sources on the Internet also report: Starting at about 500,000 lines, Pandas becomes faster.