In [127]:
import pandas as pd 
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

In [158]:
# Fix NumPy's global seed so the synthetic data below is reproducible.
np.random.seed(42)

# Sample A: variant drawn from {'a', 'b'}, heights drawn from N(170, 5).
# The variant draw is listed before the height draw on purpose: both consume
# the same global random stream, so their order determines the values.
sample_a = pd.DataFrame(dict(
    sample_name='A',
    variant=np.random.choice(['a', 'b'], size=100),
    height=np.random.normal(loc=170, scale=5, size=100),
))

# Sample B: a single variant 'a', heights drawn from N(165, 7).
sample_b = pd.DataFrame(dict(
    sample_name='B',
    variant='a',
    height=np.random.normal(loc=165, scale=7, size=100),
))

# Stack both samples, shuffle all rows, then renumber the index from 0.
df = (
    pd.concat([sample_a, sample_b], axis=0, ignore_index=True)
      .sample(frac=1)
      .reset_index(drop=True)
)

In [159]:
# Draw 20 distinct row positions (replace=False means no position is drawn
# twice) from NumPy's global random state and blank out their height.
# NOTE(review): .loc is label-based, so the drawn positions are only valid
# labels if df carries a default 0..n-1 index — confirm before reusing.
ind_to_replace = np.random.choice(range(len(df)), 20, replace=False)
df.loc[ind_to_replace, 'height'] = np.nan

In [160]:
# Draw 5 distinct row positions (replace=False) from NumPy's global random
# state and blank out their sample_name, i.e. one of the grouping keys.
# NOTE(review): .loc is label-based, so the drawn positions are only valid
# labels if df carries a default 0..n-1 index — confirm before reusing.
ind_to_replace_sample_name = np.random.choice(range(len(df)), 5, replace=False)
df.loc[ind_to_replace_sample_name, "sample_name"] = np.nan

In [161]:
# Keep a separate copy of df to use as the input of the GroupImputer experiments.
df_group_imputer = df.copy()

In [163]:
# Show the rows whose sample_name (a grouping key) is missing.
df_group_imputer[df_group_imputer["sample_name"].isnull() ]

Unnamed: 0,sample_name,variant,height
57,,a,156.660876
59,,b,172.020254
96,,b,165.453063
119,,a,167.697222
152,,a,161.696477


In [132]:
# Peek at the first two rows of the copy.
df_group_imputer.head(2)

Unnamed: 0,sample_name,variant,height
0,B,a,171.678012
1,B,a,166.292437


In [133]:
# Peek at the first two rows of the original frame for comparison with the copy.
df.head(2)

Unnamed: 0,sample_name,variant,height
0,B,a,171.678012
1,B,a,166.292437


In [164]:
# Mean height per (sample_name, variant). groupby drops missing keys by default,
# so the rows with a missing sample_name do not show up as a group here.
df_group_imputer.groupby(["sample_name", "variant"])["height"].mean()

sample_name  variant
A            a          170.392413
             b          169.963861
B            a          165.935600
Name: height, dtype: float64

In [135]:
# Replace missing sample_name values with the sentinel -1 so that those rows
# form a group of their own in a groupby. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selection: the chained
# in-place form acts on a temporary object and does not update df when pandas
# Copy-on-Write is active.
df["sample_name"] = df["sample_name"].fillna(-1)

In [136]:
# Mean height per (sample_name, variant) on df; the rows whose sample_name was
# replaced by the sentinel now appear as their own group labelled -1.
df.groupby(["sample_name", "variant"])["height"].mean()

sample_name  variant
-1           a          162.018191
             b          168.736659
A            a          170.392413
             b          169.963861
B            a          165.935600
Name: height, dtype: float64

In [165]:

class GroupImputer(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.

    Rows whose group columns contain missing values are not discarded: a missing
    key is treated as a group value of its own, both when the aggregates are
    learned (fit) and when they are applied (transform).

    Parameters
    ----------
    group_cols : list
        List of columns used for calculating the aggregated value
    target : str
        The name of the column to impute
    metric : str
        The metric to be used for replacement, can be one of ['mean', 'median']

    Attributes
    ----------
    impute_map_ : pd.DataFrame
        Set by fit. One row per group seen during fit: the group_cols values
        (NaN where the key was missing) plus the aggregated target value.

    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target, metric='mean'):

        # Kept as assertions so callers that expect AssertionError keep working.
        assert metric in ['mean', 'median'], 'Unrecognized value for metric, should be mean/median'
        assert isinstance(group_cols, list), 'group_cols should be a list of columns'
        assert isinstance(target, str), 'target should be a string'

        self.group_cols = group_cols
        self.target = target
        self.metric = metric

    def fit(self, X, y=None):
        '''
        Learn the per-group aggregate (mean or median) of the target column.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing group_cols and target. It is only read, never modified.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : GroupImputer
            The fitted imputer, with impute_map_ set.
        '''
        # dropna=False keeps rows with a missing group key as a group of their
        # own, so X never has to be patched with a sentinel value and restored
        # afterwards (which mutated the caller's frame and turned genuine -1
        # keys into NaN).
        impute_map = (
            X.groupby(self.group_cols, dropna=False)[self.target]
             .agg(self.metric)
             .reset_index(drop=False)
        )

        self.impute_map_ = impute_map

        return self

    def transform(self, X, y=None):
        '''
        Fill missing target values with the aggregate of the row's group.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing group_cols and target. A copy is modified; X itself
            is left untouched.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            The values of the imputed copy of X (X.values). Rows belonging to a
            group that was not seen during fit keep their missing target.
        '''
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_map_')

        X = X.copy()

        # Groups are disjoint, so the set of rows needing a value can be
        # computed once up front.
        target_missing = X[self.target].isnull()

        for _, row in self.impute_map_.iterrows():
            in_group = pd.Series(True, index=X.index)
            for col in self.group_cols:
                key = row[col]
                if pd.isnull(key):
                    # NaN never compares equal to NaN, so a missing key has to
                    # be matched explicitly.
                    in_group &= X[col].isnull()
                else:
                    in_group &= (X[col] == key)

            to_fill = in_group & target_missing
            # Skip empty selections so an untouched target column keeps its dtype.
            if to_fill.any():
                X.loc[to_fill, self.target] = row[self.target]

        return X.values

In [166]:

# Impute missing heights with the mean height of each (sample_name, variant) group.
imp = GroupImputer(group_cols=['sample_name', 'variant'],
                   target='height',
                   metric='mean')

# fit_transform returns a plain array, so wrapping it in a DataFrame would leave
# every column with object dtype; restore the dtypes of the input frame so that
# height is numeric again.
df_imp = pd.DataFrame(imp.fit_transform(df_group_imputer),
                      columns=df_group_imputer.columns) \
           .astype(df_group_imputer.dtypes.to_dict())

# Count missing heights before and after imputation.
print(f'df contains {df_group_imputer.height.isnull().sum()} missing values.')
print(f'df_imp contains {df_imp.height.isnull().sum()} missing values.')

df contains 20 missing values.
df_imp contains 0 missing values.


In [167]:
# Mean height per (sample_name, variant) of the imputer's input frame.
# groupby drops missing keys by default, so only the A and B groups are listed.
df_group_imputer.groupby(["sample_name", "variant"])["height"].mean()

sample_name  variant
A            a          170.392413
             b          169.963861
B            a          165.935600
Name: height, dtype: float64

In [156]:
# Inspect the first rows (up to 30) of the imputed frame.
df_imp.head(30)

Unnamed: 0,sample_name,variant,height
0,B,a,171.678012
1,B,a,166.292437
2,A,a,168.286427
3,B,a,158.773399
4,B,a,155.756804
5,B,a,163.435761
6,A,b,169.963861
7,A,b,168.389692
8,B,a,165.9356
9,A,a,168.827064


In [168]:
# Rows of the imputer's input frame whose sample_name is missing.
df_group_imputer[df_group_imputer["sample_name"].isnull() ]

Unnamed: 0,sample_name,variant,height
57,,a,156.660876
59,,b,172.020254
96,,b,165.453063
119,,a,167.697222
152,,a,161.696477


In [141]:
# Replace missing sample_name values with the sentinel -1 so that those rows
# form a group of their own in a groupby. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selection: the chained
# in-place form acts on a temporary object and does not update df when pandas
# Copy-on-Write is active.
df["sample_name"] = df["sample_name"].fillna(-1)

In [142]:
# Mean height per (sample_name, variant) on df; the rows whose sample_name was
# replaced by the sentinel appear as their own group labelled -1.
df.groupby(["sample_name", "variant"])["height"].mean()

sample_name  variant
-1           a          162.018191
             b          168.736659
A            a          170.392413
             b          169.963861
B            a          165.935600
Name: height, dtype: float64

In [144]:
# Rows of df_group_imputer whose sample_name equals the sentinel -1.
# NOTE(review): this cell's execution count (144) is lower than that of the
# cell that creates df_group_imputer (161), so the recorded output presumably
# comes from an earlier kernel state — re-run top to bottom to confirm.
df_group_imputer[df_group_imputer["sample_name"] == -1 ]

Unnamed: 0,sample_name,variant,height
57,-1,a,156.660876
59,-1,b,172.020254
96,-1,b,165.453063
119,-1,a,167.697222
152,-1,a,161.696477


In [184]:


# Small toy frame: each (id, Total) pair is one row; ids repeat across rows.
# Note that this rebinds the name df to a new, unrelated dataset.
df = pd.DataFrame(
    data=[
        (1, 50), (1, 0), (1, 0),
        (2, 100), (2, 0), (2, 0), (2, 75),
        (3, 0), (3, 0),
    ],
    columns=['id', 'Total'],
)

In [185]:
# Display the toy frame.
df

Unnamed: 0,id,Total
0,1,50
1,1,0
2,1,0
3,2,100
4,2,0
5,2,0
6,2,75
7,3,0
8,3,0


In [186]:
# View the rows ordered by Total, largest first. The sorted result is only
# displayed; it is not assigned, so df keeps its original row order.
df.sort_values("Total", ascending=False)

Unnamed: 0,id,Total
3,2,100
6,2,75
0,1,50
1,1,0
2,1,0
4,2,0
5,2,0
7,3,0
8,3,0


In [192]:
# True division: / on two ints yields a float (3.5).
print(7/2)

3.5


In [193]:
# Floor division: // on two ints yields the quotient rounded down (3).
print(7//2)

3
