# Pandas data processing

## Versions being tested
- pandas: 2.0.0
- numpy: 1.24.2

## Data processing major steps
- Data summary
- Filtering by rows
- Filtering by columns
- Create new columns
- Aggregation
- Join with other dataframe

## Notes on comparison with R data processing
Summarise data (dplyr::group_by then an aggregate function)
- Handling of missing values in group-by key
    - R dplyr: retain and create a new group
    - Python pandas: drop the group
    
Basic functions
- Pandas Series string handling does not handle row-wise patterns or missing values.
- Numpy min and numpy max when working with missing data for numeric and date types
    - R: return missing if one or more elements of the input list are missing
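    - Python: pandas aggregations ignore the missing elements and process the rest of the list as normal. If the rest of the list is empty, min and max return NaN while sum returns 0 (plain `np.min`/`np.max` on an array propagate NaN; `np.nanmin`/`np.nanmax` skip it)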

In [9]:
import pandas as pd
import numpy as np

In [41]:
# sample data
# - `housing` (California housing features, 20640 rows x 8 columns) is the
#   frame used by every cell below; it is downloaded on first use.
# - `diabetes` is only displayed here as a quick look at a sample frame.
from sklearn.datasets import fetch_california_housing, load_diabetes
housing = fetch_california_housing(as_frame=True, return_X_y=True)[0]
diabetes = load_diabetes(as_frame=True, return_X_y=True)[0]
diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


#### Data summary

In [42]:
# (number of rows, number of columns) of the frame
housing.shape

(20640, 8)

In [43]:
# dtype of every column
housing.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
Latitude      float64
Longitude     float64
dtype: object

In [44]:
# index range, per-column non-null counts and dtypes, and memory usage
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [45]:
# count, mean, std, min, quartiles and max of every column
housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


#### Filter rows

In [46]:
# conditions with query: the boolean expression is given as a string and
# refers to the columns by name
housing.query("(AveBedrms < 2) & (HouseAge > 50)")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
...,...,...,...,...,...,...,...,...
20142,1.8618,52.0,4.157718,1.073826,934.0,3.134228,34.36,-119.06
20220,4.1250,52.0,5.639798,1.057935,941.0,2.370277,34.28,-119.27
20236,2.3750,52.0,4.289720,1.046729,276.0,2.579439,34.27,-119.27
20237,3.5893,52.0,4.707463,1.023881,836.0,2.495522,34.27,-119.27


In [20]:
# conditions without query: build element-wise boolean masks, combine them
# with `&`, and index the frame with the combined mask
housing[housing['AveBedrms'].lt(2) & housing['HouseAge'].gt(50)]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
...,...,...,...,...,...,...,...,...
20142,1.8618,52.0,4.157718,1.073826,934.0,3.134228,34.36,-119.06
20220,4.1250,52.0,5.639798,1.057935,941.0,2.370277,34.28,-119.27
20236,2.3750,52.0,4.289720,1.046729,276.0,2.579439,34.27,-119.27
20237,3.5893,52.0,4.707463,1.023881,836.0,2.495522,34.27,-119.27


In [17]:
# index: select rows by their index labels
housing.loc[[2, 3, 4]]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
# index location: select rows by integer position (0-based)
housing.iloc[[0, 100]]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
100,2.4912,29.0,3.7248,1.1312,2304.0,1.8432,37.81,-122.25


#### Filter columns

In [22]:
# first five rows, to recall the available columns
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [23]:
# specific column names: keep only the listed columns, in the listed order
housing.filter(items=['MedInc','HouseAge'])

Unnamed: 0,MedInc,HouseAge
0,8.3252,41.0
1,8.3014,21.0
2,7.2574,52.0
3,5.6431,52.0
4,3.8462,52.0
...,...,...
20635,1.5603,25.0
20636,2.5568,18.0
20637,1.7000,17.0
20638,1.8672,18.0


In [24]:
# column locations: all rows, columns at integer positions 1 and 2 (0-based)
housing.iloc[:, [1, 2]]

Unnamed: 0,HouseAge,AveRooms
0,41.0,6.984127
1,21.0,6.238137
2,52.0,8.288136
3,52.0,5.817352
4,52.0,6.281853
...,...,...
20635,25.0,5.045455
20636,18.0,6.114035
20637,17.0,5.205543
20638,18.0,5.329513


In [27]:
# column name with regex: keep columns whose name ends in "ude" or contains "Ave"
housing.filter(regex = "(ude)$|(Ave)")

Unnamed: 0,AveRooms,AveBedrms,AveOccup,Latitude,Longitude
0,6.984127,1.023810,2.555556,37.88,-122.23
1,6.238137,0.971880,2.109842,37.86,-122.22
2,8.288136,1.073446,2.802260,37.85,-122.24
3,5.817352,1.073059,2.547945,37.85,-122.25
4,6.281853,1.081081,2.181467,37.85,-122.25
...,...,...,...,...,...
20635,5.045455,1.133333,2.560606,39.48,-121.09
20636,6.114035,1.315789,3.122807,39.49,-121.21
20637,5.205543,1.120092,2.325635,39.43,-121.22
20638,5.329513,1.171920,2.123209,39.43,-121.32


#### Create new columns

In [54]:
# with assign: returns a new frame carrying the extra columns
# - pd.cut  -> 4 equal-width bins over the Longitude range
# - pd.qcut -> 4 quantile-based bins (roughly equal row counts)
housing_new = housing.assign(
    Longitude_cut=lambda frame: pd.cut(frame['Longitude'], bins=4, labels=['A', 'B', 'C', 'D']),
    Longitude_qcut=lambda frame: pd.qcut(frame['Longitude'], q=4, labels=['A', 'B', 'C', 'D']),
)

In [55]:
# compare the two binnings: rows are the pd.cut bins, columns the pd.qcut bins
pd.crosstab(housing_new['Longitude_cut'], housing_new['Longitude_qcut'])

Longitude_qcut,A,B,C,D
Longitude_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,4971,0,0,0
B,194,3929,0,0
C,0,1269,5132,4787
D,0,0,0,358


In [56]:
# without assign: item assignment adds the column to `housing` itself (in place)
housing['new_col'] = 1
housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,1
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,1
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,1
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,1
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,1
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,1
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,1
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,1
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,1


#### Aggregation

In [74]:
# Aggregation with pivot_table basics
# - All combinations of the categorical index keys are generated, including
#   combinations with no rows; `observed = False` states this explicitly
#   instead of relying on the pandas default
# - Multi-level naming: result columns are (value column, aggregate) pairs
# - NOTE: despite the `_quantile` suffix, pd.cut builds equal-width bins
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .pivot_table(
        ['AveRooms', 'AveBedrms'],
        index = ['Longitude_quantile', 'Latitude_quantile'],
        aggfunc = {
            'AveRooms': ['count', np.sum, np.min, np.max, np.mean, np.median, np.std],
            'AveBedrms': [np.min, np.max],
        },
        observed = False,
    )
    .reset_index()
)

Unnamed: 0_level_0,Longitude_quantile,Latitude_quantile,AveBedrms,AveBedrms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amax,amin,amax,amin,count,mean,median,std,sum
0,A,A,,,,,0,,,,0.0
1,A,B,2.151724,0.765152,12.5,2.540984,283,5.775562,5.572414,1.412589,1634.484022
2,A,C,6.0,0.526316,22.222222,1.130435,4276,5.416697,5.304958,1.320655,23161.795068
3,A,D,3.5,0.816327,12.32,2.563745,412,5.499543,5.427429,1.042251,2265.811828
4,B,A,1.809524,0.851064,14.666667,2.388797,254,5.459935,5.614243,1.487865,1386.823522
5,B,B,10.27027,0.7,50.837838,2.059524,1239,5.434922,5.313725,1.852973,6733.867915
6,B,C,34.066667,0.444444,141.909091,1.550409,2478,6.089113,5.485254,5.152069,15088.821185
7,B,D,15.3125,0.88968,59.875,3.899811,152,7.215518,5.672994,5.898185,1096.758699
8,C,A,8.053846,0.333333,36.715385,0.846154,10613,5.177312,5.036932,1.564783,54946.812021
9,C,B,8.5,0.5,31.0625,1.0,549,5.524299,5.264637,2.379355,3032.84019


In [77]:
# Aggregation with pivot_table with missing values in the aggregated column
# - Missing values are skipped: for groups whose AveRooms is entirely missing,
#   'count' is 0 and sum / np.sum / np.nansum give 0.0
# - `lambda x: sum(x)` (Python built-in sum) gives NaN for those groups
# - Whether the output is sorted depends on the output of the first aggfunc declared
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # blank out AveRooms for every row in Longitude bin 'A'
        AveRooms = lambda df: np.where(
            df.Longitude_quantile == 'A',
            np.nan,
            df.AveRooms
        )
    )
    .pivot_table(
        ['AveRooms'],
        index = ['Longitude_quantile', 'Latitude_quantile'],
        aggfunc = ['count', lambda x: sum(x), lambda x: np.sum(x), sum, np.sum, np.nansum],
        # keep every key combination, including empty ones
        observed = False,
    )
    .reset_index()
)

  .reset_index()


Unnamed: 0_level_0,Longitude_quantile,Latitude_quantile,count,<lambda>,<lambda>,sum,sum,nansum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,AveRooms,AveRooms,AveRooms.1,AveRooms,AveRooms.1,AveRooms
0,A,A,0,,,0.0,0.0,0.0
1,A,B,0,,0.0,0.0,0.0,0.0
2,A,C,0,,0.0,0.0,0.0,0.0
3,A,D,0,,0.0,0.0,0.0,0.0
4,B,A,254,1386.823522,1386.823522,1386.823522,1386.823522,1386.823522
5,B,B,1239,6733.867915,6733.867915,6733.867915,6733.867915,6733.867915
6,B,C,2478,15088.821185,15088.821185,15088.821185,15088.821185,15088.821185
7,B,D,152,1096.758699,1096.758699,1096.758699,1096.758699,1096.758699
8,C,A,10613,54946.812021,54946.812021,54946.812021,54946.812021,54946.812021
9,C,B,549,3032.84019,3032.84019,3032.84019,3032.84019,3032.84019


In [61]:
# Aggregation with pivot_table with missing index values
# - All rows with a missing index value are excluded
# - Latitude_quantile is still categorical, so unobserved combinations are
#   still generated; `observed = False` states this explicitly
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # turn Longitude bin 'A' into a missing key
        Longitude_quantile = lambda df: np.where(
            df.Longitude_quantile == 'A',
            None,
            df.Longitude_quantile
        )
    )
    .pivot_table(
        ['AveRooms', 'AveBedrms'],
        index = ['Longitude_quantile', 'Latitude_quantile'],
        aggfunc = {
            'AveRooms': ['count', np.sum, np.min, np.max, np.mean, np.median, np.std],
            'AveBedrms': [np.min, np.max],
        },
        observed = False,
    )
    .reset_index()
)

Unnamed: 0_level_0,Longitude_quantile,Latitude_quantile,AveBedrms,AveBedrms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms,AveRooms
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amax,amin,amax,amin,count,mean,median,std,sum
0,B,A,1.809524,0.851064,14.666667,2.388797,254,5.459935,5.614243,1.487865,1386.823522
1,B,B,10.27027,0.7,50.837838,2.059524,1239,5.434922,5.313725,1.852973,6733.867915
2,B,C,34.066667,0.444444,141.909091,1.550409,2478,6.089113,5.485254,5.152069,15088.821185
3,B,D,15.3125,0.88968,59.875,3.899811,152,7.215518,5.672994,5.898185,1096.758699
4,C,A,8.053846,0.333333,36.715385,0.846154,10613,5.177312,5.036932,1.564783,54946.812021
5,C,B,8.5,0.5,31.0625,1.0,549,5.524299,5.264637,2.379355,3032.84019
6,C,C,11.410714,0.952941,52.848214,4.279221,26,12.120278,7.758938,11.080332,315.127238
7,C,D,,,,,0,,,,0.0
8,D,A,14.111111,0.771429,62.422222,1.62963,353,6.687163,5.791667,4.091899,2360.568597
9,D,B,1.95098,1.20354,7.911765,3.714286,5,6.168879,6.441315,1.567156,30.844395


In [78]:
# Aggregation without pivot table
# - Apply to all columns other than grouping keys
# - `observed = False` keeps every combination of the categorical keys,
#   including combinations with no rows (their sums are 0)
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .groupby(['Longitude_quantile', 'Latitude_quantile'], observed = False)
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
Longitude_quantile,Latitude_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A,B,1305.3961,8378.0,1634.484022,313.156577,339708.0,744.240966,10451.55,-34516.33,0
A,C,18729.7009,141177.0,23161.795068,4538.923774,5500625.0,12753.74477,161781.76,-522816.55,0
A,D,996.9044,10504.0,2265.811828,473.479018,452790.0,1091.299789,16729.71,-50672.31,0
B,A,1056.2787,6739.0,1386.823522,272.505981,288338.0,701.087382,8768.91,-30466.53,0
B,B,3877.5202,29937.0,6733.867915,1332.617743,1751553.0,4417.347216,45153.23,-149035.01,0
B,C,8172.5048,61228.0,15088.821185,2953.917482,3247146.0,7378.774382,94878.55,-300298.92,0
B,D,388.5333,3464.0,1096.758699,229.455783,147713.0,974.247962,6100.02,-18424.78,0
C,A,42765.0691,308440.0,54946.812021,11295.072712,16453885.0,32509.821824,359003.46,-1251802.73,0
C,B,1510.3183,13883.0,3032.84019,630.990025,732587.0,1688.679112,19605.5,-65271.38,0


In [80]:
# Aggregation without pivot table with missing values
# - .sum() skips missing values: groups whose AveRooms is entirely missing sum to 0.0
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # blank out AveRooms for every row in Longitude bin 'A'
        AveRooms = lambda df: np.where(
            df.Longitude_quantile == 'A',
            np.nan,
            df.AveRooms
        )
    )
    # keep every key combination, including empty ones
    .groupby(['Longitude_quantile', 'Latitude_quantile'], observed = False)
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
Longitude_quantile,Latitude_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A,B,1305.3961,8378.0,0.0,313.156577,339708.0,744.240966,10451.55,-34516.33,0
A,C,18729.7009,141177.0,0.0,4538.923774,5500625.0,12753.74477,161781.76,-522816.55,0
A,D,996.9044,10504.0,0.0,473.479018,452790.0,1091.299789,16729.71,-50672.31,0
B,A,1056.2787,6739.0,1386.823522,272.505981,288338.0,701.087382,8768.91,-30466.53,0
B,B,3877.5202,29937.0,6733.867915,1332.617743,1751553.0,4417.347216,45153.23,-149035.01,0
B,C,8172.5048,61228.0,15088.821185,2953.917482,3247146.0,7378.774382,94878.55,-300298.92,0
B,D,388.5333,3464.0,1096.758699,229.455783,147713.0,974.247962,6100.02,-18424.78,0
C,A,42765.0691,308440.0,54946.812021,11295.072712,16453885.0,32509.821824,359003.46,-1251802.73,0
C,B,1510.3183,13883.0,3032.84019,630.990025,732587.0,1688.679112,19605.5,-65271.38,0


In [94]:
# Aggregation without pivot table with missing values, using a lambda
# - lambda x: np.sum(x) skips missing values: all-missing groups give 0.0
# - Unlike .sum(), the empty group (A, A) comes back as NaN instead of 0
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # blank out AveRooms for every row in Longitude bin 'A'
        AveRooms = lambda df: np.where(
            df.Longitude_quantile == 'A',
            np.nan,
            df.AveRooms
        )
    )
    # keep every key combination, including empty ones
    .groupby(['Longitude_quantile', 'Latitude_quantile'], observed = False)
    .agg(lambda x: np.sum(x))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
Longitude_quantile,Latitude_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,A,,,,,,,,,
A,B,1305.3961,8378.0,0.0,313.156577,339708.0,744.240966,10451.55,-34516.33,0.0
A,C,18729.7009,141177.0,0.0,4538.923774,5500625.0,12753.74477,161781.76,-522816.55,0.0
A,D,996.9044,10504.0,0.0,473.479018,452790.0,1091.299789,16729.71,-50672.31,0.0
B,A,1056.2787,6739.0,1386.823522,272.505981,288338.0,701.087382,8768.91,-30466.53,0.0
B,B,3877.5202,29937.0,6733.867915,1332.617743,1751553.0,4417.347216,45153.23,-149035.01,0.0
B,C,8172.5048,61228.0,15088.821185,2953.917482,3247146.0,7378.774382,94878.55,-300298.92,0.0
B,D,388.5333,3464.0,1096.758699,229.455783,147713.0,974.247962,6100.02,-18424.78,0.0
C,A,42765.0691,308440.0,54946.812021,11295.072712,16453885.0,32509.821824,359003.46,-1251802.73,0.0
C,B,1510.3183,13883.0,3032.84019,630.990025,732587.0,1688.679112,19605.5,-65271.38,0.0


In [102]:
# Aggregation without pivot table with missing values
# .sum() -> same as np.nansum(x)
# lambda x: sum(x) -> return Error unsupported operand type(s) for +: 'int' and 'NoneType'
# lambda x: np.sum(x) -> same as np.nansum(x)
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # blank out AveRooms only for the (A, B) group
        # (alternative test condition for a single row: df.index == 20635)
        AveRooms = lambda df: np.where(
            (df.Longitude_quantile == 'A') & (df.Latitude_quantile == 'B'),
            np.nan,
            df.AveRooms
        )
    )
    # keep every key combination, including empty ones
    .groupby(['Longitude_quantile', 'Latitude_quantile'], observed = False)
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
Longitude_quantile,Latitude_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A,B,1305.3961,8378.0,0.0,313.156577,339708.0,744.240966,10451.55,-34516.33,0
A,C,18729.7009,141177.0,23161.795068,4538.923774,5500625.0,12753.74477,161781.76,-522816.55,0
A,D,996.9044,10504.0,2265.811828,473.479018,452790.0,1091.299789,16729.71,-50672.31,0
B,A,1056.2787,6739.0,1386.823522,272.505981,288338.0,701.087382,8768.91,-30466.53,0
B,B,3877.5202,29937.0,6733.867915,1332.617743,1751553.0,4417.347216,45153.23,-149035.01,0
B,C,8172.5048,61228.0,15088.821185,2953.917482,3247146.0,7378.774382,94878.55,-300298.92,0
B,D,388.5333,3464.0,1096.758699,229.455783,147713.0,974.247962,6100.02,-18424.78,0
C,A,42765.0691,308440.0,54946.812021,11295.072712,16453885.0,32509.821824,359003.46,-1251802.73,0
C,B,1510.3183,13883.0,3032.84019,630.990025,732587.0,1688.679112,19605.5,-65271.38,0


In [79]:
# Aggregation without pivot table with NA grouping keys
# - Rows with missing grouping keys are excluded
#   (groupby(..., dropna = False) would keep them as their own group)
# - Latitude_quantile is still categorical, so unobserved combinations are
#   still generated; `observed = False` states this explicitly
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
        Latitude_quantile = lambda df: pd.cut(df['Latitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
    .assign(
        # turn Longitude bin 'A' into a missing key
        Longitude_quantile = lambda df: np.where(
            df.Longitude_quantile == 'A',
            None,
            df.Longitude_quantile
        )
    )
    .groupby(['Longitude_quantile', 'Latitude_quantile'], observed = False)
    .sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
Longitude_quantile,Latitude_quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B,A,1056.2787,6739.0,1386.823522,272.505981,288338.0,701.087382,8768.91,-30466.53,0
B,B,3877.5202,29937.0,6733.867915,1332.617743,1751553.0,4417.347216,45153.23,-149035.01,0
B,C,8172.5048,61228.0,15088.821185,2953.917482,3247146.0,7378.774382,94878.55,-300298.92,0
B,D,388.5333,3464.0,1096.758699,229.455783,147713.0,974.247962,6100.02,-18424.78,0
C,A,42765.0691,308440.0,54946.812021,11295.072712,16453885.0,32509.821824,359003.46,-1251802.73,0
C,B,1510.3183,13883.0,3032.84019,630.990025,732587.0,1688.679112,19605.5,-65271.38,0
C,C,85.3486,509.0,315.127238,69.540884,21243.0,64.879654,976.73,-3087.79,0
C,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
D,A,990.739,6803.0,2360.568597,518.144267,477849.0,1039.297469,11815.08,-40946.37,0
D,B,12.3361,57.0,30.844395,7.570842,8403.0,14.901965,177.12,-580.0,0
