# Pandas data processing

## Versions being tested
- pandas: 2.0.0
- numpy: 1.24.2

## Data processing major steps
- Data summary
- Filtering by rows
- Filtering by columns
- Create new columns
- Aggregation
- Join with other dataframe

## Notes on comparison with R data proccessing
Summarise data (dplyr::group_by then an aggregate function)
- Handling of missing values in group-by key
    - R dplyr: retain and create a new group
    - Python pandas: drop the group
    
Basic functions
- Pandas Series string handling do not handle row-wise pattern nor missing values. 
- Numpy min and numpy max when working with missing data for numeric and date types
    - R: return missing if one or more elements of the input list are missing
    - Python: ignore the missing elements and process the rest of the list like normal. If the rest of the list is empty, return 0

In [9]:
import pandas as pd
import numpy as np

In [41]:
# sample data
from sklearn.datasets import load_diabetes
diabetes = load_diabetes(as_frame=True, return_X_y=True)[0]
diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


#### Data summary

In [42]:
housing.shape

(20640, 8)

In [43]:
housing.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
Latitude      float64
Longitude     float64
dtype: object

In [44]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [45]:
housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


#### Filter rows

In [46]:
# conditions with query
housing.query("(AveBedrms < 2) & (HouseAge > 50)")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
...,...,...,...,...,...,...,...,...
20142,1.8618,52.0,4.157718,1.073826,934.0,3.134228,34.36,-119.06
20220,4.1250,52.0,5.639798,1.057935,941.0,2.370277,34.28,-119.27
20236,2.3750,52.0,4.289720,1.046729,276.0,2.579439,34.27,-119.27
20237,3.5893,52.0,4.707463,1.023881,836.0,2.495522,34.27,-119.27


In [20]:
# conditions without query
housing[(housing['AveBedrms'] < 2) & (housing['HouseAge'] > 50)]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
...,...,...,...,...,...,...,...,...
20142,1.8618,52.0,4.157718,1.073826,934.0,3.134228,34.36,-119.06
20220,4.1250,52.0,5.639798,1.057935,941.0,2.370277,34.28,-119.27
20236,2.3750,52.0,4.289720,1.046729,276.0,2.579439,34.27,-119.27
20237,3.5893,52.0,4.707463,1.023881,836.0,2.495522,34.27,-119.27


In [17]:
# index
housing.loc[[2, 3, 4]]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
# index location
housing.iloc[[0, 100]]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
100,2.4912,29.0,3.7248,1.1312,2304.0,1.8432,37.81,-122.25


#### Filter columns

In [22]:
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [23]:
# specific column names
housing.filter(['MedInc','HouseAge'])

Unnamed: 0,MedInc,HouseAge
0,8.3252,41.0
1,8.3014,21.0
2,7.2574,52.0
3,5.6431,52.0
4,3.8462,52.0
...,...,...
20635,1.5603,25.0
20636,2.5568,18.0
20637,1.7000,17.0
20638,1.8672,18.0


In [24]:
# column locations
housing.iloc[:, [1, 2]]

Unnamed: 0,HouseAge,AveRooms
0,41.0,6.984127
1,21.0,6.238137
2,52.0,8.288136
3,52.0,5.817352
4,52.0,6.281853
...,...,...
20635,25.0,5.045455
20636,18.0,6.114035
20637,17.0,5.205543
20638,18.0,5.329513


In [27]:
# column name with regex
housing.filter(regex = "(ude)$|(Ave)")

Unnamed: 0,AveRooms,AveBedrms,AveOccup,Latitude,Longitude
0,6.984127,1.023810,2.555556,37.88,-122.23
1,6.238137,0.971880,2.109842,37.86,-122.22
2,8.288136,1.073446,2.802260,37.85,-122.24
3,5.817352,1.073059,2.547945,37.85,-122.25
4,6.281853,1.081081,2.181467,37.85,-122.25
...,...,...,...,...,...
20635,5.045455,1.133333,2.560606,39.48,-121.09
20636,6.114035,1.315789,3.122807,39.49,-121.21
20637,5.205543,1.120092,2.325635,39.43,-121.22
20638,5.329513,1.171920,2.123209,39.43,-121.32


#### Create new columns

In [54]:
# with assign
housing_new = (
    housing
    .assign(
        Longitude_cut = lambda df: pd.cut(df['Longitude'], bins = 4, labels = ['A', 'B', 'C', 'D']),
        Longitude_qcut = lambda df: pd.qcut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D']),
    )
)

In [55]:
pd.crosstab(housing_new['Longitude_cut'], housing_new['Longitude_qcut'])

Longitude_qcut,A,B,C,D
Longitude_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,4971,0,0,0
B,194,3929,0,0
C,0,1269,5132,4787
D,0,0,0,358


In [56]:
# without assign
housing['new_col'] = 1
housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,new_col
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,1
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,1
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,1
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,1
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,1
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,1
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,1
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,1
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,1


#### Aggregation

In [49]:
(
    housing
    .assign(
        Longitude_quantile = lambda df: pd.cut(df['Longitude'], 4, labels = ['A', 'B', 'C', 'D'])
    )
)

Longitude_quantile
C    11188
A     4971
B     4123
D      358
Name: count, dtype: int64

In [80]:
# Aggregation with pivot table
(
    housing
    .assign()
    .pivot_table(
        ['x', 'y'],
        index = ['k1', 'k2'], 
        aggfunc = {
            'x': [np.sum, np.min, np.max, np.mean, np.median, np.std],
            'y': [np.min, np.max],
        })
    .reset_index()
)

Unnamed: 0_level_0,k1,k2,x,x,x,x,x,x,y,y
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amax,amin,mean,median,std,sum,amax,amin
0,a,1,6,0,3.0,3.0,4.242641,6,2023-06-07 23:26:32.345608,2023-06-01 23:26:32.345585
1,a,2,7,1,4.0,4.0,4.242641,8,2023-06-08 23:26:32.345610,2023-06-02 23:26:32.345596
2,a,3,8,2,5.0,5.0,4.242641,10,2023-06-09 23:26:32.345612,2023-06-03 23:26:32.345600
3,b,1,9,3,6.0,6.0,4.242641,12,2023-06-10 23:26:32.345614,2023-06-04 23:26:32.345602
4,b,2,10,4,7.0,7.0,4.242641,14,2023-06-11 23:26:32.345616,2023-06-05 23:26:32.345604
5,b,3,11,5,8.0,8.0,4.242641,16,2023-06-12 23:26:32.345618,2023-06-06 23:26:32.345606


**Most functions will ignore missing values or replace with zero**

In [81]:
np.sum([])

0.0

In [82]:
np.mean([])

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan

In [83]:
np.min([])

ValueError: zero-size array to reduction operation minimum which has no identity

In [84]:
np.sum([1, np.nan])

nan

In [85]:
np.mean([1, np.nan])

nan

In [86]:
np.min([1, np.nan])

nan

In [87]:
np.max([1, np.nan])

nan

In [88]:
np.min([datetime.datetime.today() + datetime.timedelta(days = x) for x in range(3)])

datetime.datetime(2023, 6, 1, 23, 27, 32, 326322)

In [89]:
np.min([datetime.datetime.today() + datetime.timedelta(days = x) for x in range(3)] + [np.nan])

TypeError: '<=' not supported between instances of 'datetime.datetime' and 'float'

####  Aggregation

In [95]:
df_altered = (
    df_test
    .assign(
        x = lambda df: np.where(df.index.isin([0, 1, 6]), np.nan, df.x),
        y = lambda df: np.where(df.index.isin([0, 1, 6]), pd.NaT, df.y),
    )
)

df_altered

Unnamed: 0,k1,k2,x,y
0,a,1,,NaT
1,a,2,,NaT
2,a,3,2.0,1685834792345600000
3,b,1,3.0,1685921192345602000
4,b,2,4.0,1686007592345604000
5,b,3,5.0,1686093992345606000
6,a,1,,NaT
7,a,2,7.0,1686266792345610000
8,a,3,8.0,1686353192345612000
9,b,1,9.0,1686439592345614000


In [96]:
(
    df_altered
    .pivot_table(
        ['x', 'y'],
        index = ['k1', 'k2'], 
        aggfunc = {
            'x': [np.sum, np.min, np.max, np.mean, np.median, np.std],
            'y': [np.min, np.max],
        })
    .reset_index()
)

Unnamed: 0_level_0,k1,k2,x,x,x,x,x,x,y,y
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,amax,amin,mean,median,std,sum,amax,amin
0,a,1,,,,,,0.0,,
1,a,2,7.0,7.0,7.0,7.0,,7.0,1.68626679234561e+18,1.68626679234561e+18
2,a,3,8.0,2.0,5.0,5.0,4.242641,10.0,1.686353192345612e+18,1.6858347923456e+18
3,b,1,9.0,3.0,6.0,6.0,4.242641,12.0,1.686439592345614e+18,1.685921192345602e+18
4,b,2,10.0,4.0,7.0,7.0,4.242641,14.0,1.686525992345616e+18,1.686007592345604e+18
5,b,3,11.0,5.0,8.0,8.0,4.242641,16.0,1.686612392345618e+18,1.686093992345606e+18


**When group by key is missing, it will be dropped from pivot_table result**

In [76]:
df_altered = (
    df_test
    .assign(k2 = lambda df: np.where(df.k2 == 2, np.nan, df.k2))
)
df_altered

Unnamed: 0,k1,k2,x
0,a,1.0,0
1,a,,1
2,a,3.0,2
3,b,1.0,3
4,b,,4
5,b,3.0,5
6,a,1.0,6
7,a,,7
8,a,3.0,8
9,b,1.0,9


In [29]:
(
    df_altered
    .pivot_table(
        'x',
        index = ['k1', 'k2'], 
        aggfunc = np.sum)
    .reset_index()
)

Unnamed: 0,k1,k2,x
0,a,1.0,6
1,a,3.0,10
2,b,1.0,12
3,b,3.0,16
