# Standardizing variables with window functions in `polars`

## Window functions in `Python`/`polars`

To make a computation with a *window function* in `polars`,

1. Use `select` or `with_columns` to create a new column,
2. Dot-chain `pl.col('col name')` into a summary function like `mean`, then finally
3. Apply `over` to that column summary to compute the window summary for each row.

In [1]:
import polars as pl
import polars.selectors as cs

In [2]:
# Toy data set: eight integer observations in three groups
# (two in 'a', three in 'b', three in 'c').
data = pl.DataFrame(
    {
        'Group': ['a'] * 2 + ['b'] * 3 + ['c'] * 3,
        'Data': [5, 3, 4, 1, 2, 3, 5, 3],
    }
)
data

Group,Data
str,i64
"""a""",5
"""a""",3
"""b""",4
"""b""",1
"""b""",2
"""c""",3
"""c""",5
"""c""",3


## Window functions over the default partition

To compute statistics over the default partition, e.g., the grand mean or grand total, simply apply the desired summary method to the column expression.

### Example 1 - Computing the grand mean, total, & SD

#### Simple aggregation

In [3]:
# `select` keeps only the listed expressions, so the whole `Data` column
# collapses to a single row of summary statistics.
data.select(
    pl.col('Data').mean().alias('grand_mean'),
    pl.col('Data').sum().alias('grand_total'),
    pl.col('Data').std().alias('grand_SD'),
)

grand_mean,grand_total,grand_SD
f64,i64,f64
3.25,26,1.38873


#### Grouped aggregation

In [4]:
# One summary row per distinct value of `Group`.
# NOTE: `group_by` does not promise any particular group order unless
# `maintain_order=True` is passed, so the rows may appear shuffled.
data.group_by('Group').agg(
    pl.col('Data').mean().alias('group_mean'),
    pl.col('Data').sum().alias('group_total'),
    pl.col('Data').std().alias('group_SD'),
)

Group,group_mean,group_total,group_SD
str,f64,i64,f64
"""b""",2.333333,7,1.527525
"""c""",3.666667,11,1.154701
"""a""",4.0,8,1.414214


#### Column statistics (Grand statistics)

In [6]:
# `with_columns` keeps every original row; each scalar summary is
# broadcast down the full length of the frame.
data.with_columns(
    pl.col('Data').mean().alias('grand_mean'),
    pl.col('Data').sum().alias('grand_total'),
    pl.col('Data').std().alias('grand_SD'),
)

Group,Data,grand_mean,grand_total,grand_SD
str,i64,f64,i64,f64
"""a""",5,3.25,26,1.38873
"""a""",3,3.25,26,1.38873
"""b""",4,3.25,26,1.38873
"""b""",1,3.25,26,1.38873
"""b""",2,3.25,26,1.38873
"""c""",3,3.25,26,1.38873
"""c""",5,3.25,26,1.38873
"""c""",3,3.25,26,1.38873


#### Column statistics (Group statistics)

In [7]:
# Appending `.over('Group')` turns each summary into a window function: the
# statistic is computed within each group and repeated on every row of that
# group.  The columns are therefore named group_* rather than grand_*.
( data
  .with_columns(group_mean = pl.col('Data').mean().over('Group'),
                group_total = pl.col('Data').sum().over('Group'),
                group_SD = pl.col('Data').std().over('Group'),
               )
)

Group,Data,group_mean,group_total,group_SD
str,i64,f64,i64,f64
"""a""",5,4.0,8,1.414214
"""a""",3,4.0,8,1.414214
"""b""",4,2.333333,7,1.527525
"""b""",1,2.333333,7,1.527525
"""b""",2,2.333333,7,1.527525
"""c""",3,3.666667,11,1.154701
"""c""",5,3.666667,11,1.154701
"""c""",3,3.666667,11,1.154701


### Standardizing variables using window functions

Simply make an expression that includes the window function column expression.

#### Example 2 - Various standardized fields

In [8]:
# Standardize `Data` against the whole column (the default partition):
#   mean_centered    - deviation from the grand mean
#   z_score          - deviation from the grand mean in units of the grand SD
#   percent_of_total - share of the grand total, in percent
std_data = data.with_columns(
    (pl.col('Data') - pl.col('Data').mean()).alias('mean_centered'),
    (
        (pl.col('Data') - pl.col('Data').mean())
        / pl.col('Data').std()
    ).alias('z_score'),
    (100 * pl.col('Data') / pl.col('Data').sum()).alias('percent_of_total'),
)
std_data

Group,Data,mean_centered,z_score,percent_of_total
str,i64,f64,f64,f64
"""a""",5,1.75,1.260144,19.230769
"""a""",3,-0.25,-0.180021,11.538462
"""b""",4,0.75,0.540062,15.384615
"""b""",1,-2.25,-1.620185,3.846154
"""b""",2,-1.25,-0.900103,7.692308
"""c""",3,-0.25,-0.180021,11.538462
"""c""",5,1.75,1.260144,19.230769
"""c""",3,-0.25,-0.180021,11.538462


#### Double-checking the standardization

In [9]:
# Sanity check: centered columns should average to 0, the z-score should
# have SD 1, and the percentages should add up to 100.
# Rounding to 5 decimals hides floating-point noise.
std_data.select(
    pl.mean('mean_centered').round(5).alias('mean of mean_centered'),
    pl.mean('z_score').round(5).alias('mean of z_score'),
    pl.std('z_score').round(5).alias('SD of z_score'),
    pl.sum('percent_of_total').round(5).alias('total of percent_of_total'),
)

mean of mean_centered,mean of z_score,SD of z_score,total of percent_of_total
f64,f64,f64,f64
0.0,-0.0,1.0,100.0


## <font color="red"> Exercise 3.3.1 - Range-scaling the variable.</font>

Use window functions to range-scale the `Data` column using the default partition.

$$y_{range\;scaled} = \frac{y - \min{y}}{\max{y} - \min{y}}$$

Double check that the new minimum and maximum are zero and one, respectively.

In [32]:
# Range-scale `Data` over the default partition:
# (y - min y) / (max y - min y), with min and max taken over the whole column.
RangeScaledData = data.with_columns(
    (
        (pl.col("Data") - pl.col("Data").min())
        / (pl.col("Data").max() - pl.col("Data").min())
    ).alias("Range Scale")
)

RangeScaledData

Group,Data,Range Scale
str,i64,f64
"""a""",5,1.0
"""a""",3,0.5
"""b""",4,0.75
"""b""",1,0.0
"""b""",2,0.25
"""c""",3,0.5
"""c""",5,1.0
"""c""",3,0.5


In [34]:
# The scaling used the default partition (the whole column), so the check
# must look at the whole column too: overall min should be 0 and overall
# max should be 1.  Per-group extremes need not hit 0 and 1.
RangeScaledData.select(
    pl.col("Range Scale").min().alias("MinScaled"),
    pl.col("Range Scale").max().alias("MaxScaled"),
)

MinScaled,MaxScaled
f64,f64
0.0,1.0


## Computing summaries `over` a partition

To add a partition other than the default, we use the `.over` method on the column expression.

### Example 3 - Computing the group mean over `Group`

In [17]:
data.with_columns(
    # Correct order: summarize first, then `over` spreads that summary
    # across the rows of each group.
    pl.col('Data').mean().over('Group').alias('group_mean'),
    # Wrong order: `mean` is applied last, so it averages the entire column
    # and every row receives the grand mean instead of its group mean.
    pl.col('Data').over('Group').mean().alias('BAD'),
)

Group,Data,group_mean,BAD
str,i64,f64,f64
"""a""",5,4.0,3.25
"""a""",3,4.0,3.25
"""b""",4,2.333333,3.25
"""b""",1,2.333333,3.25
"""b""",2,2.333333,3.25
"""c""",3,3.666667,3.25
"""c""",5,3.666667,3.25
"""c""",3,3.666667,3.25


#### Example 4 - Standardizing by group

In [28]:
# Same three standardizations as the whole-column version, but every summary
# is windowed with `.over('Group')`, so each row is compared with its own group.
std_by_grp = data.with_columns(
    (pl.col('Data') - pl.col('Data').mean().over('Group')).alias('mean_centered'),
    (
        (pl.col('Data') - pl.col('Data').mean().over('Group'))
        / pl.col('Data').std().over('Group')
    ).alias('z_score'),
    (100 * pl.col('Data') / pl.col('Data').sum().over('Group')).alias('percent_of_total'),
)
std_by_grp

Group,Data,mean_centered,z_score,percent_of_total
str,i64,f64,f64,f64
"""a""",5,1.0,0.707107,62.5
"""a""",3,-1.0,-0.707107,37.5
"""b""",4,1.666667,1.091089,57.142857
"""b""",1,-1.333333,-0.872872,14.285714
"""b""",2,-0.333333,-0.218218,28.571429
"""c""",3,-0.666667,-0.57735,27.272727
"""c""",5,1.333333,1.154701,45.454545
"""c""",3,-0.666667,-0.57735,27.272727


In [30]:
# Per-group sanity check on every float column: the mean, SD and sum of each
# standardized field, rounded to 5 decimals.  The prefix records which
# statistic produced each output column.
std_by_grp.group_by('Group').agg(
    [
        cs.float().mean().round(5).name.prefix('mean of '),
        cs.float().std().round(5).name.prefix('SD of '),
        cs.float().sum().round(5).name.prefix('sum of '),
    ]
)

Group,mean of mean_centered,mean of z_score,mean of percent_of_total,SD of mean_centered,SD of z_score,SD of percent_of_total,sum of mean_centered,sum of z_score,sum of percent_of_total
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""c""",0.0,0.0,33.33333,1.1547,1.0,10.49728,0.0,0.0,100.0
"""b""",-0.0,-0.0,33.33333,1.52753,1.0,21.82179,-0.0,-0.0,100.0
"""a""",0.0,0.0,50.0,1.41421,1.0,17.67767,0.0,0.0,100.0


## <font color="red"> Exercise 3.3.2 - Range-scaling `Data` by `Group`</font>

Now range-scale the `Data` column using `Group` as the partition.

$$y_{range\;scaled} = \frac{y - \min{y}}{\max{y} - \min{y}}$$

Double check that the new minimum and maximum are zero and one within each group, respectively.

In [26]:
# Range-scale `Data` within each group: the min and max are window summaries
# over `Group`, so every group is mapped onto its own [0, 1] interval.
RangeScaledByGroup = data.with_columns(
    (
        (pl.col("Data") - pl.col("Data").min().over("Group"))
        / (pl.col("Data").max().over("Group") - pl.col("Data").min().over("Group"))
    ).alias("Range Scale by group")
)

RangeScaledByGroup
                          

Group,Data,Range Scale by group
str,i64,f64
"""a""",5,1.0
"""a""",3,0.0
"""b""",4,1.0
"""b""",1,0.0
"""b""",2,0.333333
"""c""",3,0.0
"""c""",5,1.0
"""c""",3,0.0


In [35]:
# Confirm that, within every group, the scaled values span exactly 0 to 1.
RangeScaledByGroup.group_by("Group").agg(
    MinScaled=pl.col("Range Scale by group").min(),
    MaxScaled=pl.col("Range Scale by group").max(),
)


Group,MinScaled,MaxScaled
str,f64,f64
"""a""",0.0,1.0
"""c""",0.0,1.0
"""b""",0.0,1.0
