# Setup

In [1]:
!python --version

Python 3.9.5


In [2]:
!pip install helpsk --upgrade



In [3]:
import pandas as pd
import numpy as np
from helpsk import string

# `is` vs `==`

`==` is an equality test. It checks whether the right hand side and the left hand side are equal objects (according to their __eq__ or __cmp__ methods.)

is is an identity test. It checks whether the right hand side and the left hand side are the very same object. No methodcalls are done, objects can't influence the is operation.

You use `is` (and `is not`) for `singletons`, like `None`, where you don't care about objects that might want to pretend to be None or where you want to protect against objects breaking when being compared against None.

# List & DataFrame Equality

In [4]:
[1, 2, 3] == [1, 2, 3]

True

In [5]:
[1, 2, 3] == [1, 2, 3.00001]

False

In [6]:
['a', 'b', 'c'] == ['a', 'b', 'c']

True

In [7]:
['a', 'b', None] == ['a', 'b', None]

True

In [8]:
['a', 'b', 'c'] == ['a', 'b', None]

False

In [9]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3]]))

True

In [10]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3.001]]))

False

In [11]:
pd.DataFrame([[1, None], [2, 3.001]]).equals(pd.DataFrame([[1, None], [2, 3.001]]))

True

# Checking for Empty Containers

In [12]:
a = []

len(a) == 0  # not Pythonic

True

In [13]:
not a  # Pythonic

True

In [14]:
if len(a) == 0:
    print('Not Pythonic')

if not a:
    print('Pythonic')


Not Pythonic
Pythonic


In [15]:
a = ['a']

if len(a) != 0:
    print('Not Pythonic')

if a:
    print('Pythonic')

Not Pythonic
Pythonic


# Multi-lines

In [16]:
# not Pythonic
a = 1 + \
    2
print(a)

a = (1 +  # Pythonic
     2)
print(a)

3
3


# Classes

## Private vs Protected

In [17]:
class Example:
    def __init__(self):
        self.__private_variable = 'private'
        self._protected_variable = 'protected'
    
    def instance_method(self):
        return 'instance method ' + self.__private_variable + self._protected_variable
    
    @staticmethod
    def static_method(x):
        return 'static method ' + x
    
    @classmethod
    def class_method(cls, x):
        return 'class method ' + x

In [18]:
ex = Example()
ex._protected_variable

'protected'

In [19]:
try:
  print(ex.__private_variable)
except Exception as e:
  print(e)

'Example' object has no attribute '__private_variable'


In [20]:
ex._Example__private_variable

'private'

In [21]:
ex.instance_method()

'instance method privateprotected'

In [22]:
ex.static_method('a')  # not sure how this is useful

'static method a'

In [23]:
Example.class_method('a')  # no instance

'class method a'

## Getters / Setters

In [24]:
class ExampleClass:

    def __init__(self, my_variable=0):
        self._my_variable = my_variable

    @property
    def my_variable(self):
        """I'm the 'my_variable' property."""
        print("getter of my_variable called")
        return self._my_variable

    @my_variable.setter
    def my_variable(self, value):
        print("setter of my_variable called")
        self._my_variable = value

In [25]:
example = ExampleClass()
example.my_variable == 0

getter of my_variable called


True

In [26]:
example.my_variable = 0

setter of my_variable called


# Pandas

In [27]:
import numpy as np
import pandas as pd

dates = pd.date_range("20210809", periods=6)
df = pd.DataFrame(np.random.randn(6, 4),
                  index=dates,
                  columns=list("ABCD"))

In [28]:
dates

DatetimeIndex(['2021-08-09', '2021-08-10', '2021-08-11', '2021-08-12',
               '2021-08-13', '2021-08-14'],
              dtype='datetime64[ns]', freq='D')

In [29]:
df

Unnamed: 0,A,B,C,D
2021-08-09,1.570481,0.268844,-1.143116,0.583196
2021-08-10,-1.111954,0.915616,0.728394,-0.463511
2021-08-11,1.324238,0.854334,-0.184112,-1.052269
2021-08-12,2.429699,-0.411204,0.028047,0.226879
2021-08-13,0.751521,0.077251,0.722137,-0.255953
2021-08-14,-0.259515,-0.933734,0.454261,1.127243


In [30]:
# axis 0 indicates column-wise operations
df.apply(lambda x: x.mean(), axis=0)

A    0.784078
B    0.128518
C    0.100935
D    0.027597
dtype: float64

## Copy by Reference

In [31]:
df_2 = df

In [32]:
df.iat[0, 0] = np.nan

In [33]:
df.iloc[0, :]

A         NaN
B    0.268844
C   -1.143116
D    0.583196
Name: 2021-08-09 00:00:00, dtype: float64

In [34]:
df_2.iloc[0, :]

A         NaN
B    0.268844
C   -1.143116
D    0.583196
Name: 2021-08-09 00:00:00, dtype: float64

### Use `.copy()` for deep copy

In [35]:
df_2 = df.copy()

In [36]:
df.iat[0, 1] = np.nan

In [37]:
df.iloc[0, :]

A         NaN
B         NaN
C   -1.143116
D    0.583196
Name: 2021-08-09 00:00:00, dtype: float64

In [38]:
df_2.iloc[0, :]

A         NaN
B    0.268844
C   -1.143116
D    0.583196
Name: 2021-08-09 00:00:00, dtype: float64

## Selection

In [39]:
df.A

2021-08-09         NaN
2021-08-10   -1.111954
2021-08-11    1.324238
2021-08-12    2.429699
2021-08-13    0.751521
2021-08-14   -0.259515
Freq: D, Name: A, dtype: float64

In [40]:
df['A']  # all rows of single column

2021-08-09         NaN
2021-08-10   -1.111954
2021-08-11    1.324238
2021-08-12    2.429699
2021-08-13    0.751521
2021-08-14   -0.259515
Freq: D, Name: A, dtype: float64

In [41]:
df[0:1]  # slices rows; excludes last index

Unnamed: 0,A,B,C,D
2021-08-09,,,-1.143116,0.583196


In [42]:
df[['A']]

Unnamed: 0,A
2021-08-09,
2021-08-10,-1.111954
2021-08-11,1.324238
2021-08-12,2.429699
2021-08-13,0.751521
2021-08-14,-0.259515


If passing a single value to `.loc`, a Series is returned; if passing a list to `.loc` a DataFrame is returned

In [43]:
df.loc[pd.to_datetime('2021-08-09')]

A         NaN
B         NaN
C   -1.143116
D    0.583196
Name: 2021-08-09 00:00:00, dtype: float64

In [44]:
df.loc[[pd.to_datetime('2021-08-09')]]

Unnamed: 0,A,B,C,D
2021-08-09,,,-1.143116,0.583196


In [45]:
df.loc[[pd.to_datetime('2021-08-09'), pd.to_datetime('2021-08-10')]]

Unnamed: 0,A,B,C,D
2021-08-09,,,-1.143116,0.583196
2021-08-10,-1.111954,0.915616,0.728394,-0.463511


### `at` vs `loc`

- `df.at` is faster
- `df.at` can only access a single value at a time.
- `df.loc` can select multiple rows and/or columns.


https://stackoverflow.com/questions/37216485/pandas-at-versus-loc

## Get

method that allows you to index on a collection and return default value if the index is out of range

### Dictionary

In [46]:
{'a': 1, 'b': 2}.get('c') is None

True

In [47]:
{'a': 1, 'b': 2}.get('c', 0)

0

In [48]:
# ['c'] raises KeyError
from helpsk import validation
validation.raises_exception(lambda: {'a': 1, 'b': 2}['c'], KeyError)

True

### Series

In [49]:
dates[0]

Timestamp('2021-08-09 00:00:00', freq='D')

In [50]:
df.D.get(dates[0])

0.5831958448204407

In [51]:
df.D.get(pd.date_range('2021-01-01', periods=1)[0]) is None

True

In [52]:
df.D.get(pd.date_range('2021-01-01', periods=1)[0], np.NaN)

nan

## Vectorized Operations - Automatic Alignment

A key difference between Series and ndarray is that operations between Series automatically align the data based on label. Thus, you can write computations without giving consideration to whether the Series involved have the same labels.

https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#vectorized-operations-and-label-alignment-with-series

In [53]:
s1 = pd.Series([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
s2 = pd.Series([10, 20, 30, 40, 50], index=['A', 'B', 'C', 'D', 'E'])

s1 + s2

A    11
B    22
C    33
D    44
E    55
dtype: int64

In [54]:
print(s1.iloc[[4, 3, 2, 1, 0]])
s + s2

E    5
D    4
C    3
B    2
A    1
dtype: int64


NameError: name 's' is not defined

The result of an operation between unaligned Series will have the union of the indexes involved. If a label is not found in one Series or the other, the result will be marked as missing NaN. 

In [None]:
s.iloc[[2, 3, 4]] + s2.iloc[[0, 1, 2, 3]]

## Applying Functions

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#function-application

### Tablewise function application - Chaining Function Calls with `pipe()`

In [None]:
df_p = pd.DataFrame({"city_and_code": ["Chicago, IL", "Seattle, WA"]})
df_p

In [None]:
df_p["city_and_code"].str.split(",")

In [None]:
df_p["city_and_code"].str.split(",").str.get(1)

In [None]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column
    """
    df["city_name"] = df["city_and_code"].str.split(",").str.get(0)
    return df

def add_country_name(df, country_name=None):
    """
    Chicago -> Chicago-US for city_name column
    """
    col = "city_name"
    df["city_and_country"] = df[col] + country_name
    return df

In [None]:
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Python is pass by reference, so, unlike R, the DataFrame changes without having to make final assignment back into `df_p`. Instaed of returning the DataFrame object we are returning a pointer do the object.

In [None]:
df_p

## Pandas Aggregations

In [None]:
import pandas as pd
import numpy as np

data = pd.DataFrame({
    'col1':['a','a','a','a','a','b','b','b','b','b'],
    'col2':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'col3':[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
})

In [None]:
data

### Use `.agg()` for simple functions on single columns

Meaning, there isn't any interaction between columns. (i.e. can't do `col2 + col3`)

In [None]:
data.groupby('col1').agg({'col2': [min, max], 'col3': [min, np.median, max]})

### Use .apply() to create interaction among multiple columns

https://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns

In [None]:
data

R Equivalent (although not quite because we don't get to name the column; returning column is `col1`, not `result_a`):

```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3))
```

In [None]:
grouped = data.groupby('col1')

def my_function(group):
    col2 = group['col2']
    col3 = group['col3']
    return (col2 * col3).sum()

result = grouped.apply(my_function)
result

In [None]:
result['b']

---

Let's remove the .sum() and just multiply across groups, to see what the result is.

In [None]:
grouped = data.groupby('col1')

def my_function(group):
    col2 = group['col2']
    col3 = group['col3']
    return (col2 * col3)#.sum()

result = grouped.apply(my_function)
result

---

### `group_by(...) %>% summarise(...)`

The following examples shows the equivalent of R's `group_by(...) %>% summarise(...)`

https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns

Unlike the last examples, we can A) name the columns and B) interact the grouped columns.

In [None]:
data

In [None]:
def f(x):
    d = {}
    d['result_a'] = (x['col2'] * x['col3']).sum()
    d['result_b'] = x['col2'].sum() * x['col3'].sum()
    d['result_c'] = (x['col2'] + x['col3']).values
    d['count'] = x.shape[0]
    d['col2_max'] = x['col2'].max()
    d['col3_max'] = x['col3'].max()
    return pd.Series(d)

data.groupby('col1').apply(f)

which is equivalent to:

```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3),
              result_b = sum(col2) * sum(col3),
              ...)
```

### `group_by(...) %>% mutate(...)`

https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07


```
df %>% group_by(group) %>% mutate(mean_var1 = mean(var1))  # R
```

In [None]:
df = pd.DataFrame(data = {'group': ['A', 'A', 'B', 'B'],
                          'var1': [1, 2, 3, 4],
                          'var2': [40, 30, 20, 10]})
df

In [None]:
df.assign(mean_var1 = lambda x: x.groupby('group')['var1'].transform('mean'))

In [None]:
# here is what the inside of the lambda function above gives without .assign()
df.groupby('group')['var1'].transform('mean')

TODO: show more advanced example of above with column interactions

In [None]:
df.groupby('group').transform('mean')

---

In [None]:
df.var1.rank(ascending=False)

In [None]:
df.groupby('group')['var1'].transform(lambda x: x.rank(ascending=False))

In [None]:


df.groupby('group').transform(lambda x: x.rank(ascending=False))

### `pivot_wider()`

#### via `pivot()`

https://stackoverflow.com/questions/40229444/trouble-pivoting-in-pandas-spread-in-r

In [None]:
df = pd.DataFrame({'site_id': {0: 'a', 1: 'a', 2: 'b', 3: 'b', 4: 'c', 5: 'c',6: 'a', 7: 'a', 8: 'b', 9: 'b', 10: 'c', 11: 'c'},
                   'dt': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1,6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2},
                   'eu': {0: 'FGE', 1: 'WSH', 2: 'FGE', 3: 'WSH', 4: 'FGE', 5: 'WSH',6: 'FGE', 7: 'WSH', 8: 'FGE', 9: 'WSH', 10: 'FGE', 11: 'WSH'},
                   'kw': {0: '8', 1: '5', 2: '3', 3: '7', 4: '1', 5: '5',6: '2', 7: '3', 8: '5', 9: '7', 10: '2', 11: '5'}})
df

In [None]:
# notice `dt` is first so it is the first level index
df.pivot(index = ['dt', 'site_id'], values = 'kw', columns = 'eu')

---

#### via `set_index()` & 'unstack()`

In [None]:
df.set_index(['dt','site_id','eu']).unstack('eu')

---

equivalent to R's `pivot_longer(-site_id)`

In [None]:
df.set_index(['dt','site_id','eu']).unstack('site_id')

---

#### from wide to long and back

In [None]:
df_wider = df.set_index(['dt','site_id','eu']).unstack('site_id')
df_wider

In [None]:
# back to longer
df_wider.stack()

In [None]:
# multi-index select
df_wider.loc[:, ['kw']]

In [None]:
# multi-index select; pass tuple
df_wider.loc[([1, 2], 'FGE'), ('kw', ['a', 'c'])]

### `pivot_longer()`

#### `melt()`

In [None]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
df

In [None]:
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

#### `wide_to_long()`

Another way to transform is to use the wide_to_long() panel data convenience function. It is less flexible than melt(), but more user-friendly.

https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

> With stubnames [‘A’, ‘B’], this **function expects to find one or more group of columns with format A-suffix1, A-suffix2,…, B-suffix1, B-suffix2,…** You specify what you want to call this suffix in the resulting long format with j (for example j=’year’)

In [None]:
df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
                   "A1980" : {0 : "d", 1 : "e", 2 : "f"},
                   "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
                   "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
                   "X"     : dict(zip(range(3), np.random.randn(3)))
                  })
df["id"] = df.index

In [None]:
df

In [None]:
pd.wide_to_long(df, ["A", "B"], i="id", j="year")

## Misc

### Series.str

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them).

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [None]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

In [None]:
s.str.capitalize()

### Recipes

#### Get Top N Rows for Each Group

In [None]:
df = pd.DataFrame({'site_id': {0: 'a', 1: 'a', 2: 'b', 3: 'b', 4: 'c', 5: 'c',6: 'a', 7: 'a', 8: 'b', 9: 'b', 10: 'c', 11: 'c'},
                   'dt': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1,6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2},
                   'eu': {0: 'FGE', 1: 'WSH', 2: 'FGE', 3: 'WSH', 4: 'FGE', 5: 'WSH',6: 'FGE', 7: 'WSH', 8: 'FGE', 9: 'WSH', 10: 'FGE', 11: 'WSH'},
                   'kw': {0: '8', 1: '5', 2: '3', 3: '7', 4: '1', 5: '5',6: '2', 7: '3', 8: '5', 9: '7', 10: '2', 11: '5'}})
df = df.sort_values(by=['site_id', 'kw'])

In [None]:
df

In [None]:
def top_n(df, n, column):
    return df.sort_values(by=column)[-n:]

top_n(df, n=2, column='kw')

In [None]:
df.groupby('site_id').apply(top_n, n=2, column='kw')

## Performance

Accelerated operations

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#accelerated-operations


pandas has support for accelerating certain types of binary numerical and boolean operations using the numexpr library and the bottleneck libraries.

These libraries are especially useful when dealing with large data sets, and provide large speedups. numexpr uses smart chunking, caching, and multiple cores. bottleneck is a set of specialized cython routines that are especially fast when dealing with arrays that have nans.

Here is a sample (using 100 column x 100,000 row DataFrames):

Operation
0.11.0 (ms)
Prior Version (ms)
Ratio to Prior
df1 > df2
13.32
125.35
0.1063
df1 * df2
21.71
36.63
0.5928
df1 + df2
22.04
36.50
0.6039

**You are highly encouraged to install both libraries**. See the section Recommended Dependencies for more installation info.

These are both enabled to be used by default, you can control this by setting the options:

```
pd.set_option("compute.use_bottleneck", False)
pd.set_option("compute.use_numexpr", False)
```

# Dates & Times

## `parse()`

In [None]:
from dateutil.parser import parse
parse('2021-08-14')

In [None]:
parse('2021-08-14 23:59:58')

In [None]:
dates = [parse('2021-08-14'), parse('2021-08-15'), parse('2021-08-16'), parse('2021-08-16'),
         parse('2021-09-01'), parse('2022-01-01')]
dates

In [None]:
df = pd.DataFrame({'value': [1, 2, 3, 3.1, 4, 5]}, index=dates)
df

In [None]:
df.index.is_unique

In [None]:
df.loc['2021-08-15']

In [None]:
df.loc['20210815']

In [None]:
df.loc[dates[1]]

In [None]:
df.loc['2021']

In [None]:
df.get('2020')

In [None]:
df.get('2021')

In [None]:
df.get('2021-08')

In [None]:
df.get('2022')

In [None]:
df.loc['2021-08-16']

In [None]:
df.get('2023')

In [None]:
# if ordered chronologically, you can slice with timestamp range
df['2021-08': '2021-09']

In [None]:
df.truncate(after='2021-09')

In [None]:
df.truncate(before='2021-09')

In [None]:
df.groupby(level=0).count()

## date_range()

In [None]:
pd.date_range('2021-02', '2022-01-31')

In [None]:
# returns the first day of the quarter between the range provided
pd.date_range('2021-02', '2022-01-31', freq='QS-JAN')

In [None]:
# returns the first *business* day of the quarter between the range provided
# Jan 1st 2022 falls on a saturday, the first business day is Monday the 3rd
pd.date_range('2021-02', '2022-01-31', freq='BQS-JAN')

In [None]:
# returns the first day of the fiscal quarter (first quarter of the year is FEB) 
# between the range provided
pd.date_range('2021-02', '2022-01-31', freq='QS-FEB')

In [None]:
# returns the first *business* day of the fiscal quarter (first quarter of the year is FEB) 
# between the range provided
pd.date_range('2021-02', '2022-01-31', freq='BQS-FEB')

## Offsets e.g. `MonthEnd()`

In [None]:
# M = Month End
pd.date_range('2021-02', '2021-05', freq='M')

In [None]:
# MS = Month Start
pd.date_range('2021-02', '2021-05', freq='MS')

---

In [None]:
from pandas.tseries.offsets import Day, MonthBegin, MonthEnd, QuarterBegin, QuarterEnd

In [None]:
pd.date_range('2021-02', '2021-05', freq='M')

In [None]:
# i would have expected no change if doing `+ MonthEnd()` to last day of the month but that isn't the case
pd.date_range('2021-02', '2021-05', freq='M') + MonthEnd()

In [None]:
pd.date_range('2021-02', '2021-02', freq='MS').values

In [None]:
pd.date_range('2021-02', '2021-05', freq='MS') + pd.offsets.MonthEnd(n=0) - pd.offsets.MonthBegin(n=1)

In [None]:
pd.date_range('2021-02', '2021-05', freq='M') + pd.offsets.MonthEnd(n=0) - pd.offsets.MonthBegin(n=1)

In [None]:
pd.date_range('2021-02', '2021-05', freq='MS') + MonthEnd()

In [None]:
# added MonthBegin() gives the beginning of next month
pd.date_range('2021-02', '2021-05', freq='MS') + MonthEnd() + MonthBegin()

In [None]:
# subtracting MonthBegin() gives the beginning of current month
pd.date_range('2021-02', '2021-05', freq='MS') + MonthEnd() - MonthBegin()

In [None]:
pd.date_range('2021-02', '2021-05', freq='MS') + MonthEnd(2)

---

In [None]:
pd.date_range('2021-01', '2021-12', freq='MS')

In [None]:
pd.date_range('2021-01', '2021-12', freq='MS') - MonthBegin()

In [None]:
pd.date_range('2021-01', '2021-12', freq='MS') + Day(2) - MonthBegin()

In [None]:
starting_fiscal_month = 2
quarter_end_month = starting_fiscal_month + 3
pd.date_range('2021-01', '2021-12', freq='MS') + QuarterEnd(startingMonth=1)

---

In [None]:
pd.date_range('2021-02', '2021-05', freq='MS') + Day()

In [None]:
pd.date_range('2021-02', '2021-05', freq='MS') + Day(2)

### `rollforward()` & `rollback()`

In [None]:
from helpsk import date
import datetime

---

In [None]:
MonthBegin().rollback(parse('2021-01-01'))

In [None]:
MonthBegin().rollback(parse('2021-01-31'))

In [None]:
MonthEnd().rollforward(date.ymd('2021-01-01'))

In [None]:
MonthEnd().rollforward(date.ymd('2021-01-31'))

---

In [None]:
# default value for startingMonth doesn't make sense
QuarterBegin().rollback(date.ymd('2021-01-01'))

---

### Fiscal Quarter with `QuarterBegin(startingMonth=2)`

In [None]:
dates = np.append(pd.date_range('2021-01', '2021-12-31', freq='MS').values,
                  pd.date_range('2021-01', '2021-12-31', freq='M').values)
dates.sort()
dates

In [None]:
[(date.to_string(pd.to_datetime(str(x))),
  QuarterBegin(startingMonth=1).rollback(x)) for x in dates]

In [None]:
[(date.to_string(pd.to_datetime(str(x))),
  QuarterBegin(startingMonth=2).rollback(x)) for x in dates]

### `groupby()` & `rollforward()`

In [None]:
dates = pd.date_range('2021-01-01', '2021-12-31')
dates

In [None]:
df = pd.DataFrame({'value': range(0, len(dates))}, index=dates)
df

In [None]:
from pandas.tseries.offsets import QuarterBegin

In [None]:
df = pd.DataFrame({'date': dates, 'value': range(0, len(dates))})
df

In [None]:
df['quarter_start'] = df.date.apply(QuarterBegin(startingMonth=1).rollback)
df

In [None]:
df.groupby('quarter_start').agg({'value': [min, max, len]})

In [None]:
# doesn't seem like this works, I would have thought it was the same as above
df.groupby(QuarterBegin(startingMonth=1).rollback).agg({'value': [min, max, len]})

# Unexpected Behavior

## Default Arguments

Python’s default arguments are evaluated once when the function is defined, not each time the function is called (like it is in say, Ruby). This means that if you use a mutable default argument and mutate it, you will and have mutated that object for all future calls to the function as well.
	* http://docs.python-guide.org/en/latest/writing/gotchas/
	* This means you cannot assign objects as default arguments

## Operations in general exclude missing data.

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#stats

In [None]:
df = pd.DataFrame(np.random.randn(6, 4),
                  columns=list("ABCD"))
df.iat[0, 0] = np.nan
df

In [None]:
df.mean()  # this works; R equivalent would have returned `NA`

In [None]:
df.mean(skipna=False)  # have to manually set skipna=False to get desired behavior

---