# Setup

In [1]:
!python --version

Python 3.9.5


In [2]:
!pip install helpsk --upgrade



In [3]:
import pandas as pd
import numpy as np
from helpsk import string

# `is` vs `==`

`==` is an equality test. It checks whether the right-hand side and the left-hand side are equal objects, according to their `__eq__` method (`__cmp__` applied only to Python 2).

`is` is an identity test. It checks whether the right-hand side and the left-hand side are the very same object. No method calls are made, so objects can't influence the `is` operation.

You use `is` (and `is not`) for `singletons`, like `None`, where you don't care about objects that might want to pretend to be None or where you want to protect against objects breaking when being compared against None.

# Lists / Iterators / Generators

## Iterator / Generator - Already Exhausted

In [8]:
my_iterator = iter(range(10))

print(list(my_iterator))
# When iterating over an iterator/generator that has already raised a
# StopIteration exception, you won't get any results the second time.
print(list(my_iterator))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


## List & DataFrame Equality

In [4]:
[1, 2, 3] == [1, 2, 3]

True

In [5]:
[1, 2, 3] == [1, 2, 3.00001]

False

In [6]:
['a', 'b', 'c'] == ['a', 'b', 'c']

True

In [7]:
['a', 'b', None] == ['a', 'b', None]

True

In [8]:
['a', 'b', 'c'] == ['a', 'b', None]

False

In [9]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3]]))

True

In [10]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3.001]]))

False

In [11]:
pd.DataFrame([[1, None], [2, 3.001]]).equals(pd.DataFrame([[1, None], [2, 3.001]]))

True

## Checking for Empty Containers

In [12]:
a = []

len(a) == 0  # not Pythonic

True

In [13]:
not a  # Pythonic

True

In [14]:
if len(a) == 0:
    print('Not Pythonic')

if not a:
    print('Pythonic')


Not Pythonic
Pythonic


In [15]:
a = ['a']

if len(a) != 0:
    print('Not Pythonic')

if a:
    print('Pythonic')

Not Pythonic
Pythonic


## Get

`get()` is a method that lets you look up a key in a collection and returns a default value (`None` unless you specify one) when the key is not found, instead of raising an error.

### Dictionary

In [19]:
{'a': 1, 'b': 2}.get('c') is None

True

In [20]:
{'a': 1, 'b': 2}.get('c', 0)

0

In [21]:
# ['c'] raises KeyError
from helpsk import validation
validation.raises_exception(lambda: {'a': 1, 'b': 2}['c'], KeyError)

True

# Itertools

In [13]:
from itertools import product, permutations, combinations, combinations_with_replacement

In [18]:
a = [1, 2, 3]
b = ['a', 'b', 'c']

In [19]:
list(product(a, b))

[(1, 'a'),
 (1, 'b'),
 (1, 'c'),
 (2, 'a'),
 (2, 'b'),
 (2, 'c'),
 (3, 'a'),
 (3, 'b'),
 (3, 'c')]

In [21]:
list(permutations(a, 2))

[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

In [25]:
list(permutations(a))

[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]

In [26]:
list(permutations(a, 3))

[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]

In [34]:
# without repeat, so won't include e.g. (1, 1), (2, 2), etc.
list(combinations(a, 2))

[(1, 2), (1, 3), (2, 3)]

In [29]:
list(combinations(a, 3))

[(1, 2, 3)]

In [35]:
list(combinations_with_replacement(a, 2))

[(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]

# Multi-lines

In [16]:
# not Pythonic
a = 1 + \
    2
print(a)

a = (1 +  # Pythonic
     2)
print(a)

3
3


# Strings

## Formatting (F-Strings)

Format specification options: https://docs.python.org/3/library/string.html#format-specification-mini-language

In [202]:
my_variable = "Some value"
another_variable = 987654321.123456789

In [204]:
f"The value of '{my_variable:<20}' is {another_variable}. {{Also, here's how to espace the bracks.}}"

"The value of 'Some value          ' is 987654321.1234568. {Also, here's how to espace the bracks.}"

In [221]:
f"The value of '{my_variable}' is {another_variable:,.5f}. {{Also, here's how to espace the bracks.}}"

"The value of 'Some value' is 987,654,321.12346. {Also, here's how to espace the bracks.}"

# Classes

## Private vs Protected

In [17]:
class Example:
    """Contrast a name-mangled ("private") attribute with a single-underscore
    ("protected") one, and show instance, class, and static methods."""

    def __init__(self):
        # Double leading underscore: stored on the instance as `_Example__private_variable`.
        self.__private_variable = 'private'
        # Single leading underscore: "protected" by convention only, still directly accessible.
        self._protected_variable = 'protected'

    def instance_method(self):
        """Return a label followed by the private and the protected values."""
        combined = self.__private_variable + self._protected_variable
        return 'instance method ' + combined

    @classmethod
    def class_method(cls, x):
        """Return a label followed by `x`; callable on the class without an instance."""
        return 'class method ' + x

    @staticmethod
    def static_method(x):
        """Return a label followed by `x`; receives neither the instance nor the class."""
        return 'static method ' + x

In [18]:
ex = Example()
ex._protected_variable

'protected'

In [19]:
try:
    print(ex.__private_variable)
except AttributeError as e:
    # Outside the class body the name is not mangled, so this looks up the literal
    # attribute `__private_variable`, which does not exist on the instance.
    print(e)

'Example' object has no attribute '__private_variable'


In [20]:
ex._Example__private_variable

'private'

In [21]:
ex.instance_method()

'instance method privateprotected'

In [22]:
ex.static_method('a')  # a static method receives neither `self` nor `cls`; it is a plain function namespaced under the class

'static method a'

In [23]:
Example.class_method('a')  # no instance

'class method a'

## Getters / Setters

In [24]:
class ExampleClass:
    """Expose `_my_variable` through a property whose getter and setter announce each call."""

    def __init__(self, my_variable=0):
        # Assigns the backing attribute directly, so the setter is not triggered here.
        self._my_variable = my_variable

    @property
    def my_variable(self):
        """Value of the backing `_my_variable` attribute; reading it prints a trace message."""
        print("getter of my_variable called")
        current = self._my_variable
        return current

    @my_variable.setter
    def my_variable(self, new_value):
        # Print the trace message first, then store the new value on the backing attribute.
        print("setter of my_variable called")
        self._my_variable = new_value

In [6]:
example = ExampleClass()
example.my_variable == 0

getter of my_variable called


True

In [7]:
example.my_variable = 0

setter of my_variable called


# Dates & Times

## `parse()`

In [98]:
from dateutil.parser import parse
parse('2021-08-14')

datetime.datetime(2021, 8, 14, 0, 0)

In [99]:
parse('2021-08-14 23:59:58')

datetime.datetime(2021, 8, 14, 23, 59, 58)

In [100]:
dates = [parse('2021-08-14'), parse('2021-08-15'), parse('2021-08-16'), parse('2021-08-16'),
         parse('2021-09-01'), parse('2022-01-01')]
dates

[datetime.datetime(2021, 8, 14, 0, 0),
 datetime.datetime(2021, 8, 15, 0, 0),
 datetime.datetime(2021, 8, 16, 0, 0),
 datetime.datetime(2021, 8, 16, 0, 0),
 datetime.datetime(2021, 9, 1, 0, 0),
 datetime.datetime(2022, 1, 1, 0, 0)]

In [101]:
df = pd.DataFrame({'value': [1, 2, 3, 3.1, 4, 5]}, index=dates)
df

Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1
2021-09-01,4.0
2022-01-01,5.0


In [102]:
df.index.is_unique

False

In [103]:
df.loc['2021-08-15']

value    2.0
Name: 2021-08-15 00:00:00, dtype: float64

In [104]:
df.loc['20210815']

value    2.0
Name: 2021-08-15 00:00:00, dtype: float64

In [105]:
df.loc[dates[1]]

value    2.0
Name: 2021-08-15 00:00:00, dtype: float64

In [106]:
df.loc['2021']

Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1
2021-09-01,4.0


In [107]:
df.get('2020')

In [108]:
# `DataFrame.get()` looks up *columns*; selecting rows by a partial date string through it
# relied on a deprecated `df[string]` fallback, so select the rows with `.loc` instead.
df.loc['2021']

  return self[key]


Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1
2021-09-01,4.0


In [109]:
# `DataFrame.get()` looks up *columns*; selecting rows by a partial date string through it
# relied on a deprecated `df[string]` fallback, so select the rows with `.loc` instead.
df.loc['2021-08']

  return self[key]


Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1


In [110]:
# `DataFrame.get()` looks up *columns*; selecting rows by a partial date string through it
# relied on a deprecated `df[string]` fallback, so select the rows with `.loc` instead.
df.loc['2022']

  return self[key]


Unnamed: 0,value
2022-01-01,5.0


In [111]:
df.loc['2021-08-16']

Unnamed: 0,value
2021-08-16,3.0
2021-08-16,3.1


In [112]:
df.get('2023')

In [113]:
# if ordered chronologically, you can slice with timestamp range
df['2021-08': '2021-09']

Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1
2021-09-01,4.0


In [114]:
df.truncate(after='2021-09')

Unnamed: 0,value
2021-08-14,1.0
2021-08-15,2.0
2021-08-16,3.0
2021-08-16,3.1
2021-09-01,4.0


In [115]:
df.truncate(before='2021-09')

Unnamed: 0,value
2021-09-01,4.0
2022-01-01,5.0


In [116]:
df.groupby(level=0).count()

Unnamed: 0,value
2021-08-14,1
2021-08-15,1
2021-08-16,2
2021-09-01,1
2022-01-01,1


## date_range()

In [159]:
pd.date_range('2021-02', '2022-01-31')

DatetimeIndex(['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04',
               '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08',
               '2021-02-09', '2021-02-10',
               ...
               '2022-01-22', '2022-01-23', '2022-01-24', '2022-01-25',
               '2022-01-26', '2022-01-27', '2022-01-28', '2022-01-29',
               '2022-01-30', '2022-01-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [118]:
# returns the first day of the quarter between the range provided
pd.date_range('2021-02', '2022-01-31', freq='QS-JAN')

DatetimeIndex(['2021-04-01', '2021-07-01', '2021-10-01', '2022-01-01'], dtype='datetime64[ns]', freq='QS-JAN')

In [119]:
# returns the first *business* day of the quarter between the range provided
# Jan 1st 2022 falls on a saturday, the first business day is Monday the 3rd
pd.date_range('2021-02', '2022-01-31', freq='BQS-JAN')

DatetimeIndex(['2021-04-01', '2021-07-01', '2021-10-01', '2022-01-03'], dtype='datetime64[ns]', freq='BQS-JAN')

In [120]:
# returns the first day of the fiscal quarter (first quarter of the year is FEB) 
# between the range provided
pd.date_range('2021-02', '2022-01-31', freq='QS-FEB')

DatetimeIndex(['2021-02-01', '2021-05-01', '2021-08-01', '2021-11-01'], dtype='datetime64[ns]', freq='QS-FEB')

In [121]:
# returns the first *business* day of the fiscal quarter (first quarter of the year is FEB) 
# between the range provided
pd.date_range('2021-02', '2022-01-31', freq='BQS-FEB')

DatetimeIndex(['2021-02-01', '2021-05-03', '2021-08-02', '2021-11-01'], dtype='datetime64[ns]', freq='BQS-FEB')

## Offsets e.g. `MonthEnd()`

In [122]:
# M = Month End
pd.date_range('2021-02', '2021-05', freq='M')

DatetimeIndex(['2021-02-28', '2021-03-31', '2021-04-30'], dtype='datetime64[ns]', freq='M')

In [123]:
# MS = Month Start
pd.date_range('2021-02', '2021-05', freq='MS')

DatetimeIndex(['2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01'], dtype='datetime64[ns]', freq='MS')

---

In [124]:
from pandas.tseries.offsets import Day, MonthBegin, MonthEnd, QuarterBegin, QuarterEnd

In [125]:
pd.date_range('2021-02', '2021-05', freq='M')

DatetimeIndex(['2021-02-28', '2021-03-31', '2021-04-30'], dtype='datetime64[ns]', freq='M')

In [126]:
# One might expect `+ MonthEnd()` to leave a date that is already the last day of the month
# unchanged, but it does not: the default is n=1, which moves to the *next* month end.
# `MonthEnd(n=0)` is the variant that leaves dates already on a month end where they are.
pd.date_range('2021-02', '2021-05', freq='M') + MonthEnd()

DatetimeIndex(['2021-03-31', '2021-04-30', '2021-05-31'], dtype='datetime64[ns]', freq=None)

In [127]:
pd.date_range('2021-02', '2021-02', freq='MS').values

array(['2021-02-01T00:00:00.000000000'], dtype='datetime64[ns]')

`+ pd.offsets.MonthEnd(n=0) - pd.offsets.MonthBegin(n=1)` seems to get us back to the beginning of the month

`MonthBegin` and `MonthEnd` behave oddly when it is the first/last day of the month

In [160]:
pd.date_range('2021-02', '2021-05', freq='MS') + pd.offsets.MonthEnd(n=0) - pd.offsets.MonthBegin(n=1)

DatetimeIndex(['2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01'], dtype='datetime64[ns]', freq=None)

In [161]:
pd.date_range('2021-02', '2021-05', freq='M') + pd.offsets.MonthEnd(n=0) - pd.offsets.MonthBegin(n=1)

DatetimeIndex(['2021-02-01', '2021-03-01', '2021-04-01'], dtype='datetime64[ns]', freq=None)

however, `rollback` and `rollforward`, below, seem to work as expected

---

In [138]:
pd.date_range('2021-02', '2021-05', freq='MS') + Day()

DatetimeIndex(['2021-02-02', '2021-03-02', '2021-04-02', '2021-05-02'], dtype='datetime64[ns]', freq=None)

In [139]:
pd.date_range('2021-02', '2021-05', freq='MS') + Day(2)

DatetimeIndex(['2021-02-03', '2021-03-03', '2021-04-03', '2021-05-03'], dtype='datetime64[ns]', freq=None)

### `rollforward()` & `rollback()`

In [140]:
from helpsk import date
import datetime

In [141]:
MonthBegin().rollback(parse('2021-01-01'))

Timestamp('2021-01-01 00:00:00')

In [142]:
MonthBegin().rollback(parse('2021-01-31'))

Timestamp('2021-01-01 00:00:00')

In [143]:
MonthEnd().rollforward(date.ymd('2021-01-01'))

Timestamp('2021-01-31 00:00:00')

In [144]:
MonthEnd().rollforward(date.ymd('2021-01-31'))

Timestamp('2021-01-31 00:00:00')

---

In [145]:
# The default value for startingMonth is surprising: `QuarterBegin()` defaults to
# startingMonth=3 (quarters beginning in Mar/Jun/Sep/Dec), so Jan 1st rolls back to Dec 1st.
# Pass startingMonth=1 to get calendar quarters.
QuarterBegin().rollback(date.ymd('2021-01-01'))

Timestamp('2020-12-01 00:00:00')

---

### Fiscal Quarter with `QuarterBegin(startingMonth=2)`

In [146]:
dates = np.append(pd.date_range('2021-01', '2021-12-31', freq='MS').values,
                  pd.date_range('2021-01', '2021-12-31', freq='M').values)
dates.sort()
dates

array(['2021-01-01T00:00:00.000000000', '2021-01-31T00:00:00.000000000',
       '2021-02-01T00:00:00.000000000', '2021-02-28T00:00:00.000000000',
       '2021-03-01T00:00:00.000000000', '2021-03-31T00:00:00.000000000',
       '2021-04-01T00:00:00.000000000', '2021-04-30T00:00:00.000000000',
       '2021-05-01T00:00:00.000000000', '2021-05-31T00:00:00.000000000',
       '2021-06-01T00:00:00.000000000', '2021-06-30T00:00:00.000000000',
       '2021-07-01T00:00:00.000000000', '2021-07-31T00:00:00.000000000',
       '2021-08-01T00:00:00.000000000', '2021-08-31T00:00:00.000000000',
       '2021-09-01T00:00:00.000000000', '2021-09-30T00:00:00.000000000',
       '2021-10-01T00:00:00.000000000', '2021-10-31T00:00:00.000000000',
       '2021-11-01T00:00:00.000000000', '2021-11-30T00:00:00.000000000',
       '2021-12-01T00:00:00.000000000', '2021-12-31T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [147]:
[(date.to_string(pd.to_datetime(str(x))),
  QuarterBegin(startingMonth=1).rollback(x)) for x in dates]

[('2021-01-01', Timestamp('2021-01-01 00:00:00')),
 ('2021-01-31', Timestamp('2021-01-01 00:00:00')),
 ('2021-02-01', Timestamp('2021-01-01 00:00:00')),
 ('2021-02-28', Timestamp('2021-01-01 00:00:00')),
 ('2021-03-01', Timestamp('2021-01-01 00:00:00')),
 ('2021-03-31', Timestamp('2021-01-01 00:00:00')),
 ('2021-04-01', Timestamp('2021-04-01 00:00:00')),
 ('2021-04-30', Timestamp('2021-04-01 00:00:00')),
 ('2021-05-01', Timestamp('2021-04-01 00:00:00')),
 ('2021-05-31', Timestamp('2021-04-01 00:00:00')),
 ('2021-06-01', Timestamp('2021-04-01 00:00:00')),
 ('2021-06-30', Timestamp('2021-04-01 00:00:00')),
 ('2021-07-01', Timestamp('2021-07-01 00:00:00')),
 ('2021-07-31', Timestamp('2021-07-01 00:00:00')),
 ('2021-08-01', Timestamp('2021-07-01 00:00:00')),
 ('2021-08-31', Timestamp('2021-07-01 00:00:00')),
 ('2021-09-01', Timestamp('2021-07-01 00:00:00')),
 ('2021-09-30', Timestamp('2021-07-01 00:00:00')),
 ('2021-10-01', Timestamp('2021-10-01 00:00:00')),
 ('2021-10-31', Timestamp('2021

In [148]:
[(date.to_string(pd.to_datetime(str(x))),
  QuarterBegin(startingMonth=2).rollback(x)) for x in dates]

[('2021-01-01', Timestamp('2020-11-01 00:00:00')),
 ('2021-01-31', Timestamp('2020-11-01 00:00:00')),
 ('2021-02-01', Timestamp('2021-02-01 00:00:00')),
 ('2021-02-28', Timestamp('2021-02-01 00:00:00')),
 ('2021-03-01', Timestamp('2021-02-01 00:00:00')),
 ('2021-03-31', Timestamp('2021-02-01 00:00:00')),
 ('2021-04-01', Timestamp('2021-02-01 00:00:00')),
 ('2021-04-30', Timestamp('2021-02-01 00:00:00')),
 ('2021-05-01', Timestamp('2021-05-01 00:00:00')),
 ('2021-05-31', Timestamp('2021-05-01 00:00:00')),
 ('2021-06-01', Timestamp('2021-05-01 00:00:00')),
 ('2021-06-30', Timestamp('2021-05-01 00:00:00')),
 ('2021-07-01', Timestamp('2021-05-01 00:00:00')),
 ('2021-07-31', Timestamp('2021-05-01 00:00:00')),
 ('2021-08-01', Timestamp('2021-08-01 00:00:00')),
 ('2021-08-31', Timestamp('2021-08-01 00:00:00')),
 ('2021-09-01', Timestamp('2021-08-01 00:00:00')),
 ('2021-09-30', Timestamp('2021-08-01 00:00:00')),
 ('2021-10-01', Timestamp('2021-08-01 00:00:00')),
 ('2021-10-31', Timestamp('2021

### `groupby()` & `rollback()`

In [149]:
dates = pd.date_range('2021-01-01', '2021-12-31')
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [150]:
df = pd.DataFrame({'value': range(0, len(dates))}, index=dates)
df

Unnamed: 0,value
2021-01-01,0
2021-01-02,1
2021-01-03,2
2021-01-04,3
2021-01-05,4
...,...
2021-12-27,360
2021-12-28,361
2021-12-29,362
2021-12-30,363


In [168]:
from pandas.tseries.offsets import QuarterBegin

In [169]:
df = pd.DataFrame({'date': dates, 'value': range(0, len(dates))})
df

Unnamed: 0,date,value
0,2021-01-01,0
1,2021-01-02,1
2,2021-01-03,2
3,2021-01-04,3
4,2021-01-05,4
...,...,...
360,2021-12-27,360
361,2021-12-28,361
362,2021-12-29,362
363,2021-12-30,363


In [172]:
df['quarter_start'] = df.date.apply(QuarterBegin(startingMonth=1).rollback)
df

Unnamed: 0,date,value,quarter_start
0,2021-01-01,0,2021-01-01
1,2021-01-02,1,2021-01-01
2,2021-01-03,2,2021-01-01
3,2021-01-04,3,2021-01-01
4,2021-01-05,4,2021-01-01
...,...,...,...
360,2021-12-27,360,2021-10-01
361,2021-12-28,361,2021-10-01
362,2021-12-29,362,2021-10-01
363,2021-12-30,363,2021-10-01


In [173]:
# Use pandas' string aliases for min/max rather than the Python builtins; `len` is kept as a
# callable so the resulting column labels stay ('min', 'max', 'len').
df.groupby('quarter_start').agg({'value': ['min', 'max', len]})

Unnamed: 0_level_0,value,value,value
Unnamed: 0_level_1,min,max,len
quarter_start,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2021-01-01,0,89,90
2021-04-01,90,180,91
2021-07-01,181,272,92
2021-10-01,273,364,92


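Passing `QuarterBegin(startingMonth=1).rollback` directly to `groupby()` does not give the same result as above, even though one might guess it would. When `groupby()` is given a function, it calls that function on each value of the DataFrame's *index* rather than on the `date` column; here the index is the default integer index (0, 1, 2, ...), so the integers are interpreted as nanoseconds since 1970-01-01.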

In [175]:
# NOTE: when `groupby()` receives a callable, it is applied to each *index label*, not to the
# `date` column. `df` has the default integer index here, so `rollback` receives 0, 1, 2, ...
# and treats them as nanoseconds since 1970-01-01 (hence the odd group keys in the output).
# Grouping by date this way would need a datetime index, e.g. `df.set_index('date')`.
df.groupby(QuarterBegin(startingMonth=1).rollback).agg({'value': [min, max, len]})

Unnamed: 0_level_0,value,value,value
Unnamed: 0_level_1,min,max,len
1970-01-01 00:00:00.000000000,0,0,1
1970-01-01 00:00:00.000000001,1,1,1
1970-01-01 00:00:00.000000002,2,2,1
1970-01-01 00:00:00.000000003,3,3,1
1970-01-01 00:00:00.000000004,4,4,1
...,...,...,...
1970-01-01 00:00:00.000000360,360,360,1
1970-01-01 00:00:00.000000361,361,361,1
1970-01-01 00:00:00.000000362,362,362,1
1970-01-01 00:00:00.000000363,363,363,1


# Unexpected Behavior

## Default Arguments

Python’s default arguments are evaluated once, when the function is defined, not each time the function is called (as they are in, say, Ruby). This means that if you use a mutable default argument and mutate it, you will have mutated that object for all future calls to the function as well.
	* http://docs.python-guide.org/en/latest/writing/gotchas/
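	* This means you should not use mutable objects (e.g. lists or dicts) as default arguments; use `None` as the default and create the object inside the function instead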

## Operations in general exclude missing data.

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#stats

In [156]:
# Draw the example values from a seeded generator so the notebook output is reproducible.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((6, 4)),
                  columns=list("ABCD"))
# Inject a single missing value to show how aggregations treat NaN.
df.iat[0, 0] = np.nan
df

Unnamed: 0,A,B,C,D
0,,2.871469,2.184371,0.194326
1,0.30133,-0.546467,-1.09857,-0.76829
2,-1.062999,2.656968,1.372673,1.340727
3,-0.775027,2.547829,-0.647637,-0.100684
4,1.299102,0.920292,-1.583688,0.972841
5,1.102798,0.968852,-0.435754,-0.001409


In [157]:
df.mean()  # this works; R equivalent would have returned `NA`

A    0.173041
B    1.569824
C   -0.034768
D    0.272918
dtype: float64

In [158]:
df.mean(skipna=False)  # have to manually set skipna=False to get desired behavior

A         NaN
B    1.569824
C   -0.034768
D    0.272918
dtype: float64

---