In [9]:
!python --version

Python 3.9.5


In [2]:
!pip install helpsk --upgrade



In [3]:
import pandas as pd
from helpsk import string

In [6]:
string.collapse('a', 'b', 'c')

'abc'

In [8]:
string.collapse('a', 'b', 'c', surround="'", separate=",")

"'a','b','c'"

In [3]:
from helpsk import validation as vld
vld.assert_true(True)
vld.assert_false(False)

In [4]:
from helpsk import ExampleClass
example = ExampleClass()
example.my_variable

getter of my_variable called


0

# `is` vs `==`

`==` is an equality test. It checks whether the right hand side and the left hand side are equal objects (according to their __eq__ or __cmp__ methods.)

is is an identity test. It checks whether the right hand side and the left hand side are the very same object. No methodcalls are done, objects can't influence the is operation.

You use `is` (and `is not`) for `singletons`, like `None`, where you don't care about objects that might want to pretend to be None or where you want to protect against objects breaking when being compared against None.

# List & DataFrame Equality

In [4]:
[1, 2, 3] == [1, 2, 3]

True

In [5]:
[1, 2, 3] == [1, 2, 3.00001]

False

In [6]:
['a', 'b', 'c'] == ['a', 'b', 'c']

True

In [7]:
['a', 'b', None] == ['a', 'b', None]

True

In [8]:
['a', 'b', 'c'] == ['a', 'b', None]

False

In [10]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3]]))

True

In [12]:
pd.DataFrame([[1, 2], [2, 3]]).equals(pd.DataFrame([[1, 2], [2, 3.001]]))

False

In [13]:
pd.DataFrame([[1, None], [2, 3.001]]).equals(pd.DataFrame([[1, None], [2, 3.001]]))

True

# Checking for Empty Containers

In [9]:
a = []

len(a) == 0  # not Pythonic

True

In [10]:
not a  # Pythonic

True

In [11]:
if len(a) == 0:
    print('Not Pythonic')

if not a:
    print('Pythonic')


Not Pythonic
Pythonic


In [12]:
a = ['a']

if len(a) != 0:
    print('Not Pythonic')

if a:
    print('Pythonic')

Not Pythonic
Pythonic


# Multi-lines

In [13]:
# not Pythonic
a = 1 + \
    2
print(a)

a = (1 +  # Pythonic
     2)
print(a)

3
3


# Classes

## Private vs Protected

In [32]:
class Example:
    def __init__(self):
        self.__private_variable = 'private'
        self._protected_variable = 'protected'
    
    def instance_method(self):
        return 'instance method ' + self.__private_variable + self._protected_variable
    
    @staticmethod
    def static_method(x):
        return 'static method ' + x
    
    @classmethod
    def class_method(cls, x):
        return 'class method ' + x

In [33]:
ex = Example()
ex._protected_variable

'protected'

In [34]:
try:
  print(ex.__private_variable)
except Exception as e:
  print(e)

'Example' object has no attribute '__private_variable'


In [35]:
ex._Example__private_variable

'private'

In [36]:
ex.instance_method()

'instance method privateprotected'

In [37]:
ex.static_method('a')  # not sure how this is useful

'static method a'

In [38]:
Example.class_method('a')  # no instance

'class method a'

## Getters / Setters

In [4]:
class ExampleClass:

    def __init__(self, my_variable=0):
        self._my_variable = my_variable

    @property
    def my_variable(self):
        """I'm the 'my_variable' property."""
        print("getter of my_variable called")
        return self._my_variable

    @my_variable.setter
    def my_variable(self, value):
        print("setter of my_variable called")
        self._my_variable = value

In [5]:
example = ExampleClass()
example.my_variable == 0

getter of my_variable called


True

In [6]:
example.my_variable = 0

setter of my_variable called


# Default Arguments

Python’s default arguments are evaluated once when the function is defined, not each time the function is called (like it is in say, Ruby). This means that if you use a mutable default argument and mutate it, you will and have mutated that object for all future calls to the function as well.
	* http://docs.python-guide.org/en/latest/writing/gotchas/
	* This means you cannot assign objects as default arguments

# Misc

# Pandas

In [175]:
import numpy as np
import pandas as pd

dates = pd.date_range("20210809", periods=6)
df = pd.DataFrame(np.random.randn(6, 4),
                  index=dates,
                  columns=list("ABCD"))

In [176]:
dates

DatetimeIndex(['2021-08-09', '2021-08-10', '2021-08-11', '2021-08-12',
               '2021-08-13', '2021-08-14'],
              dtype='datetime64[ns]', freq='D')

In [177]:
df

Unnamed: 0,A,B,C,D
2021-08-09,-0.186162,0.00166,0.013101,-0.659372
2021-08-10,-1.105882,1.207941,2.5266,-0.698492
2021-08-11,-0.680871,-1.4493,0.028325,-0.940038
2021-08-12,-1.22481,-0.038875,-1.025572,-0.601159
2021-08-13,-0.141608,1.30239,-0.210668,1.443507
2021-08-14,0.493285,-0.374523,1.348035,1.654523


## Copy by Reference

In [179]:
df_2 = df

In [180]:
df.iat[0, 0] = np.nan

In [181]:
df.iloc[0, :]

A         NaN
B    0.001660
C    0.013101
D   -0.659372
Name: 2021-08-09 00:00:00, dtype: float64

In [182]:
df_2.iloc[0, :]

A         NaN
B    0.001660
C    0.013101
D   -0.659372
Name: 2021-08-09 00:00:00, dtype: float64

### Use `.copy()` for deep copy

In [196]:
df_2 = df.copy()

In [197]:
df.iat[0, 1] = np.nan

In [198]:
df.iloc[0, :]

A         NaN
B         NaN
C    0.013101
D   -0.659372
Name: 2021-08-09 00:00:00, dtype: float64

In [199]:
df_2.iloc[0, :]

A         NaN
B    0.001660
C    0.013101
D   -0.659372
Name: 2021-08-09 00:00:00, dtype: float64

## Selection

In [183]:
df.A

2021-08-09         NaN
2021-08-10   -1.105882
2021-08-11   -0.680871
2021-08-12   -1.224810
2021-08-13   -0.141608
2021-08-14    0.493285
Freq: D, Name: A, dtype: float64

In [184]:
df['A']  # all rows of single column

2021-08-09         NaN
2021-08-10   -1.105882
2021-08-11   -0.680871
2021-08-12   -1.224810
2021-08-13   -0.141608
2021-08-14    0.493285
Freq: D, Name: A, dtype: float64

In [185]:
df[0:1]  # slices rows; excludes last index

Unnamed: 0,A,B,C,D
2021-08-09,,0.00166,0.013101,-0.659372


In [186]:
df[['A']]

Unnamed: 0,A
2021-08-09,
2021-08-10,-1.105882
2021-08-11,-0.680871
2021-08-12,-1.22481
2021-08-13,-0.141608
2021-08-14,0.493285


If passing a single value to `.loc`, a Series is returned; if passing a list to `.loc` a DataFrame is returned

In [187]:
df.loc[pd.to_datetime('2021-08-09')]

A         NaN
B    0.001660
C    0.013101
D   -0.659372
Name: 2021-08-09 00:00:00, dtype: float64

In [188]:
df.loc[[pd.to_datetime('2021-08-09')]]

Unnamed: 0,A,B,C,D
2021-08-09,,0.00166,0.013101,-0.659372


In [189]:
df.loc[[pd.to_datetime('2021-08-09'), pd.to_datetime('2021-08-10')]]

Unnamed: 0,A,B,C,D
2021-08-09,,0.00166,0.013101,-0.659372
2021-08-10,-1.105882,1.207941,2.5266,-0.698492


### `at` vs `loc`

- `df.at` is faster
- `df.at` can only access a single value at a time.
- `df.loc` can select multiple rows and/or columns.


https://stackoverflow.com/questions/37216485/pandas-at-versus-loc

In [190]:
import pandas as pd
data = pd.DataFrame({
    'col1':['a','a','a','a','a','b','b','b','b','b'],
    'col2':[1,2,3,4,5,6,7,8,9,0],
    'col3':[-1,-2,-3,-4,-5,-6,-7,-8,-9,0]
})

In [191]:
data.groupby('col1').agg({'col2': 'max', 'col3': 'min'})

Unnamed: 0_level_0,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,5,-5
b,9,-9



https://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns

In [192]:
data

Unnamed: 0,col1,col2,col3
0,a,1,-1
1,a,2,-2
2,a,3,-3
3,a,4,-4
4,a,5,-5
5,b,6,-6
6,b,7,-7
7,b,8,-8
8,b,9,-9
9,b,0,0


```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3),
              result_b = sum(col2) * sum(col3))
```

In [193]:
grouped = data.groupby('col1')

In [194]:
grouped = data.groupby('col1')

def my_function(group):
    col2 = group['col2']
    col3 = group['col3']
    return (col2 * col3).sum()

result = grouped.apply(my_function)
result

col1
a    -55
b   -230
dtype: int64

In [115]:
result

array([ -55, -230])

In [123]:
result['b']

-230

In [None]:
grouped = data.groupby('col1')

def my_function(group):
    
    return pd.DataFrame({'original': group,
                         'demeaned': group - group.mean()})
    
    col2 = group['col2']
    col3 = group['col3']
    return (col2 * col3).sum()

result = grouped.apply(my_function)
result

```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3),
              result_b = sum(col2) * sum(col3))
```


https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns

In [132]:
data

Unnamed: 0,col1,col2,col3
0,a,1,-1
1,a,2,-2
2,a,3,-3
3,a,4,-4
4,a,5,-5
5,b,6,-6
6,b,7,-7
7,b,8,-8
8,b,9,-9
9,b,0,0


In [140]:
def f(x):
    d = {}
    d['result_a'] = (x['col2'] * x['col3']).sum()
    d['result_b'] = x['col2'].sum() * x['col3'].sum()
    d['result_c'] = x['col2'] + x['col3']
    d['result_d'] = (x['col2'] + x['col3']).sum()
    d['count'] = x.shape[0]
    d['col2_max'] = x['col2'].max()
    d['col3_max'] = x['col3'].max()
    return pd.Series(d)

data.groupby('col1').apply(f)

Unnamed: 0_level_0,result_a,result_b,result_c,result_d,count,col2_max,col3_max
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,-55,-225,5 0 6 0 7 0 8 0 9 0 dtype: int64,0,5,5,-1
b,-230,-900,5 0 6 0 7 0 8 0 9 0 dtype: int64,0,5,9,0


https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07


```
df %>% group_by(group) %>% mutate(mean_var1 = mean(var1))  # R
df.groupby('group').assign(mean_var1 = lambda x: np.mean(x.var1)  # Python

??? df['mean_var1'] = df.groupby('group').pipe(lambda x: x.var1.transform('mean')) #https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07
```

# Unexpected Behavior

## Operations in general exclude missing data.

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#stats

In [209]:
df = pd.DataFrame(np.random.randn(6, 4),
                  columns=list("ABCD"))
df.iat[0, 0] = np.nan
df

Unnamed: 0,A,B,C,D
0,,-1.343106,0.025455,1.477582
1,1.626082,0.544241,0.620942,-0.736149
2,-1.478041,0.230491,0.426075,0.83451
3,-0.301862,-0.107032,1.058098,0.710978
4,-0.903349,1.558083,1.192531,-1.554701
5,-0.325039,0.857985,-0.200452,-0.66191


In [210]:
df.A.mean()  # this works; R equivalent would have returned `NA`

-0.27644178620069293

In [211]:
df.A.mean(skipna=False)  # have to manually set skipna=False to get desired behavior

nan

---