# 5 New Features in pandas 1.0 You Should Know About

## Setup

In [1]:
import os
import platform
import random
from platform import python_version

import jupyterlab
import numpy as np
import pandas as pd

# Record the execution environment so the outputs and timings further down can
# be tied to specific versions. Package lines use the "name==version" form.
print("System")
print(f"os name: {os.name}")
print(f"system: {platform.system()}")
print(f"release: {platform.release()}")
print()
print("Python")
print(f"version: {python_version()}")
print()
print("Python Packages")
# Spelling fixed: the package is "jupyterlab" (was printed as "jupterlab").
print(f"jupyterlab=={jupyterlab.__version__}")
print(f"pandas=={pd.__version__}")
print(f"numpy=={np.__version__}")



System
os name: posix
system: Darwin
release: 19.2.0

Python
version: 3.8.0

Python Packages
jupterlab==1.2.4
pandas==1.0.0
numpy==1.18.0


In [2]:
# Toy data: the three values >= 10 (rows 3, 6 and 9) are the rows that the
# next cell flags with `df.col1 >= 10`.
df = pd.DataFrame({'col1': [1, 2, 3, 10, 2, 3, 11, 2, 3, 12, 1, 2]})
df

Unnamed: 0,col1
0,1
1,2
2,3
3,10
4,2
5,3
6,11
7,2
8,3
9,12
10,1
11,2


In [3]:
# Per-row flags as a plain Python list: True where col1 is 10 or more.
use_expanding = df["col1"].ge(10).tolist()
use_expanding

[False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False]

In [8]:
from pandas.api.indexers import BaseIndexer

class CustomIndexer(BaseIndexer):
    """Window indexer whose window keeps growing until a flagged row, then restarts.

    Instances are built with two keyword arguments that this class reads back
    as attributes:

    * ``window_size`` -- for an unflagged row ``i`` the window ends at
      ``i + window_size`` (exclusive), clipped to the number of rows.
    * ``use_expanding`` -- one boolean per row; a true entry ends the current
      window at that row and makes the next row the start of a new window.
    """

    def get_window_bounds(self, num_values, min_periods, center, closed):
        """Compute the start and end position of the window for every row.

        Parameters
        ----------
        num_values : int
            Number of rows to produce bounds for.
        min_periods, center, closed
            Part of the indexer signature; not used by this implementation.

        Returns
        -------
        (start, end) : tuple of np.ndarray
            Two int64 arrays of length ``num_values``. Row ``i`` uses the
            half-open range ``[start[i], end[i])``.
        """
        start = np.empty(num_values, dtype=np.int64)
        end = np.empty(num_values, dtype=np.int64)
        window_start = 0
        for i in range(num_values):
            # Every row begins at the start of the window currently open.
            start[i] = window_start
            if self.use_expanding[i]:
                # Flagged row: it is the last row of this window, and the
                # following row opens a new one.
                end[i] = i + 1
                window_start = i + 1
            else:
                # Clip so a window_size larger than 1 can never yield an end
                # bound past the last row (the unclipped value would point
                # outside the data).
                end[i] = min(i + self.window_size, num_values)
        # Printed on purpose so the notebook shows the bounds being used.
        print('start', start)
        print('end', end)
        return start, end


# With window_size=1 the window of every unflagged row ends at that row itself.
indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)

In [50]:
# NOTE(review): the recorded output below has 8 rows and matches the 8-row
# frame built near the end of the notebook, not the 12-row frame defined
# above -- restart the kernel and run top to bottom to refresh it.
df.rolling(indexer).mean()

start [0 0 0 0 4 4 4 7]
end [1 2 3 4 5 6 7 8]


Unnamed: 0,col1
0,1.0
1,2.0
2,3.0
3,2.75
4,3.0
5,5.0
6,3.666667
7,2.0


In [12]:
# One-million-row frame with a single integer column, used by the timing cells.
df = pd.Series(range(1_000_000)).to_frame("col1")
df.head()

Unnamed: 0,col1
0,0
1,1
2,2
3,3
4,4


In [13]:
def some_function(x):
    """Return the sum of the values in ``x``, plus 5."""
    total = np.sum(x)
    return total + 5

In [14]:
%%timeit

# Baseline timing: 100-row rolling apply of some_function with engine='cython'.
# Per the pandas docs, raw=True hands each window over as a NumPy array.
df.col1.rolling(100).apply(some_function, engine='cython', raw=True)

4.03 s ± 76.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit

# Same computation with engine='numba', for comparison with the Cython timing.
# NOTE(review): presumably the first call includes numba's compilation cost -- confirm.
df.col1.rolling(100).apply(some_function, engine='numba', raw=True)

500 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Nullable integer dtype ("Int64", capital I): the None entry is displayed as
# <NA> in the output below.
s = pd.Series([3, 6, 9, None], dtype="Int64")
s

0       3
1       6
2       9
3    <NA>
dtype: Int64

In [21]:
# Comparing the missing entry with itself gives <NA> (see output), not True or False.
s.loc[3] == s.loc[3]

<NA>

In [22]:
# For contrast: NaN compared with itself evaluates to False.
np.nan == np.nan

False

In [27]:
# No dtype given: the strings end up in an object-dtype Series and the missing
# entry is shown as None.
s = pd.Series(['an', 'ban', 'pet', 'podgan', None])
s

0        an
1       ban
2       pet
3    podgan
4      None
dtype: object

In [51]:
# Still object dtype: an int and a float are accepted alongside the strings.
s = pd.Series(['an', 'ban', 5, 'pet', 5.0, 'podgan', None])
s

0        an
1       ban
2         5
3       pet
4         5
5    podgan
6      None
dtype: object

In [52]:
# dtype='string' requests the dedicated string dtype; the missing entry is
# shown as <NA> in the output below.
s = pd.Series(['an', 'ban', 'pet', 'podgan', None], dtype='string')
s

0        an
1       ban
2       pet
3    podgan
4      <NA>
dtype: string

In [47]:
# Small unsorted frame for the sort_values examples that follow.
df = pd.DataFrame({"col1": [1, 3, 5, 2, 3, 7, 1, 2]})

In [48]:
# Default sort: rows keep their original index labels (0, 6, 3, 7, ... in the output).
df.sort_values('col1')

Unnamed: 0,col1
0,1
6,1
3,2
7,2
1,3
4,3
2,5
5,7


In [49]:
# ignore_index=True: the sorted result is relabelled 0..7 (see output).
df.sort_values('col1', ignore_index=True)

Unnamed: 0,col1
0,1
1,1
2,2
3,2
4,3
5,3
6,5
7,7
