# Pandas

In [2]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the random example values below are the same on
# every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Six consecutive daily timestamps starting 2021-08-09, used as the row index.
dates = pd.date_range("20210809", periods=6)
# 6x4 frame of standard-normal draws: one row per date, columns A-D.
df = pd.DataFrame(np.random.randn(6, 4),
                  index=dates,
                  columns=list("ABCD"))

In [3]:
dates

DatetimeIndex(['2021-08-09', '2021-08-10', '2021-08-11', '2021-08-12',
               '2021-08-13', '2021-08-14'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df

Unnamed: 0,A,B,C,D
2021-08-09,1.285066,-0.369866,0.808361,-0.58412
2021-08-10,-1.306844,0.05651,1.606457,-0.250813
2021-08-11,-0.504009,-0.602065,1.412801,-0.298434
2021-08-12,0.302428,-0.612931,0.076053,-0.826854
2021-08-13,0.26543,1.883569,0.099707,-1.71264
2021-08-14,-0.13263,-1.359464,0.245393,0.612651


## Copy by Reference

In [5]:
df_2 = df

In [6]:
df.iat[0, 0] = np.nan

In [7]:
df.iloc[0, :]

A         NaN
B   -0.369866
C    0.808361
D   -0.584120
Name: 2021-08-09 00:00:00, dtype: float64

In [8]:
df_2.iloc[0, :]

A         NaN
B   -0.369866
C    0.808361
D   -0.584120
Name: 2021-08-09 00:00:00, dtype: float64

### Use `.copy()` for deep copy

In [9]:
df_2 = df.copy()

In [10]:
df.iat[0, 1] = np.nan

In [11]:
df.iloc[0, :]

A         NaN
B         NaN
C    0.808361
D   -0.584120
Name: 2021-08-09 00:00:00, dtype: float64

In [12]:
df_2.iloc[0, :]

A         NaN
B   -0.369866
C    0.808361
D   -0.584120
Name: 2021-08-09 00:00:00, dtype: float64

## Selection

In [13]:
df.A

2021-08-09         NaN
2021-08-10   -1.306844
2021-08-11   -0.504009
2021-08-12    0.302428
2021-08-13    0.265430
2021-08-14   -0.132630
Freq: D, Name: A, dtype: float64

In [14]:
df['A']  # all rows of single column

2021-08-09         NaN
2021-08-10   -1.306844
2021-08-11   -0.504009
2021-08-12    0.302428
2021-08-13    0.265430
2021-08-14   -0.132630
Freq: D, Name: A, dtype: float64

In [15]:
df[0:1]  # slices rows; excludes last index

Unnamed: 0,A,B,C,D
2021-08-09,,,0.808361,-0.58412


In [16]:
df[['A']]

Unnamed: 0,A
2021-08-09,
2021-08-10,-1.306844
2021-08-11,-0.504009
2021-08-12,0.302428
2021-08-13,0.26543
2021-08-14,-0.13263


If passing a single value to `.loc`, a Series is returned; if passing a list to `.loc` a DataFrame is returned

In [17]:
df.loc[pd.to_datetime('2021-08-09')]

A         NaN
B         NaN
C    0.808361
D   -0.584120
Name: 2021-08-09 00:00:00, dtype: float64

In [18]:
df.loc[[pd.to_datetime('2021-08-09')]]

Unnamed: 0,A,B,C,D
2021-08-09,,,0.808361,-0.58412


In [19]:
df.loc[[pd.to_datetime('2021-08-09'), pd.to_datetime('2021-08-10')]]

Unnamed: 0,A,B,C,D
2021-08-09,,,0.808361,-0.58412
2021-08-10,-1.306844,0.05651,1.606457,-0.250813


### `at` vs `loc`

- `df.at` is faster
- `df.at` can only access a single value at a time.
- `df.loc` can select multiple rows and/or columns.


https://stackoverflow.com/questions/37216485/pandas-at-versus-loc

### select_dtypes

In [20]:
df = pd.DataFrame({
    'A': np.random.randn(4),
    'B': np.random.randn(4),
    'C': ['some', 'string', 'values', 'bla'],
    'D': pd.date_range("2021-08-09", periods=4)
})

In [21]:
df.select_dtypes(include=[np.number])

Unnamed: 0,A,B
0,-0.08005,1.113943
1,1.194157,0.169055
2,0.62214,1.650759
3,0.319104,1.699116


In [22]:
df.select_dtypes(exclude=[np.number])

Unnamed: 0,C,D
0,some,2021-08-09
1,string,2021-08-10
2,values,2021-08-11
3,bla,2021-08-12


## Get

`.get()` looks up a label in a Series (or a column in a DataFrame) and returns a default value (`None` unless you specify one) when the label is not found, instead of raising a `KeyError`.

### Series

In [23]:
dates[0]

Timestamp('2021-08-09 00:00:00', freq='D')

In [24]:
# `df` was rebuilt above without an explicit index, so its labels are 0..3.
# Look up a label that exists; a Timestamp key such as dates[0] is not in
# this index and would silently return None.
df.D.get(0)

In [25]:
df.D.get(pd.date_range('2021-01-01', periods=1)[0]) is None

True

In [26]:
# Missing label -> the supplied default is returned. Use the canonical
# `np.nan` spelling; the `np.NaN` alias was removed in NumPy 2.0.
df.D.get(pd.date_range('2021-01-01', periods=1)[0], np.nan)

nan

## Vectorized Operations - Automatic Alignment

A key difference between Series and ndarray is that operations between Series automatically align the data based on label. Thus, you can write computations without giving consideration to whether the Series involved have the same labels.

https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#vectorized-operations-and-label-alignment-with-series

In [27]:
s1 = pd.Series([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
s2 = pd.Series([10, 20, 30, 40, 50], index=['A', 'B', 'C', 'D', 'E'])

s1 + s2

A    11
B    22
C    33
D    44
E    55
dtype: int64

In [28]:
print(s1.iloc[[4, 3, 2, 1, 0]])
s1.iloc[[4, 3, 2, 1, 0]] + s2

E    5
D    4
C    3
B    2
A    1
dtype: int64


A    11
B    22
C    33
D    44
E    55
dtype: int64

The result of an operation between unaligned Series will have the union of the indexes involved. If a label is not found in one Series or the other, the result for that label will be marked as missing (`NaN`).

In [29]:
s1.iloc[[2, 3, 4]] + s2.iloc[[0, 1, 2, 3]]

A     NaN
B     NaN
C    33.0
D    44.0
E     NaN
dtype: float64

## Applying Functions

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#function-application

### Tablewise function application - Chaining Function Calls with `pipe()`

In [30]:
df_p = pd.DataFrame({"city_and_code": ["Chicago, IL", "Seattle, WA"]})
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"
1,"Seattle, WA"


In [31]:
df_p["city_and_code"].str.split(",")

0    [Chicago,  IL]
1    [Seattle,  WA]
Name: city_and_code, dtype: object

In [32]:
df_p["city_and_code"].str.split(",").str.get(1)

0     IL
1     WA
Name: city_and_code, dtype: object

In [33]:
def extract_city_name(df):
    """
    Add a `city_name` column holding the text before the first comma of
    `city_and_code` (e.g. "Chicago, IL" -> "Chicago").

    The frame is modified in place and the same object is returned so the
    function can be used with `DataFrame.pipe()`.
    """
    pieces = df["city_and_code"].str.split(",")
    df["city_name"] = pieces.str.get(0)
    return df

def add_country_name(df, country_name=None):
    """
    Add a `city_and_country` column: each `city_name` value with
    `country_name` appended directly, no separator
    (e.g. "Chicago" + "US" -> "ChicagoUS").

    The frame is modified in place and the same object is returned so the
    function can be used with `DataFrame.pipe()`.

    NOTE(review): the default `country_name=None` is passed straight into the
    string concatenation; presumably callers always supply a value - confirm.
    """
    city_names = df["city_name"]
    df["city_and_country"] = city_names + country_name
    return df

In [34]:
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS
1,"Seattle, WA",Seattle,SeattleUS


Python passes references to objects, so the functions above modify the caller's DataFrame in place. Unlike R, the DataFrame changes without having to assign the result back into `df_p`. Instead of returning a copy of the DataFrame, the functions return a reference to the same object.

In [35]:
df_p

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS
1,"Seattle, WA",Seattle,SeattleUS


### Row/Column wise function application

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#row-or-column-wise-function-application

Arbitrary functions can be applied along the axes of a DataFrame using the apply() method, which, like the descriptive statistics methods, takes an optional axis argument:

In [42]:
df = pd.DataFrame({
    'A': np.random.randn(4),
    'B': np.random.randn(4),
})

In [43]:
# axis 0 indicates column-wise operations (across/along the rows, for each column)
df.apply(lambda x: x.mean(), axis=0)

A    0.443980
B   -0.740741
dtype: float64

In [44]:
# axis 1 indicates row-wise operations (across/along the columns, for each row)
df.apply(lambda x: x.mean(), axis=1)

0    1.050949
1   -0.880764
2   -0.840174
3    0.076467
dtype: float64

## Pandas Aggregations

In [45]:
import pandas as pd
import numpy as np

data = pd.DataFrame({
    'col1':['a','a','a','a','a','b','b','b','b','b'],
    'col2':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'col3':[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
})

In [46]:
data

Unnamed: 0,col1,col2,col3
0,a,10,100
1,a,20,200
2,a,30,300
3,a,40,400
4,a,50,500
5,b,60,600
6,b,70,700
7,b,80,800
8,b,90,900
9,b,100,1000


### Use `.agg()` for simple functions on single columns

Meaning, there isn't any interaction between columns. (i.e. can't do `col2 + col3`)

In [47]:
# Name the reducers as strings: the resulting column labels are the same, and
# passing the builtins / NumPy callables (min, max, np.median) to .agg() is
# deprecated in recent pandas releases.
data.groupby('col1').agg({'col2': ['min', 'max'], 'col3': ['min', 'median', 'max']})

Unnamed: 0_level_0,col2,col2,col3,col3,col3
Unnamed: 0_level_1,min,max,min,median,max
col1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,10,50,100,300.0,500
b,60,100,600,800.0,1000


### Use .apply() to create interaction among multiple columns

https://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns

In [48]:
data

Unnamed: 0,col1,col2,col3
0,a,10,100
1,a,20,200
2,a,30,300
3,a,40,400
4,a,50,500
5,b,60,600
6,b,70,700
7,b,80,800
8,b,90,900
9,b,100,1000


R Equivalent (although not quite because we don't get to name the column; returning column is `col1`, not `result_a`):

```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3))
```

In [49]:
grouped = data.groupby('col1')

def my_function(group):
    """Return the sum, over the rows of one group, of `col2 * col3`."""
    products = group['col2'].mul(group['col3'])
    return products.sum()

# Select the value columns explicitly so the grouping column is not handed to
# my_function (relying on that is deprecated in recent pandas releases).
result = grouped[['col2', 'col3']].apply(my_function)
result

col1
a     55000
b    330000
dtype: int64

In [50]:
result['b']

330000

---

Let's remove the .sum() and just multiply across groups, to see what the result is.

In [51]:
grouped = data.groupby('col1')

def my_function(group):
    """Return the row-by-row product `col2 * col3` for one group.

    Same as the previous version but without the final .sum(), so one value
    per input row is returned instead of a single number per group.
    """
    return group['col2'].mul(group['col3'])

# Select the value columns explicitly so the grouping column is not handed to
# my_function (relying on that is deprecated in recent pandas releases).
result = grouped[['col2', 'col3']].apply(my_function)
result

col1   
a     0      1000
      1      4000
      2      9000
      3     16000
      4     25000
b     5     36000
      6     49000
      7     64000
      8     81000
      9    100000
dtype: int64

---

### `group_by(...) %>% summarise(...)`

The following example shows the equivalent of R's `group_by(...) %>% summarise(...)`.

https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns

Unlike the last examples, we can A) name the columns and B) interact the grouped columns.

In [52]:
data

Unnamed: 0,col1,col2,col3
0,a,10,100
1,a,20,200
2,a,30,300
3,a,40,400
4,a,50,500
5,b,60,600
6,b,70,700
7,b,80,800
8,b,90,900
9,b,100,1000


In [53]:
def f(x):
    """Summarise one group as a labelled Series.

    Each key becomes a column of the grouped result:
    - result_a: sum of the row-wise products col2 * col3
    - result_b: (sum of col2) * (sum of col3)
    - result_c: array of the row-wise sums col2 + col3
    - count:    number of rows in the group
    - col2_max / col3_max: maximum of each column
    """
    col2, col3 = x['col2'], x['col3']
    summary = {
        'result_a': (col2 * col3).sum(),
        'result_b': col2.sum() * col3.sum(),
        'result_c': (col2 + col3).values,
        'count': x.shape[0],
        'col2_max': col2.max(),
        'col3_max': col3.max(),
    }
    return pd.Series(summary)

# Pass only the value columns to f so the grouping column is not part of each
# group (relying on that is deprecated in recent pandas releases).
data.groupby('col1')[['col2', 'col3']].apply(f)

Unnamed: 0_level_0,result_a,result_b,result_c,count,col2_max,col3_max
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,55000,225000,"[110, 220, 330, 440, 550]",5,50,500
b,330000,1600000,"[660, 770, 880, 990, 1100]",5,100,1000


which is equivalent to:

```
data %>%
    group_by(col1) %>%
    summarise(result_a = sum(col2 * col3),
              result_b = sum(col2) * sum(col3),
              ...)
```

### `group_by(...) %>% mutate(...)`

https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07


```
df %>% group_by(group) %>% mutate(mean_var1 = mean(var1))  # R
```

In [54]:
df = pd.DataFrame(data = {'group': ['A', 'A', 'B', 'B'],
                          'var1': [1, 2, 3, 4],
                          'var2': [40, 30, 20, 10]})
df

Unnamed: 0,group,var1,var2
0,A,1,40
1,A,2,30
2,B,3,20
3,B,4,10


In [55]:
df.assign(mean_var1 = lambda x: x.groupby('group')['var1'].transform('mean'))

Unnamed: 0,group,var1,var2,mean_var1
0,A,1,40,1.5
1,A,2,30,1.5
2,B,3,20,3.5
3,B,4,10,3.5


In [56]:
# here is what the inside of the lambda function above gives without .assign()
df.groupby('group')['var1'].transform('mean')

0    1.5
1    1.5
2    3.5
3    3.5
Name: var1, dtype: float64

TODO: show more advanced example of above with column interactions

In [57]:
df.groupby('group').transform('mean')

Unnamed: 0,var1,var2
0,1.5,35.0
1,1.5,35.0
2,3.5,15.0
3,3.5,15.0


---

In [58]:
df.var1.rank(ascending=False)

0    4.0
1    3.0
2    2.0
3    1.0
Name: var1, dtype: float64

In [59]:
df.groupby('group')['var1'].transform(lambda x: x.rank(ascending=False))

0    2.0
1    1.0
2    2.0
3    1.0
Name: var1, dtype: float64

In [60]:


df.groupby('group').transform(lambda x: x.rank(ascending=False))

Unnamed: 0,var1,var2
0,2.0,1.0
1,1.0,2.0
2,2.0,1.0
3,1.0,2.0


### `pivot_wider()`

#### via `pivot()`

https://stackoverflow.com/questions/40229444/trouble-pivoting-in-pandas-spread-in-r

In [61]:
df = pd.DataFrame({'site_id': {0: 'a', 1: 'a', 2: 'b', 3: 'b', 4: 'c', 5: 'c',6: 'a', 7: 'a', 8: 'b', 9: 'b', 10: 'c', 11: 'c'},
                   'dt': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1,6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2},
                   'eu': {0: 'FGE', 1: 'WSH', 2: 'FGE', 3: 'WSH', 4: 'FGE', 5: 'WSH',6: 'FGE', 7: 'WSH', 8: 'FGE', 9: 'WSH', 10: 'FGE', 11: 'WSH'},
                   'kw': {0: '8', 1: '5', 2: '3', 3: '7', 4: '1', 5: '5',6: '2', 7: '3', 8: '5', 9: '7', 10: '2', 11: '5'}})
df

Unnamed: 0,site_id,dt,eu,kw
0,a,1,FGE,8
1,a,1,WSH,5
2,b,1,FGE,3
3,b,1,WSH,7
4,c,1,FGE,1
5,c,1,WSH,5
6,a,2,FGE,2
7,a,2,WSH,3
8,b,2,FGE,5
9,b,2,WSH,7


In [62]:
# notice `dt` is first so it is the first level index
df.pivot(index = ['dt', 'site_id'], values = 'kw', columns = 'eu')

Unnamed: 0_level_0,eu,FGE,WSH
dt,site_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,a,8,5
1,b,3,7
1,c,1,5
2,a,2,3
2,b,5,7
2,c,2,5


---

#### via `set_index()` & `unstack()`

In [63]:
df.set_index(['dt','site_id','eu']).unstack('eu')

Unnamed: 0_level_0,Unnamed: 1_level_0,kw,kw
Unnamed: 0_level_1,eu,FGE,WSH
dt,site_id,Unnamed: 2_level_2,Unnamed: 3_level_2
1,a,8,5
1,b,3,7
1,c,1,5
2,a,2,3
2,b,5,7
2,c,2,5


---

equivalent to R's `pivot_wider(names_from = site_id, values_from = kw)`

In [64]:
df.set_index(['dt','site_id','eu']).unstack('site_id')

Unnamed: 0_level_0,Unnamed: 1_level_0,kw,kw,kw
Unnamed: 0_level_1,site_id,a,b,c
dt,eu,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,FGE,8,3,1
1,WSH,5,7,5
2,FGE,2,5,2
2,WSH,3,7,5


---

#### from long to wide and back

In [65]:
df_wider = df.set_index(['dt','site_id','eu']).unstack('site_id')
df_wider

Unnamed: 0_level_0,Unnamed: 1_level_0,kw,kw,kw
Unnamed: 0_level_1,site_id,a,b,c
dt,eu,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,FGE,8,3,1
1,WSH,5,7,5
2,FGE,2,5,2
2,WSH,3,7,5


In [66]:
# back to longer
df_wider.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,kw
dt,eu,site_id,Unnamed: 3_level_1
1,FGE,a,8
1,FGE,b,3
1,FGE,c,1
1,WSH,a,5
1,WSH,b,7
1,WSH,c,5
2,FGE,a,2
2,FGE,b,5
2,FGE,c,2
2,WSH,a,3


In [67]:
# multi-index select
df_wider.loc[:, ['kw']]

Unnamed: 0_level_0,Unnamed: 1_level_0,kw,kw,kw
Unnamed: 0_level_1,site_id,a,b,c
dt,eu,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,FGE,8,3,1
1,WSH,5,7,5
2,FGE,2,5,2
2,WSH,3,7,5


In [68]:
# multi-index select; pass tuple
df_wider.loc[([1, 2], 'FGE'), ('kw', ['a', 'c'])]

Unnamed: 0_level_0,Unnamed: 1_level_0,kw,kw
Unnamed: 0_level_1,site_id,a,c
dt,eu,Unnamed: 2_level_2,Unnamed: 3_level_2
1,FGE,8,1
2,FGE,2,2


### `pivot_longer()`

#### `melt()`

In [74]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [75]:
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [93]:
melted = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
unmelted = melted.pivot_table(values = 'value', index=['A'], columns=['variable'])
unmelted.reset_index(level=0)

variable,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


variable,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


#### `wide_to_long()`

Another way to transform is to use the wide_to_long() panel data convenience function. It is less flexible than melt(), but more user-friendly.

https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

> With stubnames [‘A’, ‘B’], this **function expects to find one or more group of columns with format A-suffix1, A-suffix2,…, B-suffix1, B-suffix2,…** You specify what you want to call this suffix in the resulting long format with j (for example j=’year’)

In [94]:
df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
                   "A1980" : {0 : "d", 1 : "e", 2 : "f"},
                   "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
                   "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
                   "X"     : dict(zip(range(3), np.random.randn(3)))
                  })
df["id"] = df.index

In [95]:
df

Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,-0.43764,0
1,b,e,1.2,1.3,-0.17471,1
2,c,f,0.7,0.1,0.543121,2


In [65]:
pd.wide_to_long(df, ["A", "B"], i="id", j="year")

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,-1.392659,a,2.5
1,1970,2.876247,b,1.2
2,1970,-1.186241,c,0.7
0,1980,-1.392659,d,3.2
1,1980,2.876247,e,1.3
2,1980,-1.186241,f,0.1


## Misc

### Series.str

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them).

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [66]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [67]:
s.str.capitalize()

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    Caba
7     Dog
8     Cat
dtype: object

### Recipes

#### Get Top N Rows for Each Group

In [68]:
# Sample long-format data: one row per (site_id, dt, eu) combination.
# Note: the `kw` values are strings, so any sort on `kw` compares them as text.
df = pd.DataFrame({'site_id': {0: 'a', 1: 'a', 2: 'b', 3: 'b', 4: 'c', 5: 'c',6: 'a', 7: 'a', 8: 'b', 9: 'b', 10: 'c', 11: 'c'},
                   'dt': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1,6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2},
                   'eu': {0: 'FGE', 1: 'WSH', 2: 'FGE', 3: 'WSH', 4: 'FGE', 5: 'WSH',6: 'FGE', 7: 'WSH', 8: 'FGE', 9: 'WSH', 10: 'FGE', 11: 'WSH'},
                   'kw': {0: '8', 1: '5', 2: '3', 3: '7', 4: '1', 5: '5',6: '2', 7: '3', 8: '5', 9: '7', 10: '2', 11: '5'}})
# Order rows by site, then by kw within each site.
df = df.sort_values(by=['site_id', 'kw'])

In [69]:
df

Unnamed: 0,site_id,dt,eu,kw
6,a,2,FGE,2
7,a,2,WSH,3
1,a,1,WSH,5
0,a,1,FGE,8
2,b,1,FGE,3
8,b,2,FGE,5
3,b,1,WSH,7
9,b,2,WSH,7
4,c,1,FGE,1
10,c,2,FGE,2


In [70]:
def top_n(df, n, column):
    """Return the `n` rows of `df` with the largest values in `column`.

    Rows are sorted ascending by `column` and the last `n` are kept, so the
    largest value is the final row of the result.

    Parameters
    ----------
    df : DataFrame to select rows from (not modified).
    n : number of rows to keep; 0 yields an empty frame.
    column : label (or list of labels) to sort by.
    """
    # .tail(n) rather than [-n:]: the slice [-0:] would return every row
    # when n == 0, whereas .tail(0) correctly returns none.
    return df.sort_values(by=column).tail(n)

top_n(df, n=2, column='kw')

Unnamed: 0,site_id,dt,eu,kw
9,b,2,WSH,7
0,a,1,FGE,8


In [71]:
df.groupby('site_id').apply(top_n, n=2, column='kw')

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id,dt,eu,kw
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1,a,1,WSH,5
a,0,a,1,FGE,8
b,3,b,1,WSH,7
b,9,b,2,WSH,7
c,5,c,1,WSH,5
c,11,c,2,WSH,5


## Performance

Accelerated operations

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#accelerated-operations


pandas has support for accelerating certain types of binary numerical and boolean operations using the numexpr library and the bottleneck libraries.

These libraries are especially useful when dealing with large data sets, and provide large speedups. numexpr uses smart chunking, caching, and multiple cores. bottleneck is a set of specialized cython routines that are especially fast when dealing with arrays that have nans.

Here is a sample (using 100 column x 100,000 row DataFrames):

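| Operation | 0.11.0 (ms) | Prior Version (ms) | Ratio to Prior |
|-----------|-------------|--------------------|----------------|
| `df1 > df2` | 13.32 | 125.35 | 0.1063 |
| `df1 * df2` | 21.71 | 36.63 | 0.5928 |
| `df1 + df2` | 22.04 | 36.50 | 0.6039 |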

**You are highly encouraged to install both libraries**. See the section Recommended Dependencies for more installation info.

These are both enabled to be used by default, you can control this by setting the options:

```
pd.set_option("compute.use_bottleneck", False)
pd.set_option("compute.use_numexpr", False)
```