# Agenda: Sorting

1. Series
    - Sorting by index
    - Sorting by values
2. Data frames
    - Sorting by index
    - Sorting by one column
    - Sorting by multiple columns

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
# Create a series of 10 random integers from -50 up to (but not including) 50,
# labelled with a deliberately unsorted index of letters.
# Seeding NumPy's random-number generator makes the values reproducible.

np.random.seed(0)
s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('acegihfjdb'),
)
s

a    -6
c    -3
e    14
g    17
i    17
h   -41
f    33
j   -29
d   -14
b    37
dtype: int64

In [3]:
# retrieve a single value via its index label with .loc
s.loc['e']

14

In [4]:
# "fancy indexing": pass a list of labels to .loc, and get back a series
# containing those elements, in the order in which we asked for them
s.loc[['e', 'a']]

e    14
a    -6
dtype: int64

In [5]:
# slice by label: the slice follows the index's current (unsorted) order,
# which is why we get e, g, i, h rather than e, f, g, h
s.loc['e':'h']   # up to and including the end label with .loc

e    14
g    17
i    17
h   -41
dtype: int64

In [6]:
# if we want to sort by the index, we can use the .sort_index method
# this method returns a new series, with the same index and values as before,
# but with the elements ordered by index label
# .head() then shows us just the first 5 elements of that sorted series

(
    s
    .sort_index()
    .head()
)

a    -6
b    37
c    -3
d   -14
e    14
dtype: int64

In [7]:
# get specific values from the sorted series
# note that .loc runs on the output of .head(), so it can only retrieve labels
# that are among the first five elements ('a' through 'e')
(
    s
    .sort_index()
    .head()
    .loc[['b', 'd']]
)

b    37
d   -14
dtype: int64

In [8]:
# sort + head is very common: it gives us the first 5 elements of the sorted series
(
    s
    .sort_index()
    .head()
)

a    -6
b    37
c    -3
d   -14
e    14
dtype: int64

In [9]:
# sort + tail is also very common: it gives us the last 5 elements of the sorted series
(
    s
    .sort_index()
    .tail()
)

f    33
g    17
h   -41
i    17
j   -29
dtype: int64

In [10]:
# if you want (but don't!) you can pass inplace=True to sort_index (and other sorting methods).
# If you do that, the method returns None, and the data structure itself is modified in place.
# That is why this cell displays nothing, and why such a call cannot be used in a method chain.

s.sort_index(inplace=True)

In [11]:
# the previous cell sorted s in place, so s itself is now ordered by index
s

a    -6
b    37
c    -3
d   -14
e    14
f    33
g    17
h   -41
i    17
j   -29
dtype: int64

In [12]:
# Rebuild the series with the same (seeded) random values, but this time with
# an index in which both 'a' and 'b' appear twice.

np.random.seed(0)
s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('aceaihbjdb'),
)
s

a    -6
c    -3
e    14
a    17
i    17
h   -41
b    33
j   -29
d   -14
b    37
dtype: int64

In [13]:
# a repeated index label is totally fine! Because 'a' appears twice in the index,
# .loc['a'] returns a series with both matching elements, rather than a single value
s.loc['a']

a    -6
a    17
dtype: int64

In [14]:
# 'b' also appears twice in the index, so we again get a two-element series
s.loc['b']

b    33
b    37
dtype: int64

In [15]:
# 'c' appears only once in the index, so .loc returns just the value
s.loc['c']

-3

In [16]:
# slicing from 'a' fails: 'a' appears twice in the (unsorted) index, so there is
# no single position at which the slice could start.
# The KeyError is the point of this cell, so we catch it and print it; that way
# the notebook still runs from top to bottom with "Restart & Run All".
try:
    s.loc['a':'c']
except KeyError as e:
    print(f'{type(e).__name__}: {e}')

KeyError: "Cannot get left slice bound for non-unique label: 'a'"

In [17]:
# in other words: Because 'a' is in the index more than once, Pandas doesn't know
# which 'a' to start with in our slice!

# what if I try something else? Can I start with 'c' and go to 'b'?
# No: 'b' is also in the index more than once, so this time it is the *right*
# (end) bound of the slice that is ambiguous.
# We catch and print the expected KeyError, so that the rest of the notebook
# still runs with "Restart & Run All".
try:
    s.loc['c':'b']
except KeyError as e:
    print(f'{type(e).__name__}: {e}')

KeyError: "Cannot get right slice bound for non-unique label: 'b'"

In [18]:
# a reminder of what s looks like: the index is unsorted, and 'a' and 'b' each appear twice
s

a    -6
c    -3
e    14
a    17
i    17
h   -41
b    33
j   -29
d   -14
b    37
dtype: int64

In [19]:
# we can solve this by sorting our index
# once the index is sorted, the slice boundaries are no longer ambiguous.
# We use .loc to make it explicit that we are slicing by label, just as in the
# attempts that raised KeyError on the unsorted series.

(
    s
    .sort_index()
    .loc['a':'c']
)

a    -6
a    17
b    33
b    37
c    -3
dtype: int64

Because now all of the `a` labels are next to one another in the index, as are all of the `b` labels, there isn't an issue of where to start or stop, and Pandas is happy to give us our slice back.

# Sorting by values

In order to sort by value, we can use the `sort_values` method. It works much like `sort_index`: we can (but shouldn't) pass `inplace=True`, and by default it gives us back a new series, with the same index + values as before, but now ordered by value.

In [22]:
# sort_values needs the values to be "comparable" with one another
# in Python, the term "comparable" means that we have values that support not only ==, but also <

s.sort_values()

h   -41
j   -29
d   -14
a    -6
c    -3
e    14
a    17
i    17
b    33
b    37
dtype: int64

In [23]:
# what happens if we have a mix of different types?
# integers and strings cannot be compared with <, so the values aren't all
# comparable with one another, and sorting them raises a TypeError.
# The exception is the point of this cell, so we catch it and print it; that way
# the notebook still runs from top to bottom with "Restart & Run All".

s = Series([10, 5, 15, 'b', 'c', 'a'])
try:
    s.sort_values()
except TypeError as e:
    print(f'{type(e).__name__}: {e}')

TypeError: '<' not supported between instances of 'str' and 'int'

In [24]:
# display the documentation for sort_values, to see which arguments it accepts
help(s.sort_values)

Help on method sort_values in module pandas.core.series:

sort_values(*, axis: 'Axis' = 0, ascending: 'bool | Sequence[bool]' = True, inplace: 'bool' = False, kind: 'SortKind' = 'quicksort', na_position: 'NaPosition' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc | None' = None) -> 'Series | None' method of pandas.core.series.Series instance
    Sort by the values.

    Sort a Series in ascending or descending order by some
    criterion.

    Parameters
    ----------
    axis : {0 or 'index'}
        Unused. Parameter needed for compatibility with DataFrame.
    ascending : bool or list of bools, default True
        If True, sort values in ascending order, otherwise descending.
    inplace : bool, default False
        If True, perform operation in-place.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
        Choice of sorting algorithm. See also :func:`numpy.sort` for more
        information. 'mergesort' and 'stable' are the only stable algorithms.

# Exercise: Series sorting

1. Create a series based on members of your family (or friends). The index should be strings (their names), and the values will be their ages (integers).
2. Sort by the names. What is the mean age of the first 3 people, alphabetically?
3. Sort by the ages. What are the names of the eldest and youngest people in your series?

In [25]:
# Exercise, part 1: the names are the index, and the ages are the values.
# Building the series from a dict keeps each name next to its age.
s = Series({
    'Reuven': 53,
    'Atara': 23,
    'Shikma': 21,
    'Amotz': 18,
})
s

Reuven    53
Atara     23
Shikma    21
Amotz     18
dtype: int64

In [28]:
# Exercise, part 2: sort by name, keep the first 3 people alphabetically,
# and calculate their mean age
(
    s
    .sort_index()
    .head(3)
    .mean()
)

31.333333333333332

In [37]:
# Exercise, part 3: sort by age, then grab the names at both ends of the result
(
    s
    .sort_values()
    .iloc[[0, -1]]   # first and last elements: after an ascending sort, the youngest and the eldest
    .index           # we want the names (the index labels), not the ages
)

Index(['Amotz', 'Reuven'], dtype='object')

In [32]:
# an alternative that doesn't require sorting: idxmin returns the index label
# of the smallest value, i.e., the name of the youngest person
(
    s
    .idxmin()
)

'Amotz'

In [33]:
# similarly, idxmax returns the index label of the largest value,
# i.e., the name of the eldest person
(
    s
    .idxmax()
)

'Reuven'

In [34]:
# what if we want both the min and the max at the same time?
# we can use the .agg method, which takes a list of strings naming the methods we want to run
# we get back a series whose index contains the method names, and whose values are their results

(
    s
    .agg(['idxmin', 'idxmax'])
)

idxmin     Amotz
idxmax    Reuven
dtype: object

In [38]:
# what if we want to sort in descending order?
# so far, we've seen that both sort_index and sort_values sort in ascending order
# passing ascending=False reverses that, putting the largest values first

s.sort_values(ascending=False)

Reuven    53
Atara     23
Shikma    21
Amotz     18
dtype: int64

In [39]:
# sort_index accepts ascending=False, too, giving us the names in reverse alphabetical order
s.sort_index(ascending=False)

Shikma    21
Reuven    53
Atara     23
Amotz     18
dtype: int64

# How can we change the way that values are compared when sorting?



In [40]:
# Start over with the same seeded random integers, this time labelled with the
# letters 'a' through 'j' in alphabetical order.
np.random.seed(0)

s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('abcdefghij'),
)
s

a    -6
b    -3
c    14
d    17
e    17
f   -41
g    33
h   -29
i   -14
j    37
dtype: int64

In [41]:
# how can I sort these numbers?
# by default, sort_values orders them from the smallest (most negative) to the largest
s.sort_values()

f   -41
h   -29
i   -14
a    -6
b    -3
c    14
d    17
e    17
g    33
j    37
dtype: int64

In [42]:
# how can we sort them by absolute value?
# one option: invoke .abs on the series, get the absolute values, and then sort
# the downside: the result contains the absolute values, so the original signs are lost

s.abs().sort_values()

b     3
a     6
c    14
i    14
d    17
e    17
h    29
g    33
j    37
f    41
dtype: int64

In [49]:
# what I really want is to sort them by absolute value *AND* keep the original values

# we can pass a keyword argument of "key" to sort_index or sort_values
# the value for the "key" argument is a function.
# unlike the "key" argument to Python's builtin sorted, that function is *not* invoked
# once per element: it is invoked once, on the entire series, and its result is then
# used for sorting purposes only.
# we don't see the function's output in the returned values.

# in other words, the function that you pass to key *must* be able to work on a Pandas series,
# returning a new series as a result
# (with sort_index, the function is passed the index, rather than the values)

# you can use:
# - a NumPy/Pandas function or method that takes a series
# - your own function/method that takes a series
# - lambda, which lets you create an anonymous function that returns something

# here, the builtin abs works because calling abs() on a series returns a series of absolute values

s.sort_values(key=abs)

b    -3
a    -6
c    14
i   -14
d    17
e    17
h   -29
g    33
j    37
f   -41
dtype: int64

In [50]:
# sort by the final digits

def get_final_digit(a_series):
    """Return a series containing the final digit of each number in a_series.

    Each value is turned into a string, the string's last character is
    retrieved, and that character is turned back into an integer. The result
    has the same index as a_series, so the function can be passed as the
    "key" argument to sort_values.
    """
    as_text = a_series.astype(str)      # e.g., -41 becomes '-41'
    final_char = as_text.str[-1]        # '-41' becomes '1'
    return final_char.astype(int)       # '1' becomes 1


# try the function out on a small series before using it for sorting
s = Series([10, 15, 22, 28])
get_final_digit(s)


0    0
1    5
2    2
3    8
dtype: int64

In [51]:
# recreate the series of seeded random integers, labelled 'a' through 'j'
np.random.seed(0)

s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('abcdefghij'),
)

# pass the function itself as the key -- I'm not invoking it here!
# sort_values will invoke it on our series, and use the result for sorting
s.sort_values(key=get_final_digit)

f   -41
b    -3
g    33
c    14
i   -14
a    -6
d    17
e    17
j    37
h   -29
dtype: int64