# Agenda: Sorting

1. Series
    - Sort by index
    - Sort by values
2. Data frames
    - Sorting by index
    - Sorting by one column
    - Sorting by multiple columns

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
# create a series of 10 random integers, each at least -50 and less than 50
# (the upper bound given to randint is exclusive); the seed makes them repeatable

np.random.seed(0)
s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('acegihfjdb'),
)
s

a    -6
c    -3
e    14
g    17
i    17
h   -41
f    33
j   -29
d   -14
b    37
dtype: int64

In [3]:
# retrieve via the index with .loc
s.loc['e']

14

In [4]:
# fancy index
s.loc[['e', 'a']]

e    14
a    -6
dtype: int64

In [5]:
# slice
s.loc['e':'h']   # up to and including with .loc

e    14
g    17
i    17
h   -41
dtype: int64

In [6]:
# if we want to sort the index, we can use the .sort_index method
# this method returns a new series, with the same index and values as before,
# but with the index sorted

(
    s
    .sort_index()
    .head()
)

a    -6
b    37
c    -3
d   -14
e    14
dtype: int64

In [7]:
# get specific values from this series
(
    s
    .sort_index()
    .head()
    .loc[['b', 'd']]
)

b    37
d   -14
dtype: int64

In [8]:
# sort + head is very common
(
    s
    .sort_index()
    .head()
)

a    -6
b    37
c    -3
d   -14
e    14
dtype: int64

In [9]:
# sort + tail is very common
(
    s
    .sort_index()
    .tail()
)

f    33
g    17
h   -41
i    17
j   -29
dtype: int64

In [10]:
# if you want (but don't!) you can pass inplace=True to sort_index (and other sorting methods).
# If you do that, you'll get None as a result, and the data structure will be modified in place

s.sort_index(inplace=True)

In [11]:
s

a    -6
b    37
c    -3
d   -14
e    14
f    33
g    17
h   -41
i    17
j   -29
dtype: int64

In [12]:
# let's define the series again (same seed, so the same values),
# but this time with the labels a and b each appearing twice in the index

np.random.seed(0)
s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('aceaihbjdb'),
)
s

a    -6
c    -3
e    14
a    17
i    17
h   -41
b    33
j   -29
d   -14
b    37
dtype: int64

In [13]:
# this is totally fine!
s.loc['a']

a    -6
a    17
dtype: int64

In [14]:
s.loc['b']

b    33
b    37
dtype: int64

In [15]:
s.loc['c']

-3

In [16]:
s.loc['a':'c']

KeyError: "Cannot get left slice bound for non-unique label: 'a'"

In [17]:
# in other words: because 'a' appears in the index more than once, and the index is
# not sorted, Pandas doesn't know which 'a' to start with in our slice!

# what if I try something else? Can I start with 'c' and go to 'b'?
# (this fails as well: 'b' is also repeated, so now the *end* of the slice is ambiguous)
s.loc['c':'b']

KeyError: "Cannot get right slice bound for non-unique label: 'b'"

In [18]:
s

a    -6
c    -3
e    14
a    17
i    17
h   -41
b    33
j   -29
d   -14
b    37
dtype: int64

In [19]:
# we can solve this by sorting our index: once the repeated labels sit
# next to one another, the slice bounds are no longer ambiguous

(
    s
    .sort_index()
    .loc['a':'c']   # label-based slice, up to and including 'c'
)

a    -6
a    17
b    33
b    37
c    -3
dtype: int64

Because now all of the `a` and `b` values are together, there isn't an issue of where to start or stop, and Pandas is happy to give us our slice back.

# Sorting by values

In order to sort by value, we can use the `sort_values` method. This works very similarly to `sort_index`: it gives us back a new series, with the same index + values as before, but now in a new order. And once again, we can (but shouldn't) pass `inplace=True`.

In [22]:
# in Python, the term "comparable" means that we have values that support not only ==, but also <

s.sort_values()

h   -41
j   -29
d   -14
a    -6
c    -3
e    14
a    17
i    17
b    33
b    37
dtype: int64

In [23]:
# what happens if we have a mix of different types?

s = Series([10, 5, 15, 'b', 'c', 'a'])
s.sort_values()

TypeError: '<' not supported between instances of 'str' and 'int'

In [24]:
help(s.sort_values)

Help on method sort_values in module pandas.core.series:

sort_values(*, axis: 'Axis' = 0, ascending: 'bool | Sequence[bool]' = True, inplace: 'bool' = False, kind: 'SortKind' = 'quicksort', na_position: 'NaPosition' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc | None' = None) -> 'Series | None' method of pandas.core.series.Series instance
    Sort by the values.

    Sort a Series in ascending or descending order by some
    criterion.

    Parameters
    ----------
    axis : {0 or 'index'}
        Unused. Parameter needed for compatibility with DataFrame.
    ascending : bool or list of bools, default True
        If True, sort values in ascending order, otherwise descending.
    inplace : bool, default False
        If True, perform operation in-place.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
        Choice of sorting algorithm. See also :func:`numpy.sort` for more
        information. 'mergesort' and 'stable' are the only stabl

# Exercise: Series sorting

1. Create a series based on members of your family (or friends). The index should be strings (their names), and the values will be their ages (integers).
2. Sort by the names. What is the mean age of the first 3 people, alphabetically?
3. Sort by the ages. What are the names of the eldest and youngest people in your series?

In [25]:
# ages (the values) labelled by first name (the index)
s = Series(
    data=[53, 23, 21, 18],
    index='Reuven Atara Shikma Amotz'.split(),
)
s

Reuven    53
Atara     23
Shikma    21
Amotz     18
dtype: int64

In [28]:
(
    s
    .sort_index()
    .head(3)
    .mean()
)

31.333333333333332

In [37]:
(
    s
    .sort_values()
    .iloc[[0, -1]]   # give me the first and last elements
    .index
)

Index(['Amotz', 'Reuven'], dtype='object')

In [32]:
(
    s
    .idxmin()
)

'Amotz'

In [33]:
(
    s
    .idxmax()
)

'Reuven'

In [34]:
# what if we want both the min and the max at the same time?
# we can use the .agg method, which takes a list of strings describing the methods we want to run

(
    s
    .agg(['idxmin', 'idxmax'])
)

idxmin     Amotz
idxmax    Reuven
dtype: object

In [38]:
# what if we want to sort in descending order?
# so far, we've seen that both sort_index and sort_values sort in ascending order;
# passing ascending=False reverses that

s.sort_values(ascending=False)

Reuven    53
Atara     23
Shikma    21
Amotz     18
dtype: int64

In [39]:
s.sort_index(ascending=False)

Shikma    21
Reuven    53
Atara     23
Amotz     18
dtype: int64

# How can we change the way that values are compared?



In [40]:
# ten seeded random integers (at least -50, less than 50),
# labelled 'a' through 'j' in alphabetical order
np.random.seed(0)

s = Series(
    data=np.random.randint(low=-50, high=50, size=10),
    index=list('abcdefghij'),
)
s

a    -6
b    -3
c    14
d    17
e    17
f   -41
g    33
h   -29
i   -14
j    37
dtype: int64

In [41]:
# how can I sort these numbers?
s.sort_values()

f   -41
h   -29
i   -14
a    -6
b    -3
c    14
d    17
e    17
g    33
j    37
dtype: int64

In [42]:
# how can we sort them by absolute value?
# one option: invoke .abs on the series, get the absolute values, and then sort

s.abs().sort_values()

b     3
a     6
c    14
i    14
d    17
e    17
h    29
g    33
j    37
f    41
dtype: int64

In [49]:
# what I really want is to sort them by absolute value *AND* keep the original values

# we can pass a keyword argument of "key" to sort_index or sort_values
# the value for the "key" argument is a function.
# that function is invoked once, on the whole series (not on each element separately, as
# with the "key" argument to Python's builtin sorted), and its result is then used for
# sorting purposes only
# we don't see the function's output in the returned values.

# the function that you pass to key *must* be able to work on a Pandas series,
# returning a new series as a result

# you can use:
# - a NumPy/Pandas method that takes a series
# - your own function/method that takes a series
# - lambda, which lets you create an anonymous function that returns something

# here we pass Python's builtin abs, which, when called on a series, returns a series of absolute values
s.sort_values(key=abs)

b    -3
a    -6
c    14
i   -14
d    17
e    17
h   -29
g    33
j    37
f   -41
dtype: int64

In [50]:
# sort by the final digits

def get_final_digit(a_series):
    """Return the final digit of every value in a series.

    Each value is converted to its string form, the last character of that
    string is taken, and that character is converted back to an integer.
    The sign of a negative number is therefore ignored (-41 gives 1).

    Parameters
    ----------
    a_series : Series
        The values to examine. The last character of each value's string
        form must be a digit (as it is for integers).

    Returns
    -------
    Series
        The final digits, as integers, with the same index as ``a_series``.
    """
    as_text = a_series.astype(str)       # e.g. -41 -> '-41'
    last_char = as_text.str.get(-1)      # '-41' -> '1'
    return last_char.astype(int)         # '1' -> 1

s = Series([10, 15, 22, 28])
get_final_digit(s)  


0    0
1    5
2    2
3    8
dtype: int64

In [51]:
np.random.seed(0)

s = Series(np.random.randint(-50, 50, 10),
           index=list('abcdefghij'))

s.sort_values(key=get_final_digit)   # I'm not invoking the function! sort_values will invoke it on our series, and use the result for sorting

f   -41
b    -3
g    33
c    14
i   -14
a    -6
d    17
e    17
j    37
h   -29
dtype: int64

In [54]:
# if we want, we can also use lambda to create an anonymous function
# the function is defined inline; its body must be a single expression (it cannot
# include any statements), although that expression may be wrapped across several lines

# kind='stable' asks for a stable sort: values whose keys tie (e.g. -3 and 33, which
# both end in 3) keep the relative order they had in the original series
s.sort_values(key=lambda a_series: a_series.astype(str).str.get(-1).astype(int),
             kind='stable')

f   -41
b    -3
g    33
c    14
i   -14
a    -6
d    17
e    17
j    37
h   -29
dtype: int64

# EuroPython talk: How to sort anything

https://www.youtube.com/watch?v=Z3c2LvEJeu0

# Data frames

In many ways, sorting data frames is the same as sorting series:

- We can sort by the index
- We can sort by a column

Where things get more interesting is when we sort by more than one column, basically telling Pandas how we want to break ties.

In [55]:
filename = '../data/taxi.csv'
df = pd.read_csv(filename)
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [56]:
# I'm going to make the tpep_pickup_datetime column into the index for my data frame
# I can do this with the set_index method
# just like sort_* methods, set_index *can* take inplace=True, but we should not do it

# note: the result is assigned back to df, so afterwards tpep_pickup_datetime is the index
# and no longer a column; running this cell a second time (without re-reading the CSV)
# raises a KeyError
df = df.set_index('tpep_pickup_datetime')
df

Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-02 11:19:29,2,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
2015-06-02 11:19:30,2,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2015-06-02 11:19:31,2,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
2015-06-02 11:19:31,2,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
2015-06-02 11:19:32,1,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-01 00:12:59,1,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
2015-06-01 00:12:59,1,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
2015-06-01 00:13:00,2,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
2015-06-01 00:13:02,2,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [57]:
# what if I want to retrieve a slice of values from my data frame?

df.loc['2015-06-01 00:12:59':'2015-06-01 00:13:04']

KeyError: "Cannot get left slice bound for non-unique label: '2015-06-01 00:12:59'"

In [58]:
# we can sort our data frame by the index

(
    df
    .sort_index()
    .loc['2015-06-01 00:12:59':'2015-06-01 00:13:04']
)

Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-01 00:12:59,2,2015-06-01 00:25:04,1,2.89,-74.003098,40.718269,1,N,-73.999634,40.687263,2,11.5,0.5,0.5,0.0,0.0,0.3,12.8
2015-06-01 00:12:59,1,2015-06-01 00:24:18,1,2.7,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3
2015-06-01 00:12:59,1,2015-06-01 00:26:17,1,4.1,-73.994362,40.727089,1,N,-73.993248,40.73317,1,15.0,0.5,0.5,3.26,0.0,0.3,19.56
2015-06-01 00:12:59,1,2015-06-01 00:18:08,1,0.7,-73.985619,40.760563,1,N,-73.986572,40.766663,1,5.5,0.5,0.5,1.35,0.0,0.3,8.15
2015-06-01 00:12:59,2,2015-06-01 00:14:07,1,0.18,-74.005539,40.725544,1,N,-74.002983,40.725056,1,3.0,0.5,0.5,0.86,0.0,0.3,5.16
2015-06-01 00:12:59,1,2015-06-01 00:28:16,1,4.5,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.0,0.0,0.3,20.3
2015-06-01 00:13:00,2,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.0,0.0,0.3,22.3
2015-06-01 00:13:00,2,2015-06-01 00:20:16,1,12.56,-73.948746,40.741535,2,N,-73.971687,40.7439,2,52.0,0.0,0.5,0.0,5.54,0.3,58.34
2015-06-01 00:13:00,2,2015-06-01 00:30:49,1,4.91,-73.985291,40.741871,1,N,-73.935081,40.767193,1,17.0,0.5,0.5,5.49,0.0,0.3,23.79
2015-06-01 00:13:00,2,2015-06-01 00:22:16,1,2.5,-73.985794,40.770344,1,N,-74.003677,40.740116,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8


# Exercise: Sorting cities

Last time, we worked with a JSON file containing population information for the 1,000 largest cities in the US. We're going to use that JSON file again, which is at URL:

    https://gist.githubusercontent.com/reuven/77edbb0292901f35019f17edb9794358/raw/2bf258763cdddd704f8ffd3ea9a3e81d25e2c6f6/cities.json

Read that data into a data frame, and then:
1. Turn the city name into the index. What is the mean population for the first 20 cities, alphabetically?
2. Turn the state name into the index. What is the mean population for all cities, alphabetically, from Iowa to Nebraska?
3. Turn the population into the index. What is the mean latitude for the 50 largest cities vs. the 50 smallest cities?

In [59]:
url = 'https://gist.githubusercontent.com/reuven/77edbb0292901f35019f17edb9794358/raw/2bf258763cdddd704f8ffd3ea9a3e81d25e2c6f6/cities.json'

df = pd.read_json(url)
df

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,40.712784,-74.005941,8405837,1,New York
1,Los Angeles,4.8%,34.052234,-118.243685,3884307,2,California
2,Chicago,-6.1%,41.878114,-87.629798,2718782,3,Illinois
3,Houston,11.0%,29.760427,-95.369803,2195914,4,Texas
4,Philadelphia,2.6%,39.952584,-75.165222,1553165,5,Pennsylvania
...,...,...,...,...,...,...,...
995,Weslaco,28.8%,26.159519,-97.990837,37093,996,Texas
996,Keizer,14.4%,44.990119,-123.026208,37064,997,Oregon
997,Spanish Fork,78.1%,40.114955,-111.654923,36956,998,Utah
998,Beloit,2.9%,42.508348,-89.031776,36888,999,Wisconsin


In [64]:
# Turn the city name into the index. What is the mean population for the first 20 cities, alphabetically?

(
    df
    .set_index('city')
    .sort_index()
    .head(20)
    ['population']
    .mean()
)

125541.7

In [65]:
(
    df
    .set_index('city')
    .sort_index()
    .head(20)
    ['population']
    .agg(['mean', 'median'])
)

mean      125541.7
median     80498.0
Name: population, dtype: float64

In [69]:
# Turn the state name into the index. What is the mean population for all cities, alphabetically, from Iowa to Nebraska?

# with the index sorted, .loc can take a label slice for the rows (both ends included)
# and the column name at the same time
(
    df
    .set_index('state')
    .sort_index()
    .loc['Iowa':'Nebraska', 'population']
    .mean()
)

102408.45508982036

In [70]:
# Turn the population into the index. What is the mean latitude for the 50 largest cities vs. the 50 smallest cities?

# sort_index sorts in ascending order by default, so once the index (population) is sorted,
# the first 50 rows are the 50 *smallest* cities
(
    df
    .set_index('population')
    .sort_index()
    ['latitude']
    .head(50)
    .mean()
)

37.303587242

In [71]:
# the index (population) is sorted in ascending order, so the last 50 rows
# are the 50 *largest* cities
(
    df
    .set_index('population')
    .sort_index()
    ['latitude']
    .tail(50)
    .mean()
)

36.838639806

# Sorting by values

We can use the `sort_values` method with a data frame. The only difference is that we need to specify which column (or columns) we want to use for our sorting.

- If we pass a single column name, that is used for sorting the entire data frame, row by row
- If we pass a list of column names, then the first is used for sorting, and if there is a tie on the first, then it uses the second, etc.

In [72]:
df.sort_values('population')

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
999,Panama City,0.1%,30.158813,-85.660206,36877,1000,Florida
998,Beloit,2.9%,42.508348,-89.031776,36888,999,Wisconsin
997,Spanish Fork,78.1%,40.114955,-111.654923,36956,998,Utah
996,Keizer,14.4%,44.990119,-123.026208,37064,997,Oregon
995,Weslaco,28.8%,26.159519,-97.990837,37093,996,Texas
...,...,...,...,...,...,...,...
4,Philadelphia,2.6%,39.952584,-75.165222,1553165,5,Pennsylvania
3,Houston,11.0%,29.760427,-95.369803,2195914,4,Texas
2,Chicago,-6.1%,41.878114,-87.629798,2718782,3,Illinois
1,Los Angeles,4.8%,34.052234,-118.243685,3884307,2,California


In [73]:
df.sort_values('population', ascending=False)

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,40.712784,-74.005941,8405837,1,New York
1,Los Angeles,4.8%,34.052234,-118.243685,3884307,2,California
2,Chicago,-6.1%,41.878114,-87.629798,2718782,3,Illinois
3,Houston,11.0%,29.760427,-95.369803,2195914,4,Texas
4,Philadelphia,2.6%,39.952584,-75.165222,1553165,5,Pennsylvania
...,...,...,...,...,...,...,...
995,Weslaco,28.8%,26.159519,-97.990837,37093,996,Texas
996,Keizer,14.4%,44.990119,-123.026208,37064,997,Oregon
997,Spanish Fork,78.1%,40.114955,-111.654923,36956,998,Utah
998,Beloit,2.9%,42.508348,-89.031776,36888,999,Wisconsin


In [75]:
(
    df
    .sort_values(['state', 'city'])
    [['city', 'state', 'population']]
    .head(20)
)

Unnamed: 0,city,state,population
614,Auburn,Alabama,58582
100,Birmingham,Alabama,212113
652,Decatur,Alabama,55816
501,Dothan,Alabama,68001
921,Florence,Alabama,40059
375,Hoover,Alabama,84126
125,Huntsville,Alabama,186254
810,Madison,Alabama,45799
121,Mobile,Alabama,194899
110,Montgomery,Alabama,201332
