# Method chaining in Pandas



In [1]:
# method chaining in Python -- first, a single method call on a string

s = 'abcde fghij'

s.title()    # call a method on s, and I get a new string back, with each word capitalized


'Abcde Fghij'

In [2]:
# what if, after capitalizing each word, I then want to get a list of strings (words) back?
# s.title() returns a string, so I can call .split() directly on that returned value

s.title().split()   # this is method chaining!

['Abcde', 'Fghij']

In [3]:
# sometimes, a method chain can be a bit long and/or complicated
# how can I break it up across lines, to be more readable? (this naive attempt fails on purpose)

s
.title()    # SyntaxError: the statement already ended with `s` above, so a line can't start with `.`
.split()


SyntaxError: invalid syntax (4287639807.py, line 5)

In [5]:
# but we can take advantage of Python's syntax with ()
# inside parentheses, line breaks don't end the expression,
# so each method call can sit on its own line, with its own comment

(
    s
    .title()    # make each word start with a capital letter
    .split()    # get back a list of strings, one per word
)

['Abcde', 'Fghij']

In [6]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [7]:
# new series of 10 random integers, from 0 up to (but not including) 100

np.random.seed(0)    # fix the seed, so the same values come out on every run
random_ints = np.random.randint(low=0, high=100, size=10)
s = Series(data=random_ints, index=list('abcdefghij'))
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [10]:
# how can I get all of the odd numbers from this series?
# s % 2 == 1 gives a boolean series (a mask) that's True wherever the value is odd

s.loc[s % 2 == 1]    # .loc takes a row (element) selector -- here, the boolean mask, keeping only the True elements

b    47
d    67
e    67
f     9
g    83
h    21
j    87
dtype: int64

In [11]:
# what if I only want the first 5 odd numbers from s?

(
    s
    .loc[s % 2 == 1]    # keep only the odd values
    .head(5)            # then keep the first 5 of what's left
)

b    47
d    67
e    67
f     9
g    83
dtype: int64

In [12]:
# let's create a series of strings, one element per word in a sentence

sentence = 'this is a fantastic and exciting and wonderful and terrific sentence for demonstrating chaining'
s = Series(sentence.split())    # splitting on whitespace gives a list of words
s

0              this
1                is
2                 a
3         fantastic
4               and
5          exciting
6               and
7         wonderful
8               and
9          terrific
10         sentence
11              for
12    demonstrating
13         chaining
dtype: object

In [13]:
# how can I get the length of each word? .str.len()
# the .str accessor applies string methods to every element of the series

s.str.len()

0      4
1      2
2      1
3      9
4      3
5      8
6      3
7      9
8      3
9      8
10     8
11     3
12    13
13     8
dtype: int64

In [15]:
# what if I want all of the words that are >= 4 in length?
# s.str.len() >= 4 is a boolean series, which .loc uses as a mask

s.loc[s.str.len() >= 4]

0              this
3         fantastic
5          exciting
7         wonderful
9          terrific
10         sentence
12    demonstrating
13         chaining
dtype: object

In [16]:
# what if I now want only those words in which we have at least 3 vowels?

def count_vowels(one_word):
    """Return the number of vowels (a, e, i, o, u) in one_word.

    Both lowercase and uppercase vowels are counted, so 'Apple' has 2
    vowels, not 1.

    Parameters
    ----------
    one_word : str
        The text to examine, one character at a time.

    Returns
    -------
    int
        How many characters of one_word are vowels; 0 for an empty string.
    """
    # each membership test is True (1) or False (0), so sum() counts the matches
    return sum(one_character in 'aeiouAEIOU'
               for one_character in one_word)

In [17]:
# run count_vowels on each word, getting back a series of vowel counts
s.apply(count_vowels)

0     1
1     1
2     1
3     3
4     1
5     3
6     1
7     3
8     1
9     3
10    3
11    1
12    4
13    3
dtype: int64

In [20]:
# I want to find all of the words that are (a) >= 4 in length and (b) have at least 3 vowels

(
    s
    .loc[s.str.len() >= 4]    # first filter: a mask computed on the original s
    # second filter: .loc calls the lambda with the series produced by the
    # previous step, so the mask is computed on the already-filtered words
    .loc[lambda s_: s_.apply(count_vowels) >= 3]   # s_ is the lambda's parameter, local to that function
)

3         fantastic
5          exciting
7         wonderful
9          terrific
10         sentence
12    demonstrating
13         chaining
dtype: object

In [23]:
# NOTE(review): this is an absolute path on one particular machine --
# point it at your own copy of the file before running
taxi_csv_path = '/Users/reuven/Courses/Current/data/nyc_taxi_2019-01.csv'

# read only the three columns named here, rather than the whole file
taxi_columns = ['passenger_count', 'total_amount', 'trip_distance']

df = pd.read_csv(taxi_csv_path, usecols=taxi_columns)

In [25]:
# let's find all taxi trips in NYC where the passenger_count was > 5 and the total_amount
# was > 100

(
    df
    .loc[df['total_amount'] > 100]

    # use a lambda for the second filter, so that the mask is computed on the
    # rows that survived the first filter, rather than on all of df
    .loc[lambda df_: df_['passenger_count'] > 5]
)

Unnamed: 0,passenger_count,trip_distance,total_amount
28869,6,40.77,168.80
49225,8,5.08,109.56
64401,6,21.24,113.64
71391,6,31.10,104.47
73967,6,24.58,126.96
...,...,...,...
7516689,6,21.30,114.24
7566742,6,44.58,238.80
7649986,6,30.72,124.56
7655036,6,31.14,110.08


In [26]:
# what if I want to find out how much the person spent per mile?

# I could assign it! But that would modify df:
# df['amount_per_mile'] = df['total_amount'] / df['trip_distance']

# so let's just look at the result of the division, without storing it
# (inf means a nonzero total_amount was divided by a trip_distance of 0)
df['total_amount'] / df['trip_distance']

0          6.633333
1          6.269231
2               inf
3               inf
4               inf
             ...   
7667787    4.835073
7667788         NaN
7667789         NaN
7667790         NaN
7667791         NaN
Length: 7667792, dtype: float64

In [30]:
(
    df

    # add the per-mile cost as a new column; the lambda gets the data frame
    # that .assign was called on (a nonzero total_amount with a
    # trip_distance of 0 produces inf)
    .assign(amount_per_mile=lambda df_: df_['total_amount'] / df_['trip_distance'])

    # keep the rows where amount_per_mile > 5 -- which includes the inf rows
    .loc[lambda df_: df_['amount_per_mile'] > 5]
)

Unnamed: 0,passenger_count,trip_distance,total_amount,amount_per_mile
0,1,1.50,9.95,6.633333
1,1,2.60,16.30,6.269231
2,3,0.00,5.80,inf
3,5,0.00,7.55,inf
4,5,0.00,55.55,inf
...,...,...,...,...
7667780,3,1.14,9.96,8.736842
7667781,3,1.89,10.80,5.714286
7667784,1,1.34,9.30,6.940299
7667785,1,1.45,14.16,9.765517
