In [1]:
# use lambda and .loc to select rows from a Pandas data frame

import pandas as pd
from pandas import Series, DataFrame

In [2]:
# load only the three columns this walkthrough uses, then display the frame
df = pd.read_csv(
    'taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
)
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.80
1,1,0.46,8.30
2,1,0.87,11.00
3,1,2.13,17.16
4,1,1.40,10.30
...,...,...,...
9994,1,2.70,12.30
9995,1,4.50,20.30
9996,1,5.59,22.30
9997,6,1.54,7.80


In [3]:
# how can I get those rows in which the passenger_count was > 4?

# (1) get a boolean series based on passenger_count: comparing the column to 4
#     gives one True/False value per row, True where passenger_count is above 4
df['passenger_count'] > 4

0       False
1       False
2       False
3       False
4       False
        ...  
9994    False
9995    False
9996    False
9997     True
9998    False
Name: passenger_count, Length: 9999, dtype: bool

In [4]:
# (2) use this boolean series to select rows from our data frame, using .loc
# .loc's first argument is a "row selector" -- it can be one of several types: a label, a list or slice of labels,
# a boolean series (as here), or a function

df.loc[df['passenger_count'] > 4]

Unnamed: 0,passenger_count,trip_distance,total_amount
27,5,1.35,8.80
30,6,0.81,9.30
31,5,1.92,15.80
34,5,0.85,9.36
49,5,0.57,5.30
...,...,...,...
9960,5,8.95,35.34
9961,6,2.72,14.16
9963,6,1.52,9.30
9989,6,1.34,8.76


In [6]:
# it turns out that .loc can also be used with a lambda

# when we define a function in Python using "def", we're really doing two things:
# (1) creating a function object
# (2) assigning that function object to a variable

def square(x):
    """Return x raised to the second power."""
    squared = x ** 2
    return squared

square(5)

25

In [7]:
# if we want, we can create the function object *WITHOUT* assigning it to any variable
# that is done with "lambda": parameters go before the colon, and the single expression
# after the colon is what the function returns (there is no "return" keyword)

lambda x: x**2

<function __main__.<lambda>(x)>

In [8]:
# lambda returns a function object -- we refer to this as an anonymous function

# you really aren't supposed to do this: PEP 8 recommends "def" whenever a function is
# given a name, because a lambda's own name stays "<lambda>" in reprs and tracebacks
square2 = lambda x: x**2

In [9]:
# a lambda bound to a variable is called exactly like a function defined with "def"
square2(6)

36

In [11]:
# what if I don't want to assign it -- can I invoke it?
# yes: put the lambda in parentheses, then call the resulting function object directly
# (without the parentheses, the "(7)" would be parsed as part of the lambda's body)

(lambda x: x**2)(7)

49

In [12]:
# lambda is really useful when I want to create a function that'll be passed to another function,
# and I won't want to use that function again

# inside of .loc, I can pass a function object (normally defined with lambda) that takes one argument (the data frame
# that .loc was called on) and returns a row selector -- here, a boolean series indicating, for each row, if we want it or not

df.loc[lambda df_: df_['passenger_count'] > 4]

Unnamed: 0,passenger_count,trip_distance,total_amount
27,5,1.35,8.80
30,6,0.81,9.30
31,5,1.92,15.80
34,5,0.85,9.36
49,5,0.57,5.30
...,...,...,...
9960,5,8.95,35.34
9961,6,2.72,14.16
9963,6,1.52,9.30
9989,6,1.34,8.76


In [13]:
# combine two conditions with & (element-wise "and"); each comparison needs its own
# parentheses, because & binds more tightly than > and <
df.loc[
    lambda df_: (df_['passenger_count'] > 4)
    & (df_['trip_distance'] < 10)
]

Unnamed: 0,passenger_count,trip_distance,total_amount
27,5,1.35,8.80
30,6,0.81,9.30
31,5,1.92,15.80
34,5,0.85,9.36
49,5,0.57,5.30
...,...,...,...
9960,5,8.95,35.34
9961,6,2.72,14.16
9963,6,1.52,9.30
9989,6,1.34,8.76
