# Data frame

1. Creating a data frame
    - Manually
    - From CSV and other files
3. Methods on a data frame
4. `.loc` and `.iloc`
5. `nan` and data frames
6. Boolean indexes and data frames



In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]])
df

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90
3,100,110,120


In [3]:
df.dtypes

0    int64
1    int64
2    int64
dtype: object

In [4]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'))
df

Unnamed: 0,0,1,2
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [5]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [6]:
# retrieve from a row... using loc/iloc

df.loc['a']

x    10
y    20
z    30
Name: a, dtype: int64

In [7]:
df.iloc[2]

x    70
y    80
z    90
Name: c, dtype: int64

In [8]:
# fancy indexing

df.loc[['a', 'c']]

Unnamed: 0,x,y,z
a,10,20,30
c,70,80,90


In [9]:
# slice

df.loc['a':'c']

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90


In [10]:
df.iloc[0:2]

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60


In [11]:
# columns -- use []

df['x']

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [12]:
# more than one column

df[['x', 'z']]

Unnamed: 0,x,z
a,10,30
b,40,60
c,70,90
d,100,120


In [13]:
# watch out -- slices!

df['b':'d']

Unnamed: 0,x,y,z
b,40,50,60
c,70,80,90
d,100,110,120


In [14]:
df['b':'b']

Unnamed: 0,x,y,z
b,40,50,60


In [15]:
# we can use dots for columns

df.x

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [16]:
# general rule: any series method will also work on a data frame,
# and we will get a result for each column

df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [17]:
df['x'].mean()

np.float64(55.0)

In [18]:
df.mean()

x    55.0
y    65.0
z    75.0
dtype: float64

In [19]:
df.std()

x    38.729833
y    38.729833
z    38.729833
dtype: float64

In [20]:
df.min()

x    10
y    20
z    30
dtype: int64

In [22]:
df.mean(axis='columns')

a     20.0
b     50.0
c     80.0
d    110.0
dtype: float64

In [23]:
df['x'].describe()

count      4.000000
mean      55.000000
std       38.729833
min       10.000000
25%       32.500000
50%       55.000000
75%       77.500000
max      100.000000
Name: x, dtype: float64

In [24]:
df.describe()

Unnamed: 0,x,y,z
count,4.0,4.0,4.0
mean,55.0,65.0,75.0
std,38.729833,38.729833,38.729833
min,10.0,20.0,30.0
25%,32.5,42.5,52.5
50%,55.0,65.0,75.0
75%,77.5,87.5,97.5
max,100.0,110.0,120.0


In [25]:
df['x'] > df['x'].mean()

a    False
b    False
c     True
d     True
Name: x, dtype: bool

In [27]:
# we can use .loc to filter elements with a boolean index
df['x'].loc[   df['x'] > df['x'].mean()   ]

c     70
d    100
Name: x, dtype: int64

In [28]:
# find elements of y where x is greater than the mean
df['y'].loc[   df['x'] > df['x'].mean()   ]

c     80
d    110
Name: y, dtype: int64

In [29]:
# find all rows of df where x > mean
df.loc[   df['x'] > df['x'].mean()   ]

Unnamed: 0,x,y,z
c,70,80,90
d,100,110,120


In [30]:
# change the index

df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [31]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [32]:
df.index = ['one', 'two', 'three', 'four']
df

Unnamed: 0,x,y,z
one,10,20,30
two,40,50,60
three,70,80,90
four,100,110,120


In [33]:
df.columns

Index(['x', 'y', 'z'], dtype='object')

In [34]:
df.columns = ['col1', 'col2', 'col3']
df

Unnamed: 0,col1,col2,col3
one,10,20,30
two,40,50,60
three,70,80,90
four,100,110,120


In [35]:
df['col4'] = [11, 12, 13, 14]

In [36]:
df

Unnamed: 0,col1,col2,col3,col4
one,10,20,30,11
two,40,50,60,12
three,70,80,90,13
four,100,110,120,14


In [37]:
df['col4'] = [110, 120, 130, 140]
df

Unnamed: 0,col1,col2,col3,col4
one,10,20,30,110
two,40,50,60,120
three,70,80,90,130
four,100,110,120,140


In [38]:
df = DataFrame(np.random.randint(0, 1000, [4, 3]),
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,934,178,53
b,153,486,515
c,758,893,113
d,582,51,336


# Exercise: Data frames

1. Create a data frame with random integers from 0-1,000, with 5 rows and 4 columns. Give the rows names of a-e and the columns w-z.
2. Find the mean + std for columns w and y.
3. Find the mean + std + max for rows b, d, and e.
4. Get all rows where y > y.mean().

In [39]:
df = DataFrame(np.random.randint(0, 1000, [5, 4]),
              index=list('abcde'),
              columns=list('wxyz'))
df

Unnamed: 0,w,x,y,z
a,610,14,753,338
b,686,188,174,411
c,214,711,385,998
d,706,713,311,403
e,249,253,263,903


In [41]:
# 2. Find the mean + std for columns w and y.

df[['w', 'y']].mean()

w    493.0
y    377.2
dtype: float64

In [42]:
df[['w', 'y']].std()

w    241.704365
y    223.607245
dtype: float64

In [47]:
# method chaining

(
    df[['w', 'y']]     # just columns w+y
    .describe()        # run describe
    .loc[['mean', 'std']]    # retreive only mean, std
)

Unnamed: 0,w,y
mean,493.0,377.2
std,241.704365,223.607245


In [49]:
# 3. Find the mean + std + max for rows b, d, and e.

(
    df
    .loc[['b', 'd', 'e']]
    .agg(['mean', 'std', 'max'])
)

Unnamed: 0,w,x,y,z
mean,547.0,384.666667,249.333333,572.333333
std,258.269239,286.19632,69.514986,286.393668
max,706.0,713.0,311.0,903.0


In [50]:
# 4. Get all rows where y > y.mean().

(
    df
    .loc[df['y'] > df['y'].mean()]
)

Unnamed: 0,w,x,y,z
a,610,14,753,338
c,214,711,385,998


In [51]:
# https://files.lerner.co.il/data-science-exercise-files.zip

In [54]:
filename = 'taxi.csv'

!head $filename

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954429626464844,40.764141082763672,1,N,-73.974754333496094,40.754093170166016,2,17,0,0.5,0,0,0.3,17.8
2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,.46,-73.971443176269531,40.758941650390625,1,N,-73.978538513183594,40.761909484863281,1,6.5,0,0.5,1,0,0.3,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,.87,-73.978111267089844,40.738433837890625,1,N,-73.990272521972656,40.745437622070313,1,8,0,0.5,2.2,0,0.3,11
2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892333984375,40.773529052734375,1,N,-73.971527099609375,40.760330200195312,1,13.5,0,0.5,2.86,0,0.3,17.16
1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979087829589844,40.776771545410156,1,N,-73.982

In [55]:
df = pd.read_csv(filename)

In [56]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [57]:
df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
mean,1.516652,1.659466,3.158511,-73.014956,40.226521,1.045105,-73.054699,40.248644,1.407741,14.415892,0.118212,0.49745,1.818059,0.400433,0.29979,17.552472
std,0.499748,1.333306,4.037516,8.347871,4.599169,0.302132,8.186847,4.51052,0.501911,12.442624,0.214794,0.037667,2.634469,1.66517,0.010816,15.13799
min,1.0,0.0,0.0,-74.186302,0.0,1.0,-74.277367,0.0,1.0,-7.0,-0.5,-0.5,0.0,0.0,-0.3,-7.8
25%,1.0,1.0,1.0,-73.990997,40.738556,1.0,-73.990261,40.738478,1.0,7.0,0.0,0.5,0.0,0.0,0.3,8.8
50%,2.0,1.0,1.7,-73.979774,40.755909,1.0,-73.978256,40.75634,1.0,10.5,0.0,0.5,1.0,0.0,0.3,12.8
75%,2.0,2.0,3.3,-73.963001,40.770012,1.0,-73.961311,40.771044,2.0,17.0,0.0,0.5,2.46,0.0,0.3,19.8
max,2.0,6.0,64.6,0.0,41.064606,5.0,0.0,41.137344,4.0,250.0,1.0,0.5,42.05,70.0,0.3,252.35


In [58]:
df['trip_distance'].describe()

count    9999.000000
mean        3.158511
std         4.037516
min         0.000000
25%         1.000000
50%         1.700000
75%         3.300000
max        64.600000
Name: trip_distance, dtype: float64

# Exercise: Taxi data

1. Create a data frame from the taxi data.
2. How many trips went 0 miles? (`trip_distance`) How much did people pay, on average, for such trips?
3. How many trips had 0 passengers? (`passenger_count`) How much did people pay, and how far did they go?


In [61]:
df = pd.read_csv(filename)

In [65]:
(
    df
    .loc[df['trip_distance'] == 0]
    ['total_amount']
    .mean()
)

np.float64(31.581940298507465)

In [67]:
(
    df
    .loc[df['passenger_count'] == 0]
    [['total_amount', 'trip_distance']]
    .mean()
)

total_amount     25.57
trip_distance     4.60
dtype: float64

In [68]:
df.count()

VendorID                 9999
tpep_pickup_datetime     9999
tpep_dropoff_datetime    9999
passenger_count          9999
trip_distance            9999
pickup_longitude         9999
pickup_latitude          9999
RateCodeID               9999
store_and_fwd_flag       9999
dropoff_longitude        9999
dropoff_latitude         9999
payment_type             9999
fare_amount              9999
extra                    9999
mta_tax                  9999
tip_amount               9999
tolls_amount             9999
improvement_surcharge    9999
total_amount             9999
dtype: int64

In [69]:
df.shape

(9999, 19)

In [70]:
# number of rows
df.shape[0]

9999

In [71]:
len(df.index)

9999

In [72]:
df.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID                 int64
store_and_fwd_flag        object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object

In [73]:
help(pd.read_csv) 

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(
    filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]',
    *,
    sep: 'str | None | lib.NoDefault' = <no_default>,
    delimiter: 'str | None | lib.NoDefault' = None,
    header: "int | Sequence[int] | None | Literal['infer']" = 'infer',
    names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>,
    index_col: 'IndexLabel | Literal[False] | None' = None,
    usecols: 'UsecolsArgType' = None,
    dtype: 'DtypeArg | None' = None,
    engine: 'CSVEngine | None' = None,
    converters: 'Mapping[Hashable, Callable] | None' = None,
    true_values: 'list | None' = None,
    false_values: 'list | None' = None,
    skipinitialspace: 'bool' = False,
    skiprows: 'list[int] | int | Callable[[Hashable], bool] | None' = None,
    skipfooter: 'int' = 0,
    nrows: 'int | None' = None,
    na_values: 'Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None' = None,
  

In [74]:
!cat /etc/passwd

##
# User Database
# 
# Note that this file is consulted directly only when the system is running
# in single-user mode.  At other times this information is provided by
# Open Directory.
#
# See the opendirectoryd(8) man page for additional information about
# Open Directory.
##
nobody:*:-2:-2:Unprivileged User:/var/empty:/usr/bin/false
root:*:0:0:System Administrator:/var/root:/bin/sh
daemon:*:1:1:System Services:/var/root:/usr/bin/false
_uucp:*:4:4:Unix to Unix Copy Protocol:/var/spool/uucp:/usr/sbin/uucico
_taskgated:*:13:13:Task Gate Daemon:/var/empty:/usr/bin/false
_networkd:*:24:24:Network Services:/var/networkd:/usr/bin/false
_installassistant:*:25:25:Install Assistant:/var/empty:/usr/bin/false
_lp:*:26:26:Printing Services:/var/spool/cups:/usr/bin/false
_postfix:*:27:27:Postfix Mail Server:/var/spool/postfix:/usr/bin/false
_scsd:*:31:31:Service Configuration Service:/var/empty:/usr/bin/false
_ces:*:32:32:Certificate Enrollment Service:/var/empty:/usr/bin/false
_appstore:*:33:33

In [78]:
pd.read_csv(
    '/etc/passwd',
    sep=':',
    comment='#',
    header=None,
    names=['username', 'pw', 'uid', 'gid', 'name', 'homedir', 'shell']

)

Unnamed: 0,username,pw,uid,gid,name,homedir,shell
0,nobody,*,-2,-2,Unprivileged User,/var/empty,/usr/bin/false
1,root,*,0,0,System Administrator,/var/root,/bin/sh
2,daemon,*,1,1,System Services,/var/root,/usr/bin/false
3,_uucp,*,4,4,Unix to Unix Copy Protocol,/var/spool/uucp,/usr/sbin/uucico
4,_taskgated,*,13,13,Task Gate Daemon,/var/empty,/usr/bin/false
...,...,...,...,...,...,...,...
123,_modelmanagerd,*,301,301,Model Manager,/var/db/modelmanagerd,/usr/bin/false
124,_reportsystemmemory,*,302,302,ReportSystemMemory,/var/empty,/usr/bin/false
125,_swtransparencyd,*,303,303,Software Transparency Services,/var/db/swtransparencyd,/usr/bin/false
126,_naturallanguaged,*,304,304,Natural Language Services,/var/db/com.apple.naturallanguaged,/usr/bin/false


In [79]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(
    filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]',
    *,
    sep: 'str | None | lib.NoDefault' = <no_default>,
    delimiter: 'str | None | lib.NoDefault' = None,
    header: "int | Sequence[int] | None | Literal['infer']" = 'infer',
    names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>,
    index_col: 'IndexLabel | Literal[False] | None' = None,
    usecols: 'UsecolsArgType' = None,
    dtype: 'DtypeArg | None' = None,
    engine: 'CSVEngine | None' = None,
    converters: 'Mapping[Hashable, Callable] | None' = None,
    true_values: 'list | None' = None,
    false_values: 'list | None' = None,
    skipinitialspace: 'bool' = False,
    skiprows: 'list[int] | int | Callable[[Hashable], bool] | None' = None,
    skipfooter: 'int' = 0,
    nrows: 'int | None' = None,
    na_values: 'Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None' = None,
  

In [80]:
!ls *.x*

zsh:1: no matches found: *.x*


In [81]:
filename = 'titanic3.xls'

df = pd.read_excel(filename)

df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [82]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [86]:
df = pd.read_csv('taxi.csv',
                usecols=['trip_distance', 'total_amount', 'passenger_count'])
df

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.80
1,1,0.46,8.30
2,1,0.87,11.00
3,1,2.13,17.16
4,1,1.40,10.30
...,...,...,...
9994,1,2.70,12.30
9995,1,4.50,20.30
9996,1,5.59,22.30
9997,6,1.54,7.80


In [87]:
df.mean()

passenger_count     1.659466
trip_distance       3.158511
total_amount       17.552472
dtype: float64

In [88]:
help(df.mean)

Help on method mean in module pandas.core.frame:

mean(
    axis: 'Axis | None' = 0,
    skipna: 'bool' = True,
    numeric_only: 'bool' = False,
    **kwargs
) method of pandas.core.frame.DataFrame instance
    Return the mean of the values over the requested axis.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
        For `Series` this parameter is unused and defaults to 0.

        For DataFrames, specifying ``axis=None`` will apply the aggregation
        across both axes.

        .. versionadded:: 2.0.0

    skipna : bool, default True
        Exclude NA/null values when computing the result.
    numeric_only : bool, default False
        Include only float, int, boolean columns. Not implemented for Series.

    **kwargs
        Additional keyword arguments to be passed to the function.

    Returns
    -------
    Series or scalar

                Examples
                --------
                >>> s = pd.Seri

In [89]:
df.mean(axis='rows')

passenger_count     1.659466
trip_distance       3.158511
total_amount       17.552472
dtype: float64

In [90]:
df.mean(axis='columns')

0        6.810000
1        3.253333
2        4.290000
3        6.763333
4        4.233333
          ...    
9994     5.333333
9995     8.600000
9996     9.630000
9997     5.113333
9998    11.183333
Length: 9999, dtype: float64

In [91]:
df = pd.read_excel('titanic3.xls')
df.mean()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [92]:
df.mean(numeric_only=True)

pclass        2.294882
survived      0.381971
age          29.881135
sibsp         0.498854
parch         0.385027
fare         33.295479
body        160.809917
dtype: float64

In [93]:
len(dir(df))

455

In [94]:
df = pd.read_csv('taxi.csv')

df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [95]:
df['passenger_count'].value_counts()

passenger_count
1    7207
2    1313
5     520
3     406
6     369
4     182
0       2
Name: count, dtype: int64

In [96]:
df['passenger_count'].value_counts() / 10000

passenger_count
1    0.7207
2    0.1313
5    0.0520
3    0.0406
6    0.0369
4    0.0182
0    0.0002
Name: count, dtype: float64

In [97]:
df['passenger_count'].value_counts(normalize=True)

passenger_count
1    0.720772
2    0.131313
5    0.052005
3    0.040604
6    0.036904
4    0.018202
0    0.000200
Name: proportion, dtype: float64

In [98]:
df.value_counts()

VendorID  tpep_pickup_datetime  tpep_dropoff_datetime  passenger_count  trip_distance  pickup_longitude  pickup_latitude  RateCodeID  store_and_fwd_flag  dropoff_longitude  dropoff_latitude  payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  improvement_surcharge  total_amount
1         2015-06-01 00:00:00   2015-06-01 00:06:12    1                1.00           -73.988739        40.756832        1           N                   -73.974701         40.757038         2             6.0          0.5    0.5      0.00        0.00          0.3                    7.30            1
2         2015-06-02 11:23:16   2015-06-02 11:25:01    1                0.00           -73.937614        40.758091        1           N                   -73.937546         40.758114         1             3.0          0.0    0.5      0.00        0.00          0.3                    3.80            1
          2015-06-02 11:23:12   2015-06-02 11:34:35    5                0.97           -73.981522     

# `.loc` -- how to use it?

1. `.loc[row_name]` -- get back one row from a data frame
2. `.loc[[row1, row2...]]` -- get back more than one row
3. `.loc[row1:row2]` -- get back the slice, all rows from row1 to row2, including row2
4. `.loc[boolean_index]` -- get back all rows where the boolean index is `True`

All of the above selects *rows*, because the first argument is the *row selector*. We can pass a second argument, a *column selector*.

5. `.loc[row_selector, column_name]` -- returns rows meeting that criteria + that column


In [101]:
df.loc[0]

VendorID                                   2
tpep_pickup_datetime     2015-06-02 11:19:29
tpep_dropoff_datetime    2015-06-02 11:47:52
passenger_count                            1
trip_distance                           1.63
pickup_longitude                   -73.95443
pickup_latitude                    40.764141
RateCodeID                                 1
store_and_fwd_flag                         N
dropoff_longitude                 -73.974754
dropoff_latitude                   40.754093
payment_type                               2
fare_amount                             17.0
extra                                    0.0
mta_tax                                  0.5
tip_amount                               0.0
tolls_amount                             0.0
improvement_surcharge                    0.3
total_amount                            17.8
Name: 0, dtype: object

In [102]:
df.loc[[0, 1, 2]]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0


In [103]:
df.loc[[0]]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8


In [104]:
df.loc[5000:5003]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
5000,1,2015-06-02 11:33:24,2015-06-02 12:10:02,1,4.6,-73.994026,40.749916,1,N,-73.992226,40.730858,1,23.0,0.0,0.5,7.1,0.0,0.3,30.9
5001,2,2015-06-02 11:33:17,2015-06-02 11:51:09,1,2.27,-73.99221,40.746006,1,N,-73.998291,40.722954,1,13.0,0.0,0.5,1.0,0.0,0.3,14.8
5002,2,2015-06-02 11:33:24,2015-06-02 11:38:42,1,0.89,-73.995903,40.726147,1,N,-73.999367,40.733822,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8
5003,2,2015-06-02 11:33:25,2015-06-02 11:57:10,1,1.33,-73.961464,40.764591,1,N,-73.978889,40.759987,1,14.5,0.0,0.5,3.06,0.0,0.3,18.36


In [106]:
df.loc[  df['passenger_count'] > 4 ]  

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
27,2,2015-06-02 11:19:54,2015-06-02 11:27:46,5,1.35,-73.983185,40.750263,1,N,-73.999237,40.742435,1,7.0,0.0,0.5,1.00,0.00,0.3,8.80
30,2,2015-06-02 11:19:55,2015-06-02 11:32:57,6,0.81,-73.999336,40.754444,1,N,-73.996078,40.748554,2,8.5,0.0,0.5,0.00,0.00,0.3,9.30
31,2,2015-06-02 11:19:56,2015-06-02 11:44:22,5,1.92,-73.989502,40.735802,1,N,-73.991837,40.755692,2,15.0,0.0,0.5,0.00,0.00,0.3,15.80
34,2,2015-06-02 11:19:58,2015-06-02 11:28:29,5,0.85,-74.006416,40.733501,1,N,-73.996292,40.725971,1,7.0,0.0,0.5,1.56,0.00,0.3,9.36
49,2,2015-06-02 11:20:08,2015-06-02 11:24:31,5,0.57,-73.965134,40.769096,1,N,-73.961830,40.773659,2,4.5,0.0,0.5,0.00,0.00,0.3,5.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9960,2,2015-06-01 00:13:02,2015-06-01 00:40:35,5,8.95,-74.000099,40.733120,1,N,-73.876839,40.748112,2,28.5,0.5,0.5,0.00,5.54,0.3,35.34
9961,2,2015-06-01 00:13:05,2015-06-01 00:22:38,6,2.72,-73.923241,40.767677,1,N,-73.956253,40.747513,1,10.5,0.5,0.5,2.36,0.00,0.3,14.16
9963,2,2015-06-01 00:13:07,2015-06-01 00:21:14,6,1.52,-73.972778,40.750256,1,N,-73.989075,40.762695,2,8.0,0.5,0.5,0.00,0.00,0.3,9.30
9989,2,2015-06-01 00:13:41,2015-06-01 00:17:44,6,1.34,-74.005898,40.735851,1,N,-73.991318,40.748177,1,6.0,0.5,0.5,1.46,0.00,0.3,8.76


In [107]:
(
    df
    .loc[5000,
       'passenger_count']
)

np.int64(1)

In [108]:
(
    df
    .loc[5000:5010,
       'passenger_count']
)

5000    1
5001    1
5002    1
5003    1
5004    5
5005    1
5006    2
5007    1
5008    1
5009    3
5010    1
Name: passenger_count, dtype: int64

In [109]:
(
    df
    .loc[5000:5010,
       ['passenger_count', 'total_amount']]
)

Unnamed: 0,passenger_count,total_amount
5000,1,30.9
5001,1,14.8
5002,1,6.8
5003,1,18.36
5004,5,11.8
5005,1,21.8
5006,2,9.8
5007,1,14.8
5008,1,11.8
5009,3,8.16


In [113]:
(
    df
    .loc[5000:5010,
       'passenger_count':'pickup_latitude']
)

Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude
5000,1,4.6,-73.994026,40.749916
5001,1,2.27,-73.99221,40.746006
5002,1,0.89,-73.995903,40.726147
5003,1,1.33,-73.961464,40.764591
5004,5,1.47,-73.976379,40.739849
5005,1,3.0,-73.994278,40.751232
5006,2,1.02,-73.953682,40.785248
5007,1,2.23,-73.973373,40.757881
5008,1,1.2,-73.962486,40.775639
5009,3,0.78,-73.993805,40.72813


In [114]:
df.loc[
    5000:5003   # row selector
]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
5000,1,2015-06-02 11:33:24,2015-06-02 12:10:02,1,4.6,-73.994026,40.749916,1,N,-73.992226,40.730858,1,23.0,0.0,0.5,7.1,0.0,0.3,30.9
5001,2,2015-06-02 11:33:17,2015-06-02 11:51:09,1,2.27,-73.99221,40.746006,1,N,-73.998291,40.722954,1,13.0,0.0,0.5,1.0,0.0,0.3,14.8
5002,2,2015-06-02 11:33:24,2015-06-02 11:38:42,1,0.89,-73.995903,40.726147,1,N,-73.999367,40.733822,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8
5003,2,2015-06-02 11:33:25,2015-06-02 11:57:10,1,1.33,-73.961464,40.764591,1,N,-73.978889,40.759987,1,14.5,0.0,0.5,3.06,0.0,0.3,18.36


In [115]:
df.loc[
    [5000, 5003, 5010]   # row selector
]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
5000,1,2015-06-02 11:33:24,2015-06-02 12:10:02,1,4.6,-73.994026,40.749916,1,N,-73.992226,40.730858,1,23.0,0.0,0.5,7.1,0.0,0.3,30.9
5003,2,2015-06-02 11:33:25,2015-06-02 11:57:10,1,1.33,-73.961464,40.764591,1,N,-73.978889,40.759987,1,14.5,0.0,0.5,3.06,0.0,0.3,18.36
5010,2,2015-06-02 11:33:29,2015-06-02 11:52:28,1,2.12,-74.007118,40.727589,1,N,-73.985893,40.741196,1,13.0,0.0,0.5,3.45,0.0,0.3,17.25


In [117]:
df.loc[5, 'passenger_count'] = 10
df.head(6)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3
5,1,2015-06-02 11:19:33,2015-06-02 11:28:48,10,1.4,-73.944641,40.779465,1,N,-73.961365,40.771561,1,8.0,0.0,0.5,1.75,0.0,0.3,10.55


In [118]:
df.loc[2:20, 'passenger_count'] = 10
df.head(6)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,10,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,10,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,10,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3
5,1,2015-06-02 11:19:33,2015-06-02 11:28:48,10,1.4,-73.944641,40.779465,1,N,-73.961365,40.771561,1,8.0,0.0,0.5,1.75,0.0,0.3,10.55


In [119]:
df.loc[5]['passenger_count'] = 7

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[5]['passenger_count'] = 7


In [120]:
df.loc[5]

VendorID                                   1
tpep_pickup_datetime     2015-06-02 11:19:33
tpep_dropoff_datetime    2015-06-02 11:28:48
passenger_count                           10
trip_distance                            1.4
pickup_longitude                  -73.944641
pickup_latitude                    40.779465
RateCodeID                                 1
store_and_fwd_flag                         N
dropoff_longitude                 -73.961365
dropoff_latitude                   40.771561
payment_type                               1
fare_amount                              8.0
extra                                    0.0
mta_tax                                  0.5
tip_amount                              1.75
tolls_amount                             0.0
improvement_surcharge                    0.3
total_amount                           10.55
Name: 5, dtype: object

In [122]:
(
    df
    .loc[df['passenger_count'] > 5, 'trip_distance']
) = 999



In [123]:
(
    df
    .loc[df['passenger_count'] > 5, 'trip_distance']
) 

2       999.0
3       999.0
4       999.0
5       999.0
6       999.0
        ...  
9953    999.0
9961    999.0
9963    999.0
9989    999.0
9997    999.0
Name: trip_distance, Length: 388, dtype: float64

# Exercise: Two-argument `.loc`

1. Find the mean `trip_distance` and `total_amount` for trips 4,000-6,000.
2. Find mean `total_amount` for all trips where `trip_distance` is within the mean +/- 1 standard deviation.