# Agenda

- Strings
- Cleaning data
- Datetimes
- Grouping + pivot tables
- Join + merge
- Plotting

URL: https://python.lerner.co.il

In [1]:
!ls *.csv

burrito_current.csv	   languages.csv  titanic3.csv
celebrity_deaths_2016.csv  taxi.csv


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('taxi.csv')

In [4]:
df.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID                 int64
store_and_fwd_flag        object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object

In [5]:
df.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3
5,1,2015-06-02 11:19:33,2015-06-02 11:28:48,1,1.4,-73.944641,40.779465,1,N,-73.961365,40.771561,1,8.0,0.0,0.5,1.75,0.0,0.3,10.55
6,1,2015-06-02 11:19:34,2015-06-02 11:38:46,1,1.8,-73.992867,40.748211,1,N,-73.969772,40.748459,1,12.5,0.0,0.5,3.0,0.0,0.3,16.3
7,1,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.9,-73.863075,40.769253,1,N,-73.98671,40.761307,1,52.5,0.0,0.5,15.0,5.54,0.3,73.84
8,2,2015-06-02 11:19:36,2015-06-02 11:45:19,1,1.27,-73.991432,40.749306,1,N,-73.985062,40.759525,2,15.0,0.0,0.5,0.0,0.0,0.3,15.8
9,1,2015-06-02 11:19:38,2015-06-02 11:23:50,1,0.6,-73.970734,40.796207,1,N,-73.97747,40.789509,1,5.0,0.0,0.5,0.5,0.0,0.3,6.3


In [6]:
# .loc -- with the index

df.loc[2]

VendorID                                   2
tpep_pickup_datetime     2015-06-02 11:19:31
tpep_dropoff_datetime    2015-06-02 11:30:30
passenger_count                            1
trip_distance                           0.87
pickup_longitude                  -73.978111
pickup_latitude                    40.738434
RateCodeID                                 1
store_and_fwd_flag                         N
dropoff_longitude                 -73.990273
dropoff_latitude                   40.745438
payment_type                               1
fare_amount                              8.0
extra                                    0.0
mta_tax                                  0.5
tip_amount                               2.2
tolls_amount                             0.0
improvement_surcharge                    0.3
total_amount                            11.0
Name: 2, dtype: object

In [7]:
# .loc -- with fancy indexing

df.loc[ [2, 4]] 

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [8]:
# .loc takes two selectors separated by a comma:
# the first picks rows (by index label), the second picks columns (by name)
df.loc[[2, 4], ['passenger_count', 'trip_distance', 'total_amount']]


Unnamed: 0,passenger_count,trip_distance,total_amount
2,1,0.87,11.0
4,1,1.4,10.3


In [9]:
# .loc with a boolean index for rows
# The comparison produces a boolean series (one True/False per row);
# .loc keeps only the rows where it is True.

df.loc[
    df['passenger_count'] == 4     # True for rows with exactly 4 passengers
]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
7,1,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.90,-73.863075,40.769253,1,N,-73.986710,40.761307,1,52.5,0.0,0.5,15.00,5.54,0.3,73.84
107,2,2015-06-02 11:20:50,2015-06-02 11:33:48,4,1.48,-73.965858,40.758888,1,N,-73.965858,40.758888,1,10.0,0.0,0.5,2.16,0.00,0.3,12.96
128,2,2015-06-02 11:21:07,2015-06-02 11:44:10,4,3.46,-73.981262,40.749622,1,N,-74.010216,40.720074,1,16.0,0.0,0.5,3.36,0.00,0.3,20.16
130,2,2015-06-02 11:21:09,2015-06-02 11:24:48,4,0.63,-73.962616,40.772381,1,N,-73.957092,40.780083,2,4.5,0.0,0.5,0.00,0.00,0.3,5.30
135,1,2015-06-02 11:21:13,2015-06-02 11:42:13,4,6.20,-73.989975,40.750702,1,N,-74.014153,40.712116,2,22.5,0.0,0.5,0.00,0.00,0.3,23.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9565,2,2015-06-01 00:10:46,2015-06-01 00:17:33,4,1.38,-73.986282,40.755833,1,N,-73.985001,40.768688,2,7.0,0.5,0.5,0.00,0.00,0.3,8.30
9642,1,2015-06-01 00:11:41,2015-06-01 00:36:27,4,6.70,-73.984970,40.736481,1,N,-73.947731,40.657990,1,23.5,0.5,0.5,4.95,0.00,0.3,29.75
9661,1,2015-06-01 00:11:17,2015-06-01 00:28:52,4,4.50,-73.955803,40.714054,1,N,-73.953705,40.670975,1,16.5,0.5,0.5,3.56,0.00,0.3,21.36
9744,1,2015-06-01 00:12:19,2015-06-01 00:19:43,4,1.50,-73.986771,40.721313,1,N,-73.983368,40.738445,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80


In [10]:
# boolean index as the row selector, list of names as the column selector
has_four_passengers = df['passenger_count'] == 4
df.loc[has_four_passengers, ['trip_distance', 'total_amount']]

Unnamed: 0,trip_distance,total_amount
7,11.90,73.84
107,1.48,12.96
128,3.46,20.16
130,0.63,5.30
135,6.20,23.30
...,...,...
9565,1.38,8.30
9642,6.70,29.75
9661,4.50,21.36
9744,1.50,8.80


In [11]:
# fillna, dropna -- deal with NaN values

In [12]:
from pandas import Series, DataFrame

In [13]:
s = Series('this is a bunch of words for my course'.split())

In [14]:
s

0      this
1        is
2         a
3     bunch
4        of
5     words
6       for
7        my
8    course
dtype: object

In [15]:
# what is the length of each word?

# A Series has no len() method of its own -- string methods are reached
# through the .str accessor.  The failing call is kept as a demonstration,
# but the exception is caught and displayed so that "Restart & Run All"
# does not stop at this cell.
try:
    s.len()
except AttributeError as e:
    print(f'{type(e).__name__}: {e}')

AttributeError: 'Series' object has no attribute 'len'

In [16]:
s.str.len()   # run len() on every value

0    4
1    2
2    1
3    5
4    2
5    5
6    3
7    2
8    6
dtype: int64

# What methods do we have?

- All Python string methods
- Operators such as `[]` are implemented as methods
- Other methods from other languages

In [19]:
# I want all of the words in s whose length is greater than the mean

s.str.len() > s.str.len().mean()

0     True
1    False
2    False
3     True
4    False
5     True
6    False
7    False
8     True
dtype: bool

In [20]:
s.loc[   s.str.len() > s.str.len().mean()     ]

0      this
3     bunch
5     words
8    course
dtype: object

In [27]:
# .str.get -- instead of []
# .str.contains -- instead of in

s.str.get(0).str.contains('a')

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

In [28]:
# [aeiou] is a regex character class: is the first character of each word a vowel?
s.str.get(0).str.contains('[aeiou]', regex=True)

0    False
1     True
2     True
3    False
4     True
5    False
6    False
7    False
8    False
dtype: bool

In [29]:
s = Series('10 20 30 hello 40 50 goodbye'.split())
s

0         10
1         20
2         30
3      hello
4         40
5         50
6    goodbye
dtype: object

In [30]:
# Some of the strings are not integers, so converting the whole series fails.
# The failing call is kept as a demonstration, but the exception is caught and
# displayed so that "Restart & Run All" does not stop at this cell.
try:
    s.astype(int)
except ValueError as e:
    print(f'{type(e).__name__}: {e}')

ValueError: invalid literal for int() with base 10: 'hello'

In [31]:
# find all int'able strings
# keep just them
# convert to ints

s.str.isdigit()

0     True
1     True
2     True
3    False
4     True
5     True
6    False
dtype: bool

In [33]:
s.loc[s.str.isdigit()].astype(int)

0    10
1    20
2    30
4    40
5    50
dtype: int64

In [36]:
s.loc[s.str.isdigit()].astype(int)

0    10
1    20
2    30
4    40
5    50
dtype: int64

In [38]:
# how long is our series?
len(s) #  -- how many are there?

7

In [40]:
s.count()   # how many non-NaN values are there?

7

In [41]:
# fastest answer comes from

len(s.index)

7

In [42]:
s

0         10
1         20
2         30
3      hello
4         40
5         50
6    goodbye
dtype: object

In [43]:
s.str.contains('e')

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

# Exercise: Working with strings

1. Ask the user to enter a string containing both integers and non-integer words, separated by spaces.
2. What is the mean length of the words that are not digits/numbers?
3. What is the longest word from those that are not digits/numbers?
4. What is the mean of the digits/numbers we entered?

In [44]:
text = input('Enter some numbers and words: ').strip()

s = Series(text.split())


Enter some numbers and words: this is a bunch of numbers 10 15 500 30 so there


In [45]:
s

0        this
1          is
2           a
3       bunch
4          of
5     numbers
6          10
7          15
8         500
9          30
10         so
11      there
dtype: object

In [54]:
# What is the mean length of the words that are not digits/numbers?

is_word = ~s.str.isdigit()                # True for entries that are not all digits
word_lengths = s.loc[is_word].str.len()   # length of each non-digit word
word_lengths.mean()

3.5

In [57]:
# What is the longest word from those that are not digits/numbers?

words = s.loc[~s.str.isdigit()]        # keep only non-digit words
word_lengths = words.str.len()         # length of each remaining word
max_word_length = word_lengths.max()   # length of the longest word

# Select from the filtered words rather than from s: a number that happens to
# have as many characters as the longest word must not appear in the answer.
words.loc[word_lengths == max_word_length]

5    numbers
dtype: object

In [60]:
# What is the mean of the digits/numbers we entered?
# describe() returns a summary series; the mean is one of its rows,
# next to count, std, min, the quartiles and max.

(
    s                        # start with series of words
    .loc[s.str.isdigit()]    # keep only digit words
    .astype(int)             # get a series of ints
    .describe()              # summary statistics, including the mean
)

count      4.000000
mean     138.750000
std      240.983229
min       10.000000
25%       13.750000
50%       22.500000
75%      147.500000
max      500.000000
dtype: float64

In [61]:
import numpy as np

s = Series([10, 20, 30, np.nan, 50, 60])
s

0    10.0
1    20.0
2    30.0
3     NaN
4    50.0
5    60.0
dtype: float64

In [62]:
s.mean()

34.0

In [64]:
# s.count -- doesn't include NaN

s.sum() / s.count()

34.0

In [65]:
# fillna returns a series in which every NaN is replaced by the value we pass
# here that value is an arbitrary placeholder (9999); the next cell uses the mean instead

s.fillna(9999)

0      10.0
1      20.0
2      30.0
3    9999.0
4      50.0
5      60.0
dtype: float64

In [66]:
s.fillna(s.mean())    # replace the NaN values with the mean

0    10.0
1    20.0
2    30.0
3    34.0
4    50.0
5    60.0
dtype: float64

In [67]:
s = Series([10, 20, np.nan, 40, 50, 60, 70, 80, np.nan, 100])
s.fillna(s.mean())

0     10.00
1     20.00
2     53.75
3     40.00
4     50.00
5     60.00
6     70.00
7     80.00
8     53.75
9    100.00
dtype: float64

In [68]:
# interpolation
# s.interpolate

s.interpolate()

0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64

In [69]:
s = Series([10, 20, np.nan, np.nan, 50, 60, 70, 80, np.nan, 100])
s

0     10.0
1     20.0
2      NaN
3      NaN
4     50.0
5     60.0
6     70.0
7     80.0
8      NaN
9    100.0
dtype: float64

In [70]:
s.interpolate()

0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64

In [73]:
# Build a small demo frame (rows a-d, columns v-z) with a few missing values.
# A seeded generator makes the random integers identical on every run.
rng = np.random.default_rng(0)
df = DataFrame(rng.integers(0, 100, size=(4, 5)),
               index=list('abcd'),
               columns=list('vwxyz'))

# NaN is a float.  Convert the columns that will receive NaN to float
# explicitly, instead of relying on pandas silently upcasting an integer
# column during assignment (deprecated since pandas 2.1).
df = df.astype({'x': float, 'y': float, 'z': float})

df.loc['a', 'x'] = np.nan
df.loc['a', 'z'] = np.nan
df.loc['b', 'y'] = np.nan
df.loc['c', 'z'] = np.nan
df

Unnamed: 0,v,w,x,y,z
a,20,96,,9.0,
b,92,33,26.0,,71.0
c,53,66,16.0,82.0,
d,75,28,59.0,78.0,14.0


In [74]:
# dropna removes any row with *ANY* nan
df.dropna()

Unnamed: 0,v,w,x,y,z
d,75,28,59.0,78.0,14.0


In [75]:
# thresh says: how many non-NaN values do we need?
df.dropna(thresh=3)

Unnamed: 0,v,w,x,y,z
a,20,96,,9.0,
b,92,33,26.0,,71.0
c,53,66,16.0,82.0,
d,75,28,59.0,78.0,14.0


In [76]:
df.dropna(thresh=4)

Unnamed: 0,v,w,x,y,z
b,92,33,26.0,,71.0
c,53,66,16.0,82.0,
d,75,28,59.0,78.0,14.0


In [77]:
# we only care about NaN in column z

df.dropna(subset=['z'])

Unnamed: 0,v,w,x,y,z
b,92,33,26.0,,71.0
d,75,28,59.0,78.0,14.0


In [79]:
s = Series('10a 20b 30c 40d 100e'.split())
s

0     10a
1     20b
2     30c
3     40d
4    100e
dtype: object

In [82]:
# text[start:end] is a slice in classic Python
# .str.slice does the same thing as a .str method

s.str.slice(0, -1)  # return from the start until (not including) the final character

0     10
1     20
2     30
3     40
4    100
dtype: object

In [83]:
(
    s
    .str.slice(0, -1)
    .astype(int)
)

0     10
1     20
2     30
3     40
4    100
dtype: int64

In [84]:
# what if we don't know how many letters there are, or where they are?

s = Series('ab10 c20 30 defg40hij'.split())
s

0         ab10
1          c20
2           30
3    defg40hij
dtype: object

In [87]:
# \D* matches a (possibly empty) run of non-digits, (\d+) captures a run of digits;
# replacing the whole match with \1 leaves only the captured digits
s.str.replace(r'\D*(\d+)\D*', r'\1', regex=True)

0    10
1    20
2    30
3    40
dtype: object

In [89]:
import string

# remove any leading/trailing characters from ascii_lowercase
s.str.strip(string.ascii_lowercase)

0    10
1    20
2    30
3    40
dtype: object

In [90]:
s.str.strip('a')

0          b10
1          c20
2           30
3    defg40hij
dtype: object

In [91]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [92]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [93]:
help(s.str.extract)

Help on method extract in module pandas.core.strings.accessor:

extract(pat: 'str', flags: 'int' = 0, expand: 'bool' = True) -> 'DataFrame | Series | Index' method of pandas.core.strings.accessor.StringMethods instance
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame

# Date and times

- Point in time -- `timestamp` or `datetime`
- Span of time -- `timedelta` or `interval`

### Date math

- `datetime` + `timedelta` = `datetime`
- `datetime` - `timedelta` = `datetime`
- `datetime` - `datetime` = `timedelta`

In [94]:
# how can we get datetime

df = pd.read_csv('taxi.csv')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [95]:
df.dtypes 

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID                 int64
store_and_fwd_flag        object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object

In [96]:
# pd.to_datetime takes a series of strings as input
# and returns a series of datetime

pd.to_datetime(df['tpep_pickup_datetime'])

0      2015-06-02 11:19:29
1      2015-06-02 11:19:30
2      2015-06-02 11:19:31
3      2015-06-02 11:19:31
4      2015-06-02 11:19:32
               ...        
9994   2015-06-01 00:12:59
9995   2015-06-01 00:12:59
9996   2015-06-01 00:13:00
9997   2015-06-01 00:13:02
9998   2015-06-01 00:13:04
Name: tpep_pickup_datetime, Length: 9999, dtype: datetime64[ns]

In [97]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

In [98]:
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime            object
passenger_count                   int64
trip_distance                   float64
pickup_longitude                float64
pickup_latitude                 float64
RateCodeID                        int64
store_and_fwd_flag               object
dropoff_longitude               float64
dropoff_latitude                float64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

In [99]:
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

In [100]:
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
pickup_longitude                float64
pickup_latitude                 float64
RateCodeID                        int64
store_and_fwd_flag               object
dropoff_longitude               float64
dropoff_latitude                float64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

In [101]:
df = pd.read_csv('taxi.csv',
                 parse_dates=['tpep_pickup_datetime',
                             'tpep_dropoff_datetime'])
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
pickup_longitude                float64
pickup_latitude                 float64
RateCodeID                        int64
store_and_fwd_flag               object
dropoff_longitude               float64
dropoff_latitude                float64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

In [104]:
# .dt is the accessor for datetime values

df['tpep_pickup_datetime'].dt.second

0       29
1       30
2       31
3       31
4       32
        ..
9994    59
9995    59
9996     0
9997     2
9998     4
Name: tpep_pickup_datetime, Length: 9999, dtype: int32

In [109]:
df['tpep_pickup_datetime'].dt.is_month_start

0       False
1       False
2       False
3       False
4       False
        ...  
9994     True
9995     True
9996     True
9997     True
9998     True
Name: tpep_pickup_datetime, Length: 9999, dtype: bool

In [110]:
df['tpep_pickup_datetime'].dt.day_of_week

0       1
1       1
2       1
3       1
4       1
       ..
9994    0
9995    0
9996    0
9997    0
9998    0
Name: tpep_pickup_datetime, Length: 9999, dtype: int32

In [111]:
df['tpep_pickup_datetime'].dt.day_name()

0       Tuesday
1       Tuesday
2       Tuesday
3       Tuesday
4       Tuesday
         ...   
9994     Monday
9995     Monday
9996     Monday
9997     Monday
9998     Monday
Name: tpep_pickup_datetime, Length: 9999, dtype: object

# Exercise: Dates and taxis

1. Load `taxi.csv` into a data frame, setting `tpep_pickup_datetime` and `tpep_dropoff_datetime` as `datetime` values.
2. What was the mean `total_amount` for trips whose pickup time was before 12:00 (pickup hours 0 through 11)?
3. What was the mean `trip_distance` for trips that took place on Tuesday?

In [112]:
df = pd.read_csv('taxi.csv',
                parse_dates=['tpep_pickup_datetime',
                            'tpep_dropoff_datetime'],
                usecols=['tpep_pickup_datetime',
                        'tpep_dropoff_datetime',
                        'trip_distance', 'total_amount', 'passenger_count'])
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.80
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.30
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.00
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,10.30
...,...,...,...,...,...
9994,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,12.30
9995,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,20.30
9996,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,22.30
9997,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,7.80


In [113]:
df.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
total_amount                    float64
dtype: object

In [119]:
# 2. What was the mean `total_amount` for trips that took place before 12:00? (From 0 - 12)

pickup_hour = df['tpep_pickup_datetime'].dt.hour   # hour of day of each pickup
df.loc[pickup_hour < 12, 'total_amount'].mean()    # rows before noon, one column, then mean

17.420879297732263

In [120]:


# for comparison: mean `total_amount` for trips picked up at 12:00 or later
(
    df
    .loc[
        df['tpep_pickup_datetime'].dt.hour >= 12,     # row selector
        'total_amount']       # column selector
    .mean()
)

17.836744627054365

In [125]:
# 3. What was the mean `trip_distance` for trips that took place on Tuesday?

(
    df
    .loc[
        df['tpep_pickup_datetime'].dt.day_name() == 'Tuesday' ,   # row selector
        'trip_distance'  # column selector
          ]
    .mean()
)

2.6611988171064604

In [127]:
# the before-12:00 mean again, written as two chained [] lookups:
# first retrieve the column, then apply the boolean index to that series
df['total_amount'][df['tpep_pickup_datetime'].dt.hour < 12].mean()


17.420879297732263

In [128]:
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3


In [129]:
# how not to update things!
# df.loc[2] first produces the row as a separate object, and the second []
# then assigns into that object.  pandas warns that this object may be a copy,
# in which case df itself is left unchanged.
# NOTE(review): the single-step form would be df.loc[2, 'passenger_count'] = 2
df.loc[2]['passenger_count'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[2]['passenger_count'] = 2
