# Agenda

- Strings
- Cleaning data
- Datetimes
- Grouping + pivot tables
- Join + merge
- Plotting

URL: https://python.lerner.co.il

In [1]:
# Shell escape: list the CSV files available in the current working directory
!ls *.csv

burrito_current.csv	   languages.csv  titanic3.csv
celebrity_deaths_2016.csv  taxi.csv


In [2]:
import pandas as pd

In [3]:
# Load the taxi-ride data into a DataFrame; the cells below all index into df
df = pd.read_csv('taxi.csv')

In [4]:
# Inspect the dtype pandas chose for each column.
# Note that the two tpep_*_datetime columns come back as object, not as datetimes.
df.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID                 int64
store_and_fwd_flag        object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object

In [5]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3
5,1,2015-06-02 11:19:33,2015-06-02 11:28:48,1,1.4,-73.944641,40.779465,1,N,-73.961365,40.771561,1,8.0,0.0,0.5,1.75,0.0,0.3,10.55
6,1,2015-06-02 11:19:34,2015-06-02 11:38:46,1,1.8,-73.992867,40.748211,1,N,-73.969772,40.748459,1,12.5,0.0,0.5,3.0,0.0,0.3,16.3
7,1,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.9,-73.863075,40.769253,1,N,-73.98671,40.761307,1,52.5,0.0,0.5,15.0,5.54,0.3,73.84
8,2,2015-06-02 11:19:36,2015-06-02 11:45:19,1,1.27,-73.991432,40.749306,1,N,-73.985062,40.759525,2,15.0,0.0,0.5,0.0,0.0,0.3,15.8
9,1,2015-06-02 11:19:38,2015-06-02 11:23:50,1,0.6,-73.970734,40.796207,1,N,-73.97747,40.789509,1,5.0,0.0,0.5,0.5,0.0,0.3,6.3


In [6]:
# .loc -- with a single index label: returns that one row as a Series

df.loc[2]

VendorID                                   2
tpep_pickup_datetime     2015-06-02 11:19:31
tpep_dropoff_datetime    2015-06-02 11:30:30
passenger_count                            1
trip_distance                           0.87
pickup_longitude                  -73.978111
pickup_latitude                    40.738434
RateCodeID                                 1
store_and_fwd_flag                         N
dropoff_longitude                 -73.990273
dropoff_latitude                   40.745438
payment_type                               1
fare_amount                              8.0
extra                                    0.0
mta_tax                                  0.5
tip_amount                               2.2
tolls_amount                             0.0
improvement_surcharge                    0.3
total_amount                            11.0
Name: 2, dtype: object

In [7]:
# .loc -- with fancy indexing: pass a list of index labels to get several rows back

df.loc[[2, 4]]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [8]:
# .loc -- takes (a) a row selector, then (b) a column selector

df.loc[
    [2, 4],                                               # (a) rows with these index labels
    ['passenger_count', 'trip_distance', 'total_amount']  # (b) columns with these names
]


Unnamed: 0,passenger_count,trip_distance,total_amount
2,1,0.87,11.0
4,1,1.4,10.3


In [9]:
# .loc with a boolean index for rows:
# the comparison is evaluated first, and its result is used as a boolean index

df.loc[df['passenger_count'] == 4]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
7,1,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.90,-73.863075,40.769253,1,N,-73.986710,40.761307,1,52.5,0.0,0.5,15.00,5.54,0.3,73.84
107,2,2015-06-02 11:20:50,2015-06-02 11:33:48,4,1.48,-73.965858,40.758888,1,N,-73.965858,40.758888,1,10.0,0.0,0.5,2.16,0.00,0.3,12.96
128,2,2015-06-02 11:21:07,2015-06-02 11:44:10,4,3.46,-73.981262,40.749622,1,N,-74.010216,40.720074,1,16.0,0.0,0.5,3.36,0.00,0.3,20.16
130,2,2015-06-02 11:21:09,2015-06-02 11:24:48,4,0.63,-73.962616,40.772381,1,N,-73.957092,40.780083,2,4.5,0.0,0.5,0.00,0.00,0.3,5.30
135,1,2015-06-02 11:21:13,2015-06-02 11:42:13,4,6.20,-73.989975,40.750702,1,N,-74.014153,40.712116,2,22.5,0.0,0.5,0.00,0.00,0.3,23.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9565,2,2015-06-01 00:10:46,2015-06-01 00:17:33,4,1.38,-73.986282,40.755833,1,N,-73.985001,40.768688,2,7.0,0.5,0.5,0.00,0.00,0.3,8.30
9642,1,2015-06-01 00:11:41,2015-06-01 00:36:27,4,6.70,-73.984970,40.736481,1,N,-73.947731,40.657990,1,23.5,0.5,0.5,4.95,0.00,0.3,29.75
9661,1,2015-06-01 00:11:17,2015-06-01 00:28:52,4,4.50,-73.955803,40.714054,1,N,-73.953705,40.670975,1,16.5,0.5,0.5,3.56,0.00,0.3,21.36
9744,1,2015-06-01 00:12:19,2015-06-01 00:19:43,4,1.50,-73.986771,40.721313,1,N,-73.983368,40.738445,2,7.5,0.5,0.5,0.00,0.00,0.3,8.80


In [10]:
# .loc -- boolean index for the rows, list of names for the columns

df.loc[
    df['passenger_count'] == 4,         # rows: use the comparison as a boolean index
    ['trip_distance', 'total_amount']   # columns: only these two
]

Unnamed: 0,trip_distance,total_amount
7,11.90,73.84
107,1.48,12.96
128,3.46,20.16
130,0.63,5.30
135,6.20,23.30
...,...,...
9565,1.38,8.30
9642,6.70,29.75
9661,4.50,21.36
9744,1.50,8.80
