# Agenda: Real-world

1. Recap and Q&A
2. More about CSV
3. Sorting with data
4. Grouping
5. Pivot tables
6. Joining
7. Cleaning


In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
df = DataFrame([[10, 20, 30, 40],
                [50, 60, 70, 80]],
               index=list('ab'),
               columns=list('wxyz'))

In [5]:
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80


In [9]:
# retrieve an entire row, use .loc

df.loc['b']     # this retrieves a row

w    50
x    60
y    70
z    80
Name: b, dtype: int64

In [10]:
df['x']    # this retrieves a column

a    20
b    60
Name: x, dtype: int64

In [11]:
df['y']   # so does this

a    30
b    70
Name: y, dtype: int64

In [12]:
# row selectors and column selectors with .loc

df.loc['b', 'y']  # row b and column y

70

In [13]:
# I want all of the elements from row b, columns x and y

df.loc['b', ['x', 'y']]

x    60
y    70
Name: b, dtype: int64

In [14]:
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80


In [15]:
df['w'] > 30

a    False
b     True
Name: w, dtype: bool

In [17]:
# wherever w is greater than 30,
# we want to see columns x and z

df.loc[
    df['w'] > 30,   # row selector -- a boolean series, which will act as a mask index
    ['x', 'z']       
    ]

Unnamed: 0,x,z
b,60,80


In [19]:
df = pd.read_csv('taxi.csv')
df.head()   # shows us the first five rows

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [21]:
df = pd.read_csv('taxi.csv', usecols=['passenger_count', 'trip_distance', 'total_amount'])
df.head()   # shows us the first five rows

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.8
1,1,0.46,8.3
2,1,0.87,11.0
3,1,2.13,17.16
4,1,1.4,10.3


In [23]:
# show me how much, on average, people paid and how far they traveled when the number of passengers was
# greater than 2

df.loc[
    df['passenger_count'] > 2,  # row selector 
    ['trip_distance', 'total_amount']  # column selector
].mean()

trip_distance     3.278653
total_amount     17.679939
dtype: float64

In [26]:
df = pd.read_csv('taxi.csv', header=None,
                usecols=[3, 4])  # the first row is *not* considered to be the headers (column names)

In [27]:
df.head()

Unnamed: 0,3,4
0,passenger_count,trip_distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13


In [29]:
df = pd.read_csv('taxi.csv', header=None,
                usecols=[3, 4],
                names=['count', 'distance'])  # the first row is *not* considered to be the headers (column names)

In [30]:
df

Unnamed: 0,count,distance
0,passenger_count,trip_distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13
...,...,...
9995,1,2.70
9996,1,4.50
9997,1,5.59
9998,6,1.54


In [31]:
# get all values in df other than those in row 0
df = df[1:]

In [32]:
df

Unnamed: 0,count,distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13
5,1,1.40
...,...,...
9995,1,2.70
9996,1,4.50
9997,1,5.59
9998,6,1.54


In [33]:
# what happens if I now ask for the mean values of these two columns?
df.mean()

  df.mean()


count    inf
dtype: float64

In [34]:
df.dtypes  # what dtypes do we have in these two columns

count       object
distance    object
dtype: object

In [36]:
df['count'] = df.loc['count'].astype(np.int64)
df['distance'] = df.loc['distance'].astype(np.float64)


KeyError: 'count'

In [37]:
df.head()

Unnamed: 0,count,distance
1,1,1.63
2,1,0.46
3,1,0.87
4,1,2.13
5,1,1.4


In [38]:
df.dtypes

count         int64
distance    float64
dtype: object

In [None]:
df = pd.rea