# Agenda

1. More on loading CSV files (especially a number of useful options / keyword arguments)
2. Reading Excel files
3. Reading from URLs
4. Scraping Web pages

In [1]:
import pandas as pd

filename = '../data/taxi.csv'  # relative path, resolved against the current working directory

# Called with only a filename, read_csv takes the column names from the first line of the file
df = pd.read_csv(filename)

# A bare expression on a cell's last line is displayed as the cell's output
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [2]:
# read_csv has a huge number of options!

# to explore them, I'm going to create a very small CSV file
# we can always export a data frame using df.to_csv

In [3]:
import numpy as np
from pandas import Series, DataFrame

# Seed NumPy's global generator so the same "random" values come out on every run
np.random.seed(0)

# 4 rows x 5 columns of integers, each at least 0 and below 1000
df = DataFrame(np.random.randint(low=0, high=1000, size=(4, 5)))
df

Unnamed: 0,0,1,2,3,4
0,684,559,629,192,835
1,763,707,359,9,723
2,277,754,804,599,70
3,472,600,396,314,705


In [4]:
# header=False: write only the data rows, not the line of column names.
# The index is still written, as the first field of every line.
df.to_csv('mydata.csv', header=False)

In [5]:
# A leading ! tells Jupyter/IPython to run the rest of the line as a shell command
# rather than as Python. `cat` is a Unix command that prints a file's contents.

!cat mydata.csv

0,684,559,629,192,835
1,763,707,359,9,723
2,277,754,804,599,70
3,472,600,396,314,705


In [6]:
# let's load our data file into a data frame
# (no header option is passed here, so the file's first line is used as the column names)

df = pd.read_csv('mydata.csv')

In [7]:
# the file's first line (0,684,559,...) was taken as the column names, so only 3 data rows remain
df

Unnamed: 0,0,684,559,629,192,835
0,1,763,707,359,9,723
1,2,277,754,804,599,70
2,3,472,600,396,314,705


In [8]:
# if I want to tell Pandas *not* to read the first line as column names, but
# rather to treat it as data, just pass header=None
# (the columns are then labeled with integers, starting at 0)

df = pd.read_csv('mydata.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,0,684,559,629,192,835
1,1,763,707,359,9,723
2,2,277,754,804,599,70
3,3,472,600,396,314,705


In [None]:
# if you pass header=n, where n is an integer (counting from 0), then Pandas will
# ignore all lines before line n in the file, then take line n
# as the header names. The data starts on the line after that.



# Useful options for `read_csv`

1. `header` -- either an integer (counting from 0), indicating on what line the headers are located, *or* `None`, in which case no headers are in the file.
2. `usecols` -- a list of integers (column indexes) or strings (column names) that you want to include when you read the data

In [9]:
# Another option: usecols, where we give a list of columns (either strings, if we have their names, or
# integers, referring to them by number starting at 0) that we want in the data frame

In [10]:
# usecols given as strings: keep only the named columns from the file
pd.read_csv(
    '../data/taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
)

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.80
1,1,0.46,8.30
2,1,0.87,11.00
3,1,2.13,17.16
4,1,1.40,10.30
...,...,...,...
9994,1,2.70,12.30
9995,1,4.50,20.30
9996,1,5.59,22.30
9997,6,1.54,7.80


In [12]:
# usecols given as integers: columns are chosen by position (counting from 0),
# which is how to select them when the file has no header row
pd.read_csv(
    'mydata.csv',
    header=None,
    usecols=[1, 3, 4],
)

Unnamed: 0,1,3,4
0,684,629,192
1,763,359,9
2,277,804,599
3,472,396,314


In [13]:
# Column names can be set after loading, by assigning a list (or series) of strings to df.columns.
# Passing the "names" keyword argument does the same thing at load time;
# here the three names are applied to the three columns selected by usecols.

pd.read_csv(
    'mydata.csv',
    header=None,
    usecols=[1, 3, 4],
    names=['first', 'third', 'fourth'],
)

Unnamed: 0,first,third,fourth
0,684,629,192
1,763,359,9
2,277,804,599
3,472,396,314
