# Agenda

1. More on loading CSV files (especially a number of useful options / keyword arguments)
2. Reading Excel files
3. Reading from URLs
4. Scraping Web pages

In [1]:
import pandas as pd

filename = '../data/taxi.csv'  # relative path: up one directory, then into data/

# with no other arguments, read_csv takes the column names from the file's first line
df = pd.read_csv(filename)

# a bare expression on the last line of a cell is displayed as the cell's output
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [2]:
# read_csv has a huge number of options!

# to explore them, I'm going to create a very small CSV file
# we can always export a data frame using df.to_csv

In [3]:
import numpy as np
from pandas import Series, DataFrame

# Seed NumPy's global random generator, so that the "random" integers
# below are the same every time this cell is run
np.random.seed(0)

# 4 rows x 5 columns of integers drawn from 0 up to (but not including) 1000
df = DataFrame(np.random.randint(low=0, high=1000, size=(4, 5)))
df

Unnamed: 0,0,1,2,3,4
0,684,559,629,192,835
1,763,707,359,9,723
2,277,754,804,599,70
3,472,600,396,314,705


In [4]:
# Write the frame to mydata.csv without a header row.
# to_csv documents `header` as a bool (or a list of column aliases), so we
# pass False; None only happened to work because it is falsy.
# The index is still written, as the first field of every line.
df.to_csv('mydata.csv', header=False)

In [5]:
# ! at the start of a line means: run the rest of the line as a shell command
# (here `cat`, a Unix command that prints a file's contents)

!cat mydata.csv

0,684,559,629,192,835
1,763,707,359,9,723
2,277,754,804,599,70
3,472,600,396,314,705


In [6]:
# let's load our data file into a data frame, using read_csv's defaults
# (by default, the first line of the file is taken as the column names --
# even though our file's first line is actually data)

df = pd.read_csv('mydata.csv')

In [7]:
# display the result: the file's first line was used up as column names,
# leaving only three rows of data
df

Unnamed: 0,0,684,559,629,192,835
0,1,763,707,359,9,723
1,2,277,754,804,599,70
2,3,472,600,396,314,705


In [8]:
# if I want to tell Pandas *not* to read the first line as column names, but
# rather to treat it as data, just pass header=None
# (the columns are then simply numbered, starting at 0)

df = pd.read_csv('mydata.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,0,684,559,629,192,835
1,1,763,707,359,9,723
2,2,277,754,804,599,70
3,3,472,600,396,314,705


In [None]:
# if you pass header=n, where n is an integer, then Pandas will skip all lines
# before line n in the file (counting from 0), then take line n
# as the header names.



# Useful options for `read_csv`

1. `header` -- either an integer, indicating on what line the headers are located, *or* `None`, in which case no headers are in the file.
2. `usecols` -- a list of integers (column indexes) or strings (column names) that you want to include when you read the data
3. `names` -- pass a list of strings, the names that we want to give to the columns that we've read
4. `index_col` -- the name (or numeric index) of a column you want to use for the data frame's index
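5. `sep` -- a string, indicating what character(s) separate fields. A single character is taken literally. A longer string (other than `'\s+'`) is interpreted as a regular expression, so you *can* say "this or that" (e.g., `sep='[;,]'`) -- at the cost of Pandas switching to its slower Python parsing engine.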

In [9]:
# Another option: usecols, where we give a list of columns (either strings, if we have their names, or
# integers, referring to them by number starting at 0) that we want in the data frame

In [10]:
# read only the three named columns; the rest of the file's columns
# never make it into the data frame
pd.read_csv(
    '../data/taxi.csv',
    usecols=['passenger_count', 'trip_distance', 'total_amount'],
)

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.80
1,1,0.46,8.30
2,1,0.87,11.00
3,1,2.13,17.16
4,1,1.40,10.30
...,...,...,...
9994,1,2.70,12.30
9995,1,4.50,20.30
9996,1,5.59,22.30
9997,6,1.54,7.80


In [12]:
# mydata.csv has no header line, so we choose columns by position
# (position 0 holds the index values that to_csv wrote out)
pd.read_csv('mydata.csv', header=None, usecols=[1, 3, 4])

Unnamed: 0,1,3,4
0,684,629,192
1,763,359,9
2,277,804,599
3,472,396,314


In [13]:
# Column names can always be set after loading, by assigning a list
# (or a series) of strings to df.columns.
#
# The "names" keyword argument does the same thing at load time: the
# names are given, in order, to the columns that we read.

pd.read_csv(
    'mydata.csv',
    header=None,
    usecols=[1, 3, 4],
    names=['first', 'third', 'fourth'],
)

Unnamed: 0,first,third,fourth
0,684,629,192
1,763,359,9
2,277,804,599
3,472,396,314


In [15]:
# set_index can turn any column into the index once the data is loaded,
# but index_col does the same thing while the file is being read. It takes
# the name (or numeric index) of the column we want -- here,
# `tpep_pickup_datetime`.

df = pd.read_csv(
    '../data/taxi.csv',
    index_col='tpep_pickup_datetime',
    usecols=[
        'tpep_pickup_datetime',
        'passenger_count',
        'total_amount',
        'trip_distance',
    ],
)
df

Unnamed: 0_level_0,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-06-02 11:19:29,1,1.63,17.80
2015-06-02 11:19:30,1,0.46,8.30
2015-06-02 11:19:31,1,0.87,11.00
2015-06-02 11:19:31,1,2.13,17.16
2015-06-02 11:19:32,1,1.40,10.30
...,...,...,...
2015-06-01 00:12:59,1,2.70,12.30
2015-06-01 00:12:59,1,4.50,20.30
2015-06-01 00:13:00,1,5.59,22.30
2015-06-01 00:13:02,6,1.54,7.80


In [16]:
# the index values are not unique, so a single label can retrieve several rows
df.loc['2015-06-01 00:13:00']

Unnamed: 0_level_0,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-06-01 00:13:00,1,0.3,5.3
2015-06-01 00:13:00,1,2.5,10.8
2015-06-01 00:13:00,6,0.86,5.8
2015-06-01 00:13:00,1,4.91,23.79
2015-06-01 00:13:00,1,12.56,58.34
2015-06-01 00:13:00,1,5.59,22.3


In [None]:
# we can even pass the "dtype" keyword argument, giving it a dictionary as a value, where the keys are
# column names and the values are the dtypes we want to give those columns.



In [17]:
# peek at the start of a file whose fields are separated by ':' rather than ','
!head /etc/passwd

root:x:0:0:root:/root:/bin/bash
daemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin
bin:x:2:2:bin:/bin:/usr/sbin/nologin
sys:x:3:3:sys:/dev:/usr/sbin/nologin
sync:x:4:65534:sync:/bin:/bin/sync
games:x:5:60:games:/usr/games:/usr/sbin/nologin
man:x:6:12:man:/var/cache/man:/usr/sbin/nologin
lp:x:7:7:lp:/var/spool/lpd:/usr/sbin/nologin
mail:x:8:8:mail:/var/mail:/usr/sbin/nologin
news:x:9:9:news:/var/spool/news:/usr/sbin/nologin


In [22]:
# /etc/passwd separates its fields with ':' and has no header line,
# so we supply the column names ourselves and use the username as the index
pd.read_csv(
    '/etc/passwd',
    sep=':',
    header=None,
    names=[
        'username', 'x', 'userid', 'groupid',
        'userrealname', 'homedir', 'shell',
    ],
    index_col='username',
)

Unnamed: 0_level_0,x,userid,groupid,userrealname,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
root,x,0,0,root,/root,/bin/bash
daemon,x,1,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,x,2,2,bin,/bin,/usr/sbin/nologin
sys,x,3,3,sys,/dev,/usr/sbin/nologin
sync,x,4,65534,sync,/bin,/bin/sync
games,x,5,60,games,/usr/games,/usr/sbin/nologin
man,x,6,12,man,/var/cache/man,/usr/sbin/nologin
lp,x,7,7,lp,/var/spool/lpd,/usr/sbin/nologin
mail,x,8,8,mail,/var/mail,/usr/sbin/nologin
news,x,9,9,news,/var/spool/news,/usr/sbin/nologin


# Exercise: Airline data

There is a file on our system called `../data/airlines.dat`. Load this data into a data frame using `pd.read_csv`:

- We only want four columns: airline name (index 1), 2-character code (index 3), full name (index 5), and the country name (index 6)
- Name those columns
- Calculate/find out: What 10 countries have the most airlines? (Use `value_counts` to establish this.)
- Try this: Set `sep` to be the wrong thing. What sort of error/weird output do you get?

In [31]:
filename = '../data/airlines.dat'

# The file has no header line: keep four of its columns (chosen by
# position) and give them names of our own
df = pd.read_csv(
    filename,
    header=None,
    usecols=[1, 3, 5, 6],
    names=['name', '2code', 'longname', 'country'],
)
df

Unnamed: 0,name,2code,longname,country
0,Private flight,-,,
1,135 Airways,,GENERAL,United States
2,1Time Airline,1T,NEXTIME,South Africa
3,2 Sqn No 1 Elementary Flying Training School,,,United Kingdom
4,213 Flight Unit,,,Russia
...,...,...,...,...
6043,Vuela Cuba,6C,,Cuba
6044,All Australia,88,,Australia
6045,Fly Europa,ER,,Spain
6046,FlyPortugal,PO,FlyPortugal,Portugal


In [34]:
# how many airlines does each country have? value_counts puts the most
# common values first, so the top rows are the countries with the most airlines
(
    df['country']
    .value_counts()
    .head(15)
)

country
United States     1080
Mexico             439
United Kingdom     407
Canada             318
Russia             230
Spain              166
Germany            131
France             119
Australia           93
South Africa        91
Italy               90
Ukraine             89
Nigeria             85
Kazakhstan          79
Sweden              70
Name: count, dtype: int64

# Pandas can handle *many* types

If you want to see what formats you can read/write, it's often helpful in Jupyter to just write `pd.read_` and then press tab. You'll see all of the formats (some of which are more obvious than others).

In [36]:
# Excel: read_excel loads a spreadsheet file into a data frame

filename = '../data/titanic3.xls'

df = pd.read_excel(filename)
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [37]:
# what are the dtypes of the various columns?
# (we didn't specify any when loading, so these are the ones Pandas chose)
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [38]:
# what was the average age of people on the Titanic?
# (some ages are missing; mean skips missing values by default)

df['age'].mean()

29.8811345124283

In [42]:
# what was the average age of people on the Titanic who did not survive?

# .loc takes a row selector (here, a boolean series that is True where
# `survived` is 0) and then a column selector
df.loc[df['survived'] == 0, 'age'].mean()

30.54536882067851

In [43]:
# and the median age of everyone on board, to compare with the mean
df['age'].median()

28.0

In [44]:
# median age of the people who did not survive: the row selector picks the
# rows where `survived` is 0, and the column selector picks `age`
df.loc[df['survived'] == 0, 'age'].median()

28.0

# What argument can we give `read_*`?

All of the `read_*` methods in Pandas seem to take a filename as an argument. That's true, but that's only 1/3 of the truth. That first argument can be one of three things:

1. A string, containing a filename -- it can be absolute or relative
2. A file object, after running `open` on it. This can be convenient in some cases.
3. A string, containing a URL -- then Pandas downloads the file from that location and uses it to create the data frame

In [45]:
# a URL pointing at a CSV file of AAPL stock data
url = 'https://gist.githubusercontent.com/reuven/361d2c2b12dab426f4ed4efb396c89e5/raw/744dc0e9b193b53e3f76712cdfa32fa443440594/AAPL.csv'

# read_csv accepts a URL in place of a filename; Pandas downloads the file for us
pd.read_csv(url)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-04-09,169.880005,173.089996,169.850006,170.050003,170.050003,29017700
1,2018-04-10,173.0,174.0,171.529999,173.25,173.25,28408600
2,2018-04-11,172.229996,173.919998,171.699997,172.440002,172.440002,22431600
3,2018-04-12,173.410004,175.0,173.039993,174.139999,174.139999,22889300
4,2018-04-13,174.779999,175.839996,173.850006,174.729996,174.729996,25124300
5,2018-04-16,175.029999,176.190002,174.830002,175.820007,175.820007,21578400
6,2018-04-17,176.490005,178.940002,176.410004,178.240005,178.240005,26605400
7,2018-04-18,177.809998,178.820007,176.880005,177.839996,177.839996,20754500
8,2018-04-19,173.759995,175.389999,172.660004,172.800003,172.800003,34808800
9,2018-04-20,170.600006,171.220001,165.429993,165.720001,165.720001,65491100


In [50]:
# the usual read_csv options work with a URL, too: keep four of the
# columns, and use the dates as the index
df = pd.read_csv(
    url,
    index_col='Date',
    usecols=['Date', 'Open', 'Close', 'Volume'],
)
df

Unnamed: 0_level_0,Open,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-04-09,169.880005,170.050003,29017700
2018-04-10,173.0,173.25,28408600
2018-04-11,172.229996,172.440002,22431600
2018-04-12,173.410004,174.139999,22889300
2018-04-13,174.779999,174.729996,25124300
2018-04-16,175.029999,175.820007,21578400
2018-04-17,176.490005,178.240005,26605400
2018-04-18,177.809998,177.839996,20754500
2018-04-19,173.759995,172.800003,34808800
2018-04-20,170.600006,165.720001,65491100


In [51]:
# add a column with each day's change: closing price minus opening price
# (a negative value means the stock closed lower than it opened)
df['diff'] = df['Close'] - df['Open']

In [52]:
# display the frame again, now including the new `diff` column
df

Unnamed: 0_level_0,Open,Close,Volume,diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-04-09,169.880005,170.050003,29017700,0.169998
2018-04-10,173.0,173.25,28408600,0.25
2018-04-11,172.229996,172.440002,22431600,0.210006
2018-04-12,173.410004,174.139999,22889300,0.729995
2018-04-13,174.779999,174.729996,25124300,-0.050003
2018-04-16,175.029999,175.820007,21578400,0.790008
2018-04-17,176.490005,178.240005,26605400,1.75
2018-04-18,177.809998,177.839996,20754500,0.029998
2018-04-19,173.759995,172.800003,34808800,-0.959992
2018-04-20,170.600006,165.720001,65491100,-4.880005
