# Introduction to Pandas

In [13]:
import pandas as pd

### Reading File from disk

In [4]:
# Load the parks table and use its 'Park Code' column as the row index,
# so individual parks can later be looked up by code with .loc.
df = pd.read_csv('parks.csv', index_col=['Park Code'])

In [6]:
# Preview the first 3 rows to check that the file loaded as expected.
df.head(3)

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACAD,Acadia National Park,ME,47390,44.35,-68.21
ARCH,Arches National Park,UT,76519,38.68,-109.57
BADL,Badlands National Park,SD,242756,43.75,-102.5


## Indexing

### Index single row with iloc (using row number)

In [16]:
df.iloc[1] # row at integer position 1 (zero-based, so the second row)

Park Name    Arches National Park
State                          UT
Acres                       76519
Latitude                    38.68
Longitude                 -109.57
Name: ARCH, dtype: object

### Index single row with loc (using data frame index)

In [15]:
df.loc['ACAD'] # row whose index label (Park Code) is 'ACAD'

Park Name    Acadia National Park
State                          ME
Acres                       47390
Latitude                    44.35
Longitude                  -68.21
Name: ACAD, dtype: object

### Index multiple rows with iloc (using row number)

In [21]:
# A list of integer positions selects several rows; they come back in the
# order listed (here: third, second, then first row).
df.iloc[[2, 1, 0]]

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BADL,Badlands National Park,SD,242756,43.75,-102.5
ARCH,Arches National Park,UT,76519,38.68,-109.57
ACAD,Acadia National Park,ME,47390,44.35,-68.21


### Index multiple rows with loc (using data frame index)

In [22]:
# A list of index labels selects several rows; they come back in the
# order listed.
df.loc[['BADL', 'ARCH', 'ACAD']]

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BADL,Badlands National Park,SD,242756,43.75,-102.5
ARCH,Arches National Park,UT,76519,38.68,-109.57
ACAD,Acadia National Park,ME,47390,44.35,-68.21


In [28]:
# Plain slices on a DataFrame select rows by position (the end is exclusive).
# Only the last expression in the cell is displayed.
df[:2] # first 2 rows (positions 0 and 1)
df[2:] # all rows from position 2 onwards
df[3:6] # rows at positions 3, 4 and 5

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BIBE,Big Bend National Park,TX,801163,29.25,-103.25
BISC,Biscayne National Park,FL,172924,25.65,-80.08
BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72


### Indexing columns

In [33]:
# Two equivalent ways to select the 'State' column as a Series; only the
# last expression in the cell is displayed.
df['State'].head(5) # bracket notation
df.State.head(5) # attribute notation

Park Code
ACAD    ME
ARCH    UT
BADL    SD
BIBE    TX
BISC    FL
Name: State, dtype: object

In [34]:
# Inspect the current column labels.
df.columns

Index(['Park Name', 'State', 'Acres', 'Latitude', 'Longitude'], dtype='object')

In [36]:
# Normalise the column labels to snake_case: spaces become underscores and
# everything is lower-cased, so columns can be reached as attributes
# (e.g. df.park_name).
df.columns = df.columns.str.replace(' ', '_', regex=False).str.lower()

In [37]:
# Confirm the renamed column labels.
df.columns

Index(['park_name', 'state', 'acres', 'latitude', 'longitude'], dtype='object')

### Indexing Columns and Rows

In [39]:
# Pick two columns by name, then keep the first three rows of the result.
df[['state', 'acres']].head(3)

Unnamed: 0_level_0,state,acres
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1
ACAD,ME,47390
ARCH,UT,76519
BADL,SD,242756


### Selecting a subset of data

In [42]:
# Comparing a column with a value yields a boolean Series with one
# True/False entry per row.
(df.state == 'UT').head(3)

Park Code
ACAD    False
ARCH     True
BADL    False
Name: state, dtype: bool

In [45]:
# Indexing with a boolean Series keeps only the rows where it is True.
df[df.state == 'UT']

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARCH,Arches National Park,UT,76519,38.68,-109.57
BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
CANY,Canyonlands National Park,UT,337598,38.2,-109.93
CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
ZION,Zion National Park,UT,146598,37.3,-113.05


#### `~` replaces `not`

#### `|` replaces `or`

#### `&` replaces `and`

In [46]:
# Combine boolean masks element-wise with | (or). Each comparison needs its
# own parentheses because | binds more tightly than > in Python.
df[(df.latitude > 60) | (df.acres > 10**6)].head(3)

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DENA,Denali National Park and Preserve,AK,3372402,63.33,-150.5
DEVA,Death Valley National Park,"CA, NV",4740912,36.24,-116.82
EVER,Everglades National Park,FL,1508538,25.32,-80.93


### Key Companion Methods: isin

In [48]:
# isin() keeps rows whose state is exactly one of the listed values, so a
# combined entry such as "CA, NV" is not matched.
df[df.state.isin(['WA', 'OR', 'CA'])].head()

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CHIS,Channel Islands National Park,CA,249561,34.01,-119.42
CRLA,Crater Lake National Park,OR,183224,42.94,-122.1
JOTR,Joshua Tree National Park,CA,789745,33.79,-115.9
LAVO,Lassen Volcanic National Park,CA,106372,40.49,-121.51
MORA,Mount Rainier National Park,WA,235625,46.85,-121.75


### Describe command

In [49]:
# Summary statistics (count, mean, std, min, quartiles, max); the output
# covers the numeric columns acres, latitude and longitude.
df.describe()

Unnamed: 0,acres,latitude,longitude
count,56.0,56.0,56.0
mean,927929.1,41.233929,-113.234821
std,1709258.0,10.908831,22.440287
min,5550.0,19.38,-159.28
25%,69010.5,35.5275,-121.57
50%,238764.5,38.55,-110.985
75%,817360.2,46.88,-103.4
max,8323148.0,67.78,-68.21
