# pandas

In [2]:
import pandas as pd
import numpy as np

## Series

In [2]:
s = pd.Series([4, 7, 5, 3])
s

0    4
1    7
2    5
3    3
dtype: int64

In [3]:
# the data

s.values

array([4, 7, 5, 3])

### the labels (index)

In [4]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# specifying index

s = pd.Series([4, -7, 5, 3], index=['apple', 'banana', 'cat', 'dog'])
s

apple     4
banana   -7
cat       5
dog       3
dtype: int64

In [6]:
s.index

Index(['apple', 'banana', 'cat', 'dog'], dtype='object')

In [7]:
s.values

array([ 4, -7,  5,  3])

In [8]:
# access element using index

s['banana']

-7

In [9]:
# access multiple elements

s[['banana', 'apple']]

banana   -7
apple     4
dtype: int64

In [10]:
# Applying operation on Series

s10 = s * 10
s10

apple     40
banana   -70
cat       50
dog       30
dtype: int64

In [11]:
# Selecting elements

s[ s > 0]

apple    4
cat      5
dog      3
dtype: int64

In [12]:
# check membership

'cat' in s

True

In [13]:
# creating a Series from a Python dictionary

s = pd.Series({'Ohio': 35000, 'Texas': 71000, 'Oregan': 16000})
s

Ohio      35000
Oregan    16000
Texas     71000
dtype: int64

### Automatic alignment

In [14]:
state1 = ['Ohio', 'Texas', 'Oregon']
s1 = pd.Series([100, 200, 300], index=state1)
s1

Ohio      100
Texas     200
Oregon    300
dtype: int64

In [15]:
state2 = ['Oregon', 'Ohio', 'California']
s2 = pd.Series([1000, 2000, 3000], index=state2)
s2

Oregon        1000
Ohio          2000
California    3000
dtype: int64

In [16]:
s1 + s2

California       NaN
Ohio          2100.0
Oregon        1300.0
Texas            NaN
dtype: float64

### Naming the series and the index

In [17]:
s1.name = 'Population'
s1.index.name = 'state'
s1

state
Ohio      100
Texas     200
Oregon    300
Name: Population, dtype: int64

# DataFrame

* A tabular, spreadsheet-like structure containing columns of different types
* Row and column indices

In [18]:
# specify each column separately

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002],
        'pop'  : [1.5, 1.8, 3.4, 2.3, 2.8] }
df = pd.DataFrame(data)
df

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.8,Ohio,2001
2,3.4,Ohio,2002
3,2.3,Nevada,2001
4,2.8,Nevada,2002


In [19]:
# specify columns -- compare with the dataframe above

df = pd.DataFrame(data, columns=['year', 'state', 'pop'])
df

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.8
2,2002,Ohio,3.4
3,2001,Nevada,2.3
4,2002,Nevada,2.8


In [20]:
df = pd.DataFrame(data, columns=['year', 'state', 'popXXX'])
df

Unnamed: 0,year,state,popXXX
0,2000,Ohio,
1,2001,Ohio,
2,2002,Ohio,
3,2001,Nevada,
4,2002,Nevada,


In [95]:
df = pd.DataFrame(data, columns=['year', 'state', 'pop'], 
                  index=['one', 'two', 'three', 'four', 'five'])
df

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.8
three,2002,Ohio,3.4
four,2001,Nevada,2.3
five,2002,Nevada,2.8


In [22]:
df.columns

Index(['year', 'state', 'pop'], dtype='object')

In [23]:
df.index

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

## add a new column on some rows

In [96]:
val = pd.Series([-1, -2, -3], index=['two', 'three', 'five'])
#val = pd.Series([-1, -2, -3])
df['debt'] = val
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.8,-1.0
three,2002,Ohio,3.4,-2.0
four,2001,Nevada,2.3,
five,2002,Nevada,2.8,-3.0


## Modify column values

In [25]:
df['debt'] = -10
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,-10
two,2001,Ohio,1.8,-10
three,2002,Ohio,3.4,-10
four,2001,Nevada,2.3,-10
five,2002,Nevada,2.8,-10


In [26]:
df['debt'] = pd.Series([-5,-4,-3,-2,-1], index=['one','two','three','four','five'])
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,-5
two,2001,Ohio,1.8,-4
three,2002,Ohio,3.4,-3
four,2001,Nevada,2.3,-2
five,2002,Nevada,2.8,-1


## Delete a column

In [27]:
del df['debt']
df

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.8
three,2002,Ohio,3.4
four,2001,Nevada,2.3
five,2002,Nevada,2.8


## Nested dictionaries

The keys are the columns. The inner keys are the row indices.

In [28]:
d = {'California': {2001: 3.5, 2002: 4.6},
     'Oregon': {2000: 1.4, 2001: 1.3}}
df = pd.DataFrame(d)
df

Unnamed: 0,California,Oregon
2000,,1.4
2001,3.5,1.3
2002,4.6,


## Transpose the table

In [29]:
df.T

Unnamed: 0,2000,2001,2002
California,,3.5,4.6
Oregon,1.4,1.3,


## .values and .columns of a dataframe

In [30]:
df.values

array([[ nan,  1.4],
       [ 3.5,  1.3],
       [ 4.6,  nan]])

In [31]:
df.columns

Index(['California', 'Oregon'], dtype='object')

In [32]:
df.index.name = 'year'
df.columns.name = 'state'
df

state,California,Oregon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.4
2001,3.5,1.3
2002,4.6,


# Reindexing Series

In [98]:
s = pd.Series([3,5,7,9], index=['c', 'b', 'a', 'd'])
s


c    3
b    5
a    7
d    9
dtype: int64

In [99]:
s.reindex(['a', 'b', 'c', 'd', 'e'])

a    7.0
b    5.0
c    3.0
d    9.0
e    NaN
dtype: float64

In [100]:
s.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0.)

a    7.0
b    5.0
c    3.0
d    9.0
e    0.0
dtype: float64

# Reindexing DataFrames

In [36]:
df = pd.DataFrame({'x': [0,3,6], 'y': [1,4,7], 'z':[2,5,8]}, index=['a','b','c'])
df

Unnamed: 0,x,y,z
a,0,1,2
b,3,4,5
c,6,7,8


## Reindexing rows

In [37]:
df.reindex(['c','a', 'b'])

Unnamed: 0,x,y,z
c,6,7,8
a,0,1,2
b,3,4,5


## Reindexing columns

In [38]:
df.reindex(columns=['z', 'x', 'y'])

Unnamed: 0,z,x,y
a,2,0,1
b,5,3,4
c,8,6,7


## Drop values from Series

In [39]:
s = pd.Series([3,5,7,9], index=['c', 'b', 'a', 'd'])
s

c    3
b    5
a    7
d    9
dtype: int64

In [40]:
s.drop(['a', 'd'])

c    3
b    5
dtype: int64

In [41]:
s2 = s.drop(['a', 'd'])
s2

c    3
b    5
dtype: int64

## Drop values from DataFrame

In [102]:
df = pd.DataFrame({'one': [8, 10], 'two': [9, 13], 'three': [11,15]}, index=['a','b'])
df

Unnamed: 0,one,three,two
a,8,11,9
b,10,15,13


In [103]:
df.drop(['b'], axis=0)

Unnamed: 0,one,three,two
a,8,11,9


In [105]:
df.drop(['one', 'three'], axis=1)

Unnamed: 0,two
a,9
b,13


# Indexing for Series

In [45]:
s = pd.Series([1,3,5,7,9], index=['a','b','c','d','e'])
s

a    1
b    3
c    5
d    7
e    9
dtype: int64

In [46]:
s[0:3]

a    1
b    3
c    5
dtype: int64

In [47]:
s[['a','c']]

a    1
c    5
dtype: int64

# Indexing for DataFrame

In [48]:
df = pd.DataFrame({'one':[1,2,3,4], 'two':[5,6,7,8]}, index=['a','b','c','d'])
df

Unnamed: 0,one,two
a,1,5
b,2,6
c,3,7
d,4,8


In [49]:
# select a column

df['two']

a    5
b    6
c    7
d    8
Name: two, dtype: int64

In [50]:
# select a row with index 'c'

df.loc['c']

one    3
two    7
Name: c, dtype: int64

In [51]:
# select a row with positional index "2"
df.iloc[2]

one    3
two    7
Name: c, dtype: int64

In [52]:
df.loc['c']['two']

7

# Index sorting

In [53]:
# Series

s = pd.Series([2,4,1,3], index=['b','d','a','c'])
s

b    2
d    4
a    1
c    3
dtype: int64

In [54]:
s.sort_index()

a    1
b    2
c    3
d    4
dtype: int64

In [55]:
# data frame

df = pd.DataFrame([[4,5,6,7],[0,1,2,3]],columns=['b','c','a','d'], index=['three', 'one'])
df

Unnamed: 0,b,c,a,d
three,4,5,6,7
one,0,1,2,3


In [56]:
df.sort_index(axis=0)

Unnamed: 0,b,c,a,d
one,0,1,2,3
three,4,5,6,7


In [57]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,6,4,5,7
one,2,0,1,3


# Summarizing statistics

In [58]:
df = pd.DataFrame([[1,2],[5,6],[3,6],[7,9]], columns=['one', 'two'], index=['a','b','c','d'])
df

Unnamed: 0,one,two
a,1,2
b,5,6
c,3,6
d,7,9


In [59]:
# column sum

df.sum()

one    16
two    23
dtype: int64

In [60]:
# row sum

df.sum(axis=1)

a     3
b    11
c     9
d    16
dtype: int64

In [61]:
# describe

df.describe()

Unnamed: 0,one,two
count,4.0,4.0
mean,4.0,5.75
std,2.581989,2.872281
min,1.0,2.0
25%,2.5,5.0
50%,4.0,6.0
75%,5.5,6.75
max,7.0,9.0


In [109]:
from numpy import nan as NA
s = pd.Series([1,5,3,6,7,8,4,NA])
s

0    1.0
1    5.0
2    3.0
3    6.0
4    7.0
5    8.0
6    4.0
7    NaN
dtype: float64

In [111]:
s.describe()

count    7.000000
mean     4.857143
std      2.410295
min      1.000000
25%      3.500000
50%      5.000000
75%      6.500000
max      8.000000
dtype: float64

In [64]:
s.describe()['mean']

4.8571428571428568

# Filtering out missing values

## Series

In [3]:
s1 = pd.Series([1,2,3], index=['b','c','a'])
s2 = pd.Series([4,2,5], index=['a','b','d'])
s3 = s1 + s2
s3

a    7.0
b    3.0
c    NaN
d    NaN
dtype: float64

In [4]:
# remove NaN

s3.dropna()

a    7.0
b    3.0
dtype: float64

## DataFrame

In [5]:
from numpy import nan as NA
df = pd.DataFrame([[1, 4, 3],[1,NA,NA],[NA,NA,NA],[NA,6.5, 4.]])
df

Unnamed: 0,0,1,2
0,1.0,4.0,3.0
1,1.0,,
2,,,
3,,6.5,4.0


In [6]:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,4.0,3.0


In [7]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,4.0,3.0
1,1.0,,
3,,6.5,4.0


# Missing values

In [70]:
df

Unnamed: 0,0,1,2
0,1.0,4.0,3.0
1,1.0,,
2,,,
3,,6.5,4.0


In [71]:
# fill the NaN with zeros

df.fillna(0)

Unnamed: 0,0,1,2
0,1.0,4.0,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,4.0


In [72]:
# specify what value to fill in for each column

df.fillna({0: -1, 1: -10, 2: -20})

Unnamed: 0,0,1,2
0,1.0,4.0,3.0
1,1.0,-10.0,-20.0
2,-1.0,-10.0,-20.0
3,-1.0,6.5,4.0


# Hierarchical Indexing

## Series

In [73]:
s = pd.Series(np.random.randn(10), index=[['a','a','a','b','b','b','c','c','d','d'],
                                         [1,2,3,1,2,3,1,2,2,3]])
s

a  1   -0.555228
   2   -0.813057
   3   -0.411028
b  1    1.275525
   2   -1.413706
   3   -0.916066
c  1    0.453247
   2    0.591491
d  2   -0.545720
   3    0.288843
dtype: float64

In [74]:
s['b']

1    1.275525
2   -1.413706
3   -0.916066
dtype: float64

In [75]:
s[['b','d']]

b  1    1.275525
   2   -1.413706
   3   -0.916066
d  2   -0.545720
   3    0.288843
dtype: float64

In [76]:
# convert to a dataframe

s.unstack()

Unnamed: 0,1,2,3
a,-0.555228,-0.813057,-0.411028
b,1.275525,-1.413706,-0.916066
c,0.453247,0.591491,
d,,-0.54572,0.288843


In [77]:
s.unstack().stack()

a  1   -0.555228
   2   -0.813057
   3   -0.411028
b  1    1.275525
   2   -1.413706
   3   -0.916066
c  1    0.453247
   2    0.591491
d  2   -0.545720
   3    0.288843
dtype: float64

## DataFrame

Each axis can have a hierarchical index

In [78]:
df = pd.DataFrame(np.arange(12).reshape((4,3)), 
                    index=[['a','a','b','b'],[1,1,2,2,]],
                    columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,1,3,4,5
b,2,6,7,8
b,2,9,10,11


In [79]:
df.index.names = ['key1', 'key2']
df.columns.names = ['state', 'color']
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,1,3,4,5
b,2,6,7,8
b,2,9,10,11


In [80]:
df['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,1,3,4
b,2,6,7
b,2,9,10


In [81]:
df.sum()

state     color
Ohio      Green    18
          Red      22
Colorado  Green    26
dtype: int64

In [82]:
df.sum(level='key1')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [83]:
df.describe()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
count,4.0,4.0,4.0
mean,4.5,5.5,6.5
std,3.872983,3.872983,3.872983
min,0.0,1.0,2.0
25%,2.25,3.25,4.25
50%,4.5,5.5,6.5
75%,6.75,7.75,8.75
max,9.0,10.0,11.0


# Loading Files

a.csv:
```   
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
```

In [84]:
df = pd.read_csv('data/a.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


b.csv

```
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
```

In [85]:
# csv without a header

df = pd.read_csv('data/b.csv')   # not good
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [86]:
df = pd.read_csv('data/b.csv', header=None)
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [87]:
df = pd.read_csv('data/b.csv', header=None, names=['a','b','c','d','e'])
df

Unnamed: 0,a,b,c,d,e
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [88]:
# use one of the columns to be row index

df = pd.read_csv('data/b.csv', header=None, names=['a','b','c','d','e'],
                index_col='e')
df

Unnamed: 0_level_0,a,b,c,d
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


c.csv
```
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
```

In [89]:
# pass multiple index_col to create multi-index data frame

df = pd.read_csv('data/c.csv', index_col=['key1','key2'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


# Other file formats

* JSON
* XML
* HTML
* Excel
* HDF5
* mySQL

# Groupby operation

In [112]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,-1.118751,-0.836633,a,one
1,0.203812,1.066399,a,two
2,-0.529777,1.404983,b,one
3,1.058585,0.056618,b,two
4,-0.07781,0.644145,a,one


In [114]:
g = df['data1'].groupby(df['key1'])
g

<pandas.core.groupby.SeriesGroupBy object at 0x114643240>

In [92]:
g.mean()

key1
a   -0.505419
b    0.323467
Name: data1, dtype: float64