In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

Series 
- 1-dim array-like obj, containing array of data
- any numpy data type
- associated array of data labels = index
Series is formed from only an array of data

In [4]:
obj = Series([4,7,-5,3])

In [5]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

string representation of a Series
- index on the left : default 0 ~ N-1
- values on the right
Array representation & index object of the Series
- get this via its values and index attributes


In [6]:
obj.values

array([ 4,  7, -5,  3])

In [7]:
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [8]:
obj2 = Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a']

-5

In [12]:
obj2.a

-5

In [14]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [20]:
bool1, bool2 = 'b' in obj2, 'e' in obj2
print(bool1, bool2)

True False


In [24]:
#numpy array operations:: 
#:: filtering with boolean array, scalar mul, applying math func
obj2[obj2 > 0] #show values that satisfy condition only
obj2 * 2       #multiply 2 to all values

d     8
b    14
a   -10
c     6
dtype: int64

In [26]:
import numpy as np

In [27]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [28]:
#Python dict data type, create a Series from it by passing the dict
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah':5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [35]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)  
#index in the resulting Series will have the dict's keys in sorted order
#alphabetical order by dict keys

obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [38]:
#NA for California -- missing or NAvalue in pandas
#isnull #notnull functions in pandas
pd.isnull(obj4)
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [40]:
#Series ADVANTAGE: auto-align differently indexed data in arith ops
#Data Alignment (details later...)
obj3 + obj4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [54]:
#Series object itself & its index have 'name' attribute
#integrate with other key areas of pandas functionality
obj4.name = 'population'
obj4.index.name = 'state'
obj4.index = ['CA', 'OH', 'OR', 'TX']
obj4[obj4[obj4.index] > 1000] 
#index gets each index, obj4 [~ ] is population value at each index
#if that value is not larger than 1000, exclude from obj4 and show

OH    35000
OR    16000
TX    71000
Name: population, dtype: float64

## DataFrame

tabular, spreadsheet-like data structure
ordered collection of columns
different value type (numeric, string, boolean, etc.)
DataFrame - row, column index :: think of it as a dict of Series

In [55]:
#ways to create DataFrame
#dict of equal-length lists or numpy arrays
data = {'state':['California', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year':[2000, 2001, 2002, 2001, 2002],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [56]:
frame
#index assigned automatically like Series, 
#columns are placed in sorted order p - s - y

Unnamed: 0,pop,state,year
0,1.5,California,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [58]:
#reorder columns sequence
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,California,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [59]:
#pass a column that isn't contained in data? -- NA values
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], 
                  index=['one', 'two', 'three', 'four', 'five'])

In [60]:
frame2
#index == row... 
#debt has no values at the moment

Unnamed: 0,year,state,pop,debt
one,2000,California,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [61]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [62]:
#retrieving 'column' values by dict-like notation or by attribute
#can't apply this to index one two three...
frame2['state']

one      California
two            Ohio
three          Ohio
four         Nevada
five         Nevada
Name: state, dtype: object

In [63]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [69]:
#Retrieving rows data
frame2.ix['three']
frame2.loc['two']

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: two, dtype: object

In [70]:
###Columns can be modified by assignment. 
#debt column could be assigned a scalar value or an array of values
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,California,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [71]:
frame2['debt'] = np.arange(5.)

In [72]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,California,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [80]:
# When assigning lists / arrays to a column, the value's length
## must match the length of the DataFrame
### If you assign Series, it will be instead conformed exactly 
#### to the DataFrame's index, inserting missing values in any holes
val = Series([-1.2, -1.5, -1.7], index=['two','four', 'five'])
frame2['debt'] = val

frame2
#you will see NaN values in unindicated indices, bc len(frame2) is 5

Unnamed: 0,year,state,pop,debt
one,2000,California,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [83]:
#assign a non-exisiting column? --> it will create a new one
#del will delete columns
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,California,1.5,,False
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [84]:
del frame2['eastern']

In [85]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,California,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [86]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [95]:
#create a common form of data: nested dict of dicts format
pop = {'Nevada': {2001:2.4, 2002: 2.9},
      'Ohio': {2001: 1.7, 2002: 3.6},
      'California': {2000: 1.5}}

In [97]:
#outer dict keys ==> columns
#inner keys ==> row indices
frame3 = DataFrame(pop)
frame3

Unnamed: 0,California,Nevada,Ohio
2000,1.5,,
2001,,2.4,1.7
2002,,2.9,3.6


In [94]:
frame3.T 
#transpose!

Unnamed: 0,2000,2001,2002
California,1.5,,
Nevada,,2.4,2.9
Ohio,,1.7,3.6


In [99]:
#inner dict keys are unioned and sorted to form the index
#but can be undone if explicit index is specified
DataFrame(pop, index=[2000, 2001, 2002, 2003])

Unnamed: 0,California,Nevada,Ohio
2000,1.5,,
2001,,2.4,1.7
2002,,2.9,3.6
2003,,,


In [119]:
#dict of series are pretty similar...
pdata = {'Ohio':frame3['Ohio'][:-1],
        'Nevada':frame3['Nevada'][:2],
        'California':frame3['California'][:2]}
frame3['Ohio'][:-1]
frame3['Nevada'][:2]
DataFrame(pdata)


Unnamed: 0_level_0,California,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1.5,,
2001,,2.4,1.7


In [120]:
#DataFrame Constructors : index, columns have 'name' attributes
frame3.index.name = 'year' 
frame3.columns.name ='state'
frame3

state,California,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1.5,,
2001,,2.4,1.7
2002,,2.9,3.6


In [121]:
#attribute: VALUES
# returns data that are contained in the DF as a 2D ndarray
frame3.values

array([[ 1.5,  nan,  nan],
       [ nan,  2.4,  1.7],
       [ nan,  2.9,  3.6]])

In [122]:
frame2.values

array([[2000, 'California', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

## Dict of Series == each value becomes COLUMN
## Dict of Dicts == each inner dict keys - row index, outer dict keys - column
---------------------------------------------------------------

## Index Objects

In [123]:
#pandas index objects :: hold the axis labels, other metadata
#any array or other sequence of labels used when constructing 
#a Series or DataFrame, is internally converted to an INDEX
obj = Series(range(3), index=['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [127]:
obj.index[1:]
index[:-1]

Index(['a', 'b'], dtype='object')

In [129]:
#index objects are immutable, can't be modified
#index[1] = 'd'

In [165]:
index = pd.Index(np.arange(3))  #nums become index
obj2 = Series([1.5, -2.5, 0], index=index)
obj2.index is index


0    1.5
1   -2.5
2    0.0
dtype: float64

In [138]:
frame3

state,California,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1.5,,
2001,,2.4,1.7
2002,,2.9,3.6


In [139]:
'Ohio' in frame3.columns

True

In [140]:
2003 in frame3.index

False

## Essential Functionality

fundamental mechanics of interacting with the data 

In [142]:
#Reindexing - pandas objects : reindex
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [198]:
obj.reindex(['a', 'b', 'c', 'd', 'e'])

Index(['d', 'b', 'a', 'c'], dtype='object')

In [144]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [148]:
#time series data
#would prefer interpolation or filling of values when reindexing 
#method option! ffill : forward fills the values
obj3 = Series(['blue', 'purple', 'yellow'], index = [0,2,4])
obj3
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [173]:
#reindex :: can alter row index/columns/or both.
#just a sequence, the rows are reindexed in the result
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'],
                 columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [183]:
#reindex columns
states = ['Texas', 'Utah', 'California']
frame2.reindex(columns = states)
#frame2['California'] = DataFrame([2,10,5,9], index=['a', 'b', 'c', 'd'])
#frame2['California'] = 5
frame2

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [188]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'], method='ffill', columns=states)
frame2

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [189]:
#frame2['Texas'] = {'a':4, 'b':5, 'c':5, 'd':10}
frame.ix[['a', 'b', 'c', 'd']]
frame.ix[['a', 'b', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [191]:
frame2['Utah']

a   NaN
b   NaN
c   NaN
d   NaN
Name: Utah, dtype: float64

Reindex function arguments
- index
- method
- fill_value
- limit
- level
- copy

## skip Dropping entries p.125
## Indexing, Selection, Filtering

In [200]:
for i in range(len(frame2.columns)):
    print(frame2.columns[i])

Texas
Utah
California


In [203]:
#Indexing into a DataFr
dsame is for retrieving one or more columns
# either with a single value or sequence
frame2['California']
frame2[['Texas', 'Utah']]
# cannot do this with index/row names

Unnamed: 0,Texas,Utah
a,1,
b,1,
c,4,
d,7,


In [207]:
frame2.Utah = [1,2,3,4]
frame2

Unnamed: 0,Texas,Utah,California
a,1,1,2
b,1,2,2
c,4,3,5
d,7,4,8


In [210]:
f2 = frame2.T

In [212]:
f2[:2]
#until 2nd row

Unnamed: 0,a,b,c,d
Texas,1,1,4,7
Utah,1,2,3,4


In [216]:
f2[f2['b']>1]
#print the rows of which b values are larger than 1

Unnamed: 0,a,b,c,d
Utah,1,2,3,4
California,2,2,5,8


In [225]:
f2['a'] #single column
f2.ix['Utah'] #single row
f2.ix[:, 'b']  #single column values for all rows
f2.ix['Utah', 'b'] #select row, column

2

## Function Application and Mapping

# Apply

In [227]:
#NumPy ufuncs (element-wise array methods) work fine with pandas objects
rand_array=np.random.randn(4,3)

In [229]:
frame = DataFrame(rand_array, columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.577791,-0.282367,-0.58104
Ohio,1.439579,-0.500835,0.306604
Texas,1.380904,-0.0215,0.769284
Oregon,0.95562,-0.006982,-0.277047


In [230]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.577791,0.282367,0.58104
Ohio,1.439579,0.500835,0.306604
Texas,1.380904,0.0215,0.769284
Oregon,0.95562,0.006982,0.277047


## Applying a function on 1 dim arrays to each col or row
### apply method

In [238]:
f = lambda x: x.max() - x.min()
frame.apply(f, axis = 1 )
frame['max_min'] = frame.apply(f, axis = 1)
frame
#axis = 0 by default : row
#axis = 1 manually: column

Unnamed: 0,b,d,e,max_min
Utah,0.577791,-0.282367,-0.58104,1.73987
Ohio,1.439579,-0.500835,0.306604,2.441249
Texas,1.380904,-0.0215,0.769284,1.423904
Oregon,0.95562,-0.006982,-0.277047,1.509714


In [239]:
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e,max_min
min,0.577791,-0.500835,-0.58104,1.423904
max,1.439579,-0.006982,0.769284,2.441249


### Element-wise Python functions can be used, too

In [241]:
format = lambda x: '%.2f' % x
frame.applymap(format)
#apply-map because Series has a map method for applying to every element

Unnamed: 0,b,d,e,max_min
Utah,0.58,-0.28,-0.58,1.74
Ohio,1.44,-0.5,0.31,2.44
Texas,1.38,-0.02,0.77,1.42
Oregon,0.96,-0.01,-0.28,1.51
