In [24]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## Pandas Series objects

In [4]:
obj = Series(range(3), index = ['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [5]:
obj.values

array([0, 1, 2])

In [6]:
obj['a']

0

In [7]:
'''boolean filtering similar to numpy'''
obj[obj > 1]

c    2
dtype: int64

In [8]:
('b' in obj), ('e' in obj)

(True, False)

In [9]:
# Mapping of state name -> value; the keys become the index labels of the Series.
sdata = {
    'Ohio': 3500,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(data=sdata)
obj3

Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [10]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index  = states)
obj4

California        NaN
Ohio           3500.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [11]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [12]:
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [13]:
obj4.name = 'population'

In [14]:
obj4.index.name = 'state'

In [15]:
obj4

state
California        NaN
Ohio           3500.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## Pandas DataFrame

In [16]:
# Column-oriented input for a DataFrame: one equal-length list per column.
data = {
    'state': states,
    'year': list(range(1900, 1940, 10)),  # 1900, 1910, 1920, 1930
    'pop': [1.5, 1.7, 3.6, 2.4],
}

In [17]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,California,1900
1,1.7,Ohio,1910
2,3.6,Oregon,1920
3,2.4,Texas,1930


In [18]:
# Rebuild the frame with an explicit column order and an explicit 0..3 row index.
frame = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop'],
    index=list(range(4)),
)
frame

Unnamed: 0,year,state,pop
0,1900,California,1.5
1,1910,Ohio,1.7
2,1920,Oregon,3.6
3,1930,Texas,2.4


In [19]:
frame['state'], frame.state

(0    California
 1          Ohio
 2        Oregon
 3         Texas
 Name: state, dtype: object, 0    California
 1          Ohio
 2        Oregon
 3         Texas
 Name: state, dtype: object)

In [20]:
# Select a single row by its index label with .loc.
# (.ix was deprecated in pandas 0.20 and removed in 1.0; on this integer-labelled
# index it performed the same label-based lookup.)
frame.loc[3]

year      1930
state    Texas
pop        2.4
Name: 3, dtype: object

In [21]:
frame['debt'] = [16, 17, 18, 19]
frame

Unnamed: 0,year,state,pop,debt
0,1900,California,1.5,16
1,1910,Ohio,1.7,17
2,1920,Oregon,3.6,18
3,1930,Texas,2.4,19


In [22]:
frame.name = 'state info'
frame.index.name = 'butt stuff'
frame

Unnamed: 0_level_0,year,state,pop,debt
butt stuff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1900,California,1.5,16
1,1910,Ohio,1.7,17
2,1920,Oregon,3.6,18
3,1930,Texas,2.4,19


## Arithmetic Methods with fill values

In [27]:
# Two float frames whose labels only partly overlap:
# df1 is 3x4 with columns a-d, df2 is 4x5 with columns a-e.
df1 = pd.DataFrame(np.arange(12, dtype=float).reshape((3, 4)),
                   columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(20, dtype=float).reshape((4, 5)),
                   columns=['a', 'b', 'c', 'd', 'e'])

In [28]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [30]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [32]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


## Operations between DataFrames and Series

In [34]:
arr = np.arange(12.).reshape(3,4)
arr


array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [35]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [36]:
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [39]:
# 4x3 frame of consecutive floats, one row per state.
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# First row, selected by position. .iloc replaces the .ix indexer (deprecated in
# pandas 0.20, removed in 1.0); with string labels, .ix[0] was a positional lookup.
series = frame.iloc[0]

In [41]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [42]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [43]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [46]:
series2 = Series(range(3), index = list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [47]:
#example of Broadcasting
frame+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [49]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

## Function application and mapping

In [52]:
frame = DataFrame(np.random.randn(4,3), columns = list('bde'),
                 index = ['Utah', 'Ohio', 'Texas', 'Oregon'] )
frame

Unnamed: 0,b,d,e
Utah,0.935547,0.076322,1.751566
Ohio,2.171681,0.47812,-1.170501
Texas,0.212044,1.025712,-0.831452
Oregon,0.560626,-1.718811,0.091599


In [53]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.935547,0.076322,1.751566
Ohio,2.171681,0.47812,1.170501
Texas,0.212044,1.025712,0.831452
Oregon,0.560626,1.718811,0.091599


## Lambda Functions, apply method, applymap, map
 - `apply` calls a function that operates on a 1D array once per column (the default) or once per row (`axis = 1`)

In [54]:
# Range of a 1D array-like: its largest value minus its smallest value.
f = lambda values: values.max() - values.min()

In [55]:
frame.apply(f)

b    1.959638
d    2.744523
e    2.922067
dtype: float64

In [56]:
frame.apply(f, axis = 1)

Utah      1.675244
Ohio      3.342182
Texas     1.857164
Oregon    2.279437
dtype: float64

In [57]:
def f(x):
    '''Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'.'''
    smallest, largest = x.min(), x.max()
    return Series([smallest, largest], index=['min', 'max'])

In [58]:
frame.apply(f)

Unnamed: 0,b,d,e
min,0.212044,-1.718811,-1.170501
max,2.171681,1.025712,1.751566


In [62]:
# Render a number as a string with two decimal places.
# NOTE: this name shadows the built-in format() for the rest of the session.
format = lambda value: '%.2f' % value
frame

Unnamed: 0,b,d,e
Utah,0.935547,0.076322,1.751566
Ohio,2.171681,0.47812,-1.170501
Texas,0.212044,1.025712,-0.831452
Oregon,0.560626,-1.718811,0.091599


In [61]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.94,0.08,1.75
Ohio,2.17,0.48,-1.17
Texas,0.21,1.03,-0.83
Oregon,0.56,-1.72,0.09


In [63]:
frame['e'].map(format)

Utah       1.75
Ohio      -1.17
Texas     -0.83
Oregon     0.09
Name: e, dtype: object

## List of summary statistics in pandas
 - count
 - describe
 - min, max
 - argmin, argmax
 - idxmin, idxmax
 - quantile
 - sum
 - mean
 - median
 - mad - mean absolute deviation from mean value
 - var
 - std
 - skew - sample skewness (based on the 3rd moment of the values)
 - kurt - sample kurtosis (based on the 4th moment of the values)

## Correlation and Covariance

In [81]:
!pip install pandas-datareader
!pip install requests_cache
import pandas_datareader.data as web
import datetime
import requests_cache

Collecting requests_cache
  Downloading requests_cache-0.4.13-py2.py3-none-any.whl
Installing collected packages: requests-cache
Successfully installed requests-cache-0.4.13


In [86]:
# HTTP session backed by a local SQLite cache named 'cache'; entries expire after three days.
expire_after = datetime.timedelta(days=3)
session = requests_cache.CachedSession(cache_name='cache', backend='sqlite', expire_after=expire_after)

# Date window for the requested price history.
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)

# Fetch one frame per ticker through the cached session.
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.DataReader(ticker, 'yahoo', start, end, session=session)

# One 'Adj Close' column per ticker.
# The original iterated over the bound method `all_data.iteritems` without calling
# it, which raises TypeError; dict.items() works on both Python 2 and Python 3.
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})