### Summarizing and Computing Descriptive Statistics

In [2]:
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import sys

In [3]:
# Small 4x2 frame with deliberately missing entries; the reduction examples
# that follow (sum, idxmax, cumsum, describe) all operate on it.
df = DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)

In [4]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [5]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [6]:
df.sum(axis=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [11]:
df.idxmax()

one    b
two    d
dtype: object

In [8]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [14]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [18]:
obj = Series(['a', 'a', 'b', 'c'] * 4)

In [19]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### Correlation and Covariance

In [9]:
# Download daily quotes for 2013 for each ticker and keep every result in
# `all_data`, keyed by ticker symbol.
# NOTE(review): `pandas.io.data` is absent from current pandas releases (the
# remote-data readers were split out into the pandas-datareader project) —
# confirm the pinned pandas version before re-running this cell.
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_google(ticker, '1/1/2013', '1/1/2014')


In [11]:
all_data['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,79.12,79.29,77.38,78.43,140124866
2013-01-03,78.27,78.52,77.29,77.44,88240950
2013-01-04,76.71,76.95,75.12,75.29,148581860
2013-01-07,74.60,75.60,73.60,74.80,121038176
2013-01-08,75.60,75.98,74.46,75.04,114676751
2013-01-09,74.64,75.00,73.71,73.87,101899959
2013-01-10,75.51,75.53,73.65,74.79,150285296
2013-01-11,74.43,75.05,74.15,74.33,87688741
2013-01-14,71.81,72.50,71.22,71.68,183544396
2013-01-15,71.19,71.28,69.05,69.42,219192932


In [12]:
# One column per ticker: closing prices in `price`, traded volume in `volume`.
price = DataFrame({tic: all_data[tic]['Close'] for tic in all_data})
volume = DataFrame({tic: all_data[tic]['Volume'] for tic in all_data})

In [13]:
returns = price.pct_change()

In [15]:
# Exploratory introspection: lists every attribute name on `price`. The saved
# output shows the ticker columns (AAPL, GOOG, IBM, MSFT) among them.
dir(price)

['AAPL',
 'GOOG',
 'IBM',
 'MSFT',
 'T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__invert__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__r

In [14]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-12-24,-0.004175,-0.002926,0.005433,0.012561
2013-12-26,-0.006658,0.00506,0.011625,0.009709
2013-12-27,-0.006827,0.000842,-0.001457,-0.004006
2013-12-30,-0.009874,-0.007984,0.007186,0.0
2013-12-31,0.011739,0.010123,0.006223,0.003218


In [34]:
returns.MSFT.corr(returns.IBM)

0.169609500416842

In [35]:
returns.MSFT.cov(returns.IBM)

3.0928670124405632e-05

In [36]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.108642,0.114287,0.070419
GOOG,0.108642,1.0,0.203534,0.22129
IBM,0.114287,0.203534,1.0,0.16961
MSFT,0.070419,0.22129,0.16961,1.0


In [37]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000321,2.7e-05,2.4e-05,2e-05
GOOG,2.7e-05,0.00019,3.3e-05,4.8e-05
IBM,2.4e-05,3.3e-05,0.000136,3.1e-05
MSFT,2e-05,4.8e-05,3.1e-05,0.000244


In [38]:
returns.corrwith(returns.IBM)

AAPL    0.114287
GOOG    0.203534
IBM     1.000000
MSFT    0.169610
dtype: float64

In [39]:
returns.corrwith(volume)

AAPL   -0.165444
GOOG         NaN
IBM    -0.351866
MSFT   -0.116474
dtype: float64

### Unique Values, Value Counts

In [17]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [18]:
obj1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'e', 'c', 'f'])

In [19]:
df = DataFrame({'one':obj,'two':obj1})

In [20]:
df

Unnamed: 0,one,two
0,c,c
1,a,a
2,d,d
3,a,a
4,a,a
5,b,b
6,b,e
7,c,c
8,c,f


In [21]:
uniques = obj.unique()

In [22]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [24]:
df.one.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [43]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [25]:
mask = obj.isin(['b', 'c'])

In [45]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [27]:
df.isin(['b','c'])

Unnamed: 0,one,two
0,True,True
1,False,False
2,False,False
3,False,False
4,False,False
5,True,True
6,True,False
7,True,True
8,True,False


### Handling Missing Data

In [28]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [29]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [31]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [53]:
data = Series([1, np.nan, 3.5, np.nan, 7])

In [54]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [56]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [32]:
# Four rows covering the cases the dropna examples need to distinguish.
data = DataFrame(
    [
        [1.0, 6.5, 3.0],           # fully populated
        [1.0, np.nan, np.nan],     # only the first value present
        [np.nan, np.nan, np.nan],  # entirely missing
        [np.nan, 6.5, 3.0],        # first value missing
    ]
)

In [37]:
cleaned = data.dropna()

In [38]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [39]:
cleaned

Unnamed: 0,0,1,2
0,1,6.5,3


In [62]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [63]:
data[4] = np.nan

In [64]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [65]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filling in Missing Data

In [50]:
data.fillna('A')

Unnamed: 0,0,1,2
0,1,6.5,3
1,1,A,A
2,A,A,A
3,A,6.5,3


In [45]:
data.fillna({1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,1.0,6.5,3
1,1.0,0.5,-1
2,,0.5,-1
3,,6.5,3


In [46]:
# 6x3 frame of standard-normal draws. No seed is set, so the exact values
# differ from run to run.
df = DataFrame(np.random.standard_normal(size=(6, 3)))

In [47]:
# Blank out the tail of columns 1 and 2 so the fill examples have gaps to
# work on. `.ix` has been removed from pandas; `.loc` keeps the same
# label-based, end-inclusive slicing on this default integer index.
df.loc[2:, 1] = np.nan
df.loc[4:, 2] = np.nan

In [48]:
df

Unnamed: 0,0,1,2
0,-0.010787,-0.10701,0.145274
1,1.12455,0.853446,-0.36765
2,-0.779427,,0.701738
3,0.486967,,0.136004
4,-1.222734,,
5,1.712014,,


In [49]:
# Forward-fill: each gap takes the last valid value above it in its column.
# `ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df.ffill()

Unnamed: 0,0,1,2
0,-0.010787,-0.10701,0.145274
1,1.12455,0.853446,-0.36765
2,-0.779427,0.853446,0.701738
3,0.486967,0.853446,0.136004
4,-1.222734,0.853446,0.136004
5,1.712014,0.853446,0.136004


In [51]:
data = Series([1., np.nan, 3.5, np.nan, 7])

In [52]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [53]:
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,-0.010787,-0.10701,0.145274
1,1.12455,0.853446,-0.36765
2,-0.779427,0.373218,0.701738
3,0.486967,0.373218,0.136004
4,-1.222734,0.373218,0.153842
5,1.712014,0.373218,0.153842


### Reading and Writing Data in Text Format

In [76]:
!cat ../../../data/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [77]:
df = pd.read_csv('../../../data/ex1.csv')

In [78]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [79]:
df = pd.read_table('../../../data/ex1.csv',sep=',')

In [80]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [89]:
pd.read_csv('../../../data/ex2.csv',header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [88]:
pd.read_csv('../../../data/ex2.csv',names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [85]:
names = ['a', 'b', 'c', 'd', 'message']

In [87]:
pd.read_csv('../../../data/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [9]:
!cat ../../../data/ex3.txt

  A  B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491


In [4]:
# Fields are separated by a variable amount of whitespace, so the delimiter
# is a regular expression. It is a raw string so that `\s` reaches the regex
# engine instead of being an invalid string escape.
pd.read_table('../../../data/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [8]:
!cat ../../../data/ex4.csv

#hey
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [7]:
pd.read_csv('../../../data/ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
!cat ../../../data/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
In [864]: result = pd.read_csv('ch06/ex5.csv')


In [14]:
pd.read_csv('../../../data/ex5.csv')

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [16]:
pd.read_csv('../../../data/ex5.csv', na_values=['NULL'])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [17]:
# Per-column strings that read_csv should treat as missing values.
sentinels = dict(message=['foo', 'NA'], something=['two'])

In [19]:
pd.read_csv('../../../data/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


In [21]:
pd.read_csv('../../../data/ex5.csv', nrows=2)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world


### Writing Data Out to Text Format

In [22]:
data = pd.read_csv('../../../data/ex5.csv')

In [23]:
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [24]:
data.to_csv('../../../data/out.csv')

In [27]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [28]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [29]:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [31]:
# Write only a subset of the columns, in the given order, without the index.
# The keyword is `columns`; the older `cols` spelling is no longer accepted.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [35]:
dates = pd.date_range('1/1/2000', periods=7)

In [36]:
ts = Series(np.arange(7),index=dates)

In [39]:
# Emit the series as "date,value" rows. `header=False` is passed explicitly
# because the default for Series.to_csv differs between pandas versions, and
# the output shown here has no header line.
ts.to_csv(sys.stdout, header=False)

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


### Reading Microsoft Excel Files