In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Small 4x2 frame with deliberately missing values (NaN), used below to show
# how the reductions treat missing data.
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

In [3]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [4]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [9]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [6]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [7]:
# Row-wise mean with NaN-skipping disabled: rows 'a' and 'c' contain a NaN,
# so their result is NaN instead of the mean of the remaining values.
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [11]:
df.idxmax()

one    b
two    d
dtype: object

In [12]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [13]:
# 16 string labels (the 4-item list repeated 4 times). On this non-numeric
# data, describe() reports count/unique/top/freq instead of numeric statistics.
obj = Series(['a', 'a', 'b', 'c'] * 4)

In [14]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

# Correlation and Covariance

In [17]:
# Provides `web.get_data_yahoo`, used by the download cells below.
# NOTE(review): pandas.io.data appears to have been moved out of pandas into the
# separate pandas-datareader package — confirm it is still importable with the
# pandas version in use.
import pandas.io.data as web

In [22]:
# Fetch quotes for each ticker between 1/1/2000 and 1/1/2010 (network call).
all_data = {}
for ticker in ('AAPL', 'IBM', 'MSFT'):
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

# One column per ticker: adjusted closing prices and traded volumes.
price = DataFrame(
    {symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()}
)
volume = DataFrame(
    {symbol: quotes['Volume'] for symbol, quotes in all_data.items()}
)

In [21]:
# Variant of the download that also requests 'GOOG'. The recorded run aborted
# with OSError because Yahoo! did not return HTTP 200 for the 'GOOG' request;
# the In [22] cell requests the same data without that ticker.
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
# dict.items() replaces the Python 2-only iteritems(), which Python 3 dicts do
# not have, and matches the In [22] cell.
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})

OSError: after 3 tries, Yahoo! did not return a 200 for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'

In [25]:
returns = price.pct_change() # percent changes of the prices

In [24]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-24,0.034339,0.004385,0.002587
2009-12-28,0.012294,0.013326,0.005484
2009-12-29,-0.011861,-0.003477,0.007058
2009-12-30,0.012147,0.005461,-0.013699
2009-12-31,-0.0043,-0.012597,-0.015504


### `corr` and `cov` methods of Series

In [28]:
# Correlation between the MSFT and IBM return columns.
returns['MSFT'].corr(returns['IBM'])

0.49597967981065422

In [27]:
# Covariance between the MSFT and IBM return columns.
returns['MSFT'].cov(returns['IBM'])

0.00021595763987754347

## DataFrame `corr` and `cov` methods

In [29]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT
AAPL,1.0,0.410011,0.424305
IBM,0.410011,1.0,0.49598
MSFT,0.424305,0.49598,1.0


In [30]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT
AAPL,0.001027,0.000252,0.000309
IBM,0.000252,0.000367,0.000216
MSFT,0.000309,0.000216,0.000516


### `corrwith` method: pairwise correlations between a DataFrame's columns or rows and another Series or DataFrame

In [31]:
returns.corrwith(returns.IBM)

AAPL    0.410011
IBM     1.000000
MSFT    0.495980
dtype: float64

In [32]:
returns.corrwith(volume)

AAPL   -0.057549
IBM    -0.007892
MSFT   -0.014245
dtype: float64

## Unique Values, Value Counts, and Membership 

In [33]:
 obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [34]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [35]:
uniques = obj.unique()

In [36]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [45]:
sorted(uniques)

['a', 'b', 'c', 'd']

In [46]:
# Unlike sorted(uniques), which returns a new list, ndarray.sort() reorders
# `uniques` in place and returns None, so this cell displays nothing.
uniques.sort()

In [47]:
# NOTE(review): no call parentheses here, so the cell merely displays the method
# object; it neither sorts nor shows the array. Probably a leftover — evaluate
# `uniques` instead to see the sorted values.
uniques.sort

<function ndarray.sort>

In [48]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

#### Counting the values of a raw array with pandas

In [49]:
# Count occurrences in the raw ndarray by wrapping it in a Series. The top-level
# pd.value_counts() helper is deprecated in recent pandas in favour of this form.
# sort=False skips ordering the result by frequency.
pd.Series(obj.values).value_counts(sort=False)

c    3
b    2
d    1
a    3
dtype: int64

In [50]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [55]:
mask = obj.isin(['b', 'c'])

In [58]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [57]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

### Compute a histogram on multiple columns

In [2]:
# Three columns of small integer answers, used for the per-column
# value-count (histogram) example.
data = DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4],
})

In [61]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [63]:
# Per-column histogram: count each value within every column, then fill with 0
# wherever a value never occurs in a column (apply leaves NaN there).
# Series.value_counts is passed instead of the top-level pd.value_counts helper,
# which is deprecated in recent pandas.
result = data.apply(pd.Series.value_counts).fillna(0)

In [64]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


In [78]:
# Selecting a single column already yields a Series, so no Series(...) wrapper
# is needed around it.
col1 = result['Qu1']
col1

1    1
2    0
3    2
4    2
5    0
Name: Qu1, dtype: float64

In [79]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [8]:
# The .ix indexer has been removed from pandas; .loc expresses the same
# selection explicitly: rows labelled 0 through 4 (label slices are inclusive)
# of column 'Qu2', which is the column at position 1 that .ix[:4, 1] returned.
data.loc[:4, 'Qu2']

0    2
1    3
2    1
3    2
4    3
Name: Qu2, dtype: int64

In [12]:
# 4x3 frame of the integers 0..11 with a two-level index on both axes:
# rows keyed by (letter, number), columns keyed by (state, colour).
frame = DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)

In [10]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11
