In [1]:
import numpy as np
from pandas import DataFrame,Series

# 数据对象 索引对象

### 重新指定索引及顺序

In [2]:
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
obj2 = obj.reindex(['a', 'b', 'd', 'c', 'e'])
obj2

a   -5.3
b    7.2
d    4.5
c    3.6
e    NaN
dtype: float64

In [19]:
obj2 = obj2.reindex(['a', 'b', 'd', 'c', 'e'], fill_value = 0)  # 指定不存在元素的默认值
obj2

a   -5.3
b    7.2
d    4.5
c    3.6
e    NaN
dtype: float64

### 重新指定索引并指定填元素充方法

In [10]:
obj3 = Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [17]:
obj3 = obj3.reindex(range(6), method = 'ffill')
obj3

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

* 对DataFrame重新指定索引

In [15]:
frame = DataFrame(np.arange(9).reshape(3, 3),
                  index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [16]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


### 重新指定column

In [20]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


### 对DataFrame重新指定索引并指定填元素充方法

In [31]:
##frame.reindex(index = ['a', 'b', 'c', 'd'],method = 'ffill',columns = states)

In [26]:
frame.ix[['a', 'b', 'd', 'c'], states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
d,7.0,,8.0
c,4.0,,5.0


# 丢弃指定轴上的项

### Series根据索引删除元素

In [32]:
obj = Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [34]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

### DataFrame删除元素，可指定索引或列

In [35]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                  index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [36]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [38]:
data.drop(['two', 'four'], axis = 1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


# 索引、选取和过滤

### Series的索引，默认数字索引可以工作

In [39]:
obj = Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
obj['b']

1.0

In [40]:
obj[3]

3.0

In [41]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [42]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

### Series的数组切片

In [43]:
obj['b':'c']  # 闭区间

b    1.0
c    2.0
dtype: float64

In [44]:
obj['b':'c'] = 5

In [45]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

### DataFrame的索引

In [48]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                  index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
data['two'] # 打印列

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [50]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [51]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [52]:
data.ix['Colorado', ['two', 'three']] # 指定索引和列

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


two      5
three    6
Name: Colorado, dtype: int64

In [53]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [54]:
data.ix[2]  # 打印第2行（从0开始）

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [56]:
data.ix[:'Utah', 'two'] # 从开始到Utah，第2列。

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

### 根据条件选择

In [57]:
data[data.three > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [58]:
data < 5  # 打印True或者False

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [59]:
data[data < 5] = 0

In [60]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# 算术运算和数据对齐

### 加法

In [61]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])

In [62]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [63]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [64]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

### DataFrame加法，索引和列都必须匹配

In [65]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
                columns = list('bcd'),
                index = ['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns = list('bde'),
                index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [66]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [67]:
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [68]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### 数据填充

In [69]:
arr = np.arange(12.).reshape((3, 4))

In [70]:
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [71]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [72]:
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [73]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [74]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [78]:
series = frame.ix[0]

In [79]:
series

b    0
d    1
e    2
Name: Utah, dtype: int64

In [80]:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [81]:
series2 = Series(range(3), index = list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [82]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [83]:
series3 = frame['d']
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int64

In [89]:
frame.sub(series3, axis = 0)  # 按列减

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


In [85]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


# 函数应用和映射

### 函数

In [90]:
frame = DataFrame(np.random.randn(4, 3),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.399382,-0.193409,1.420274
Ohio,-0.533227,1.180376,-0.231387
Texas,1.193312,0.149069,-0.73392
Oregon,-1.484369,-0.227977,-0.16083


In [91]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.399382,0.193409,1.420274
Ohio,0.533227,1.180376,0.231387
Texas,1.193312,0.149069,0.73392
Oregon,1.484369,0.227977,0.16083


### lambda以及应用

In [92]:
f = lambda x: x.max() - x.min()

In [93]:
frame.apply(f)

b    2.677681
d    1.408353
e    2.154194
dtype: float64

In [94]:
frame.apply(f, axis = 1)

Utah      2.819656
Ohio      1.713602
Texas     1.927232
Oregon    1.323539
dtype: float64

### applymap和map

In [95]:
_format = lambda x: '%.2f' % x

In [96]:
frame.applymap(_format)

Unnamed: 0,b,d,e
Utah,-1.4,-0.19,1.42
Ohio,-0.53,1.18,-0.23
Texas,1.19,0.15,-0.73
Oregon,-1.48,-0.23,-0.16


In [97]:
frame['e'].map(_format)

Utah       1.42
Ohio      -0.23
Texas     -0.73
Oregon    -0.16
Name: e, dtype: object

# 排序和排名

### 根据索引排序，对于DataFrame可以指定轴。

In [98]:
obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [99]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [100]:
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index = ['three', 'one'],
                  columns = list('dabc'))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [101]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [102]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [103]:
frame.sort_index(axis = 1, ascending = False) # 降序

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


# 带有重复值的索引

### 重复的索引

In [104]:
obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])

In [105]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [106]:
obj.index.is_unique # 判断是非有重复索引

False

In [134]:
obj['a'].values[0]

0

In [118]:
df = DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,1.25022,-0.868131,0.246694
a,1.100308,0.521829,1.249683
b,0.666379,1.356003,-1.986234
b,0.5176,-2.178416,1.431825


In [119]:
df.ix['b'].ix[0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0    0.666379
1    1.356003
2   -1.986234
Name: b, dtype: float64

In [120]:
df.ix['b'].ix[1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0    0.517600
1   -2.178416
2    1.431825
Name: b, dtype: float64