# Indexing and Selecting Data

## 像使用SQL一样使用pandas

In [1]:
import numpy as np
import pandas as pd

Python和Numpy的索引操作符 `[]` 和属性操作符 `.` 提供了快速访问pandas数据结构的方式。然而，这两种方式在pandas中的效率可能不是最优的，我们推荐使用pandas专门优化过的数据访问方法，这些方法正是本节要介绍的内容。

# 索引的不同方式 Different Choices for Indexing

pandas支持三种类型的多坐标(multi-axis)索引：
* **.loc** 是基本的基于label的，当然也可以和一个boolean数组一起使用。
* **.iloc** 是基本的基于整数位置(从0到axis的length-1)的，当然也可以和一个boolean数组一起使用。当提供检索的index越界时会有IndexError错误，注意切片索引(slice index)允许越界。
* **.ix** 支持基于label和整数位置混合的数据获取方式，默认基于label。.ix曾是最常用的方式，它支持所有.loc和.iloc的输入。如果提供的是纯label或纯整数索引，我们建议使用.loc或.iloc。注意：.ix在较新的pandas版本中已被弃用并移除，新代码请直接使用.loc/.iloc。


以 .loc为例看一下使用方式：

| **对象类型** | **Indexers** |
| --- | --- |
| Series | s.loc[indexer] |
| DataFrame | df.loc[row_indexer, column_indexer] |
| Panel | p.loc[item_indexer, major_indexer, minor_indexer] |

# 最基本的索引和选择

最基本的选择数据方式就是使用[]操作符进行索引，

| **对象类型** | **Selection** | **返回值类型** |
| --- | --- | --- |
| Series | series[label]，**这里的label是index名** | 标量值 |
| DataFrame | frame[colname]，**使用列名** | Series对象，即colname对应的那一列 |
| Panel | panel[itemname] | DataFrame对象，即itemname对应的那一项 |

下面用示例展示一下

In [2]:
# Eight consecutive daily timestamps starting on 2000-01-01; used as the row index below.
dates = pd.date_range(start='1/1/2000', periods=8)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# 8x4 frame of standard-normal noise: one row per entry of `dates`, columns A-D.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.56265,-1.226827,0.14955,2.333782
2000-01-02,-1.062558,0.772811,-1.556939,-0.303581
2000-01-03,-1.194126,0.502868,0.82347,1.149401
2000-01-04,0.936506,-0.758176,0.338892,1.351857
2000-01-05,-1.196422,-1.276918,-0.539233,-0.101062
2000-01-06,1.436726,-0.048258,-2.132088,-1.661286
2000-01-07,0.32928,0.368375,-1.552907,-0.066676
2000-01-08,0.857815,-1.992648,-1.192243,-1.028118


In [4]:
# Build a 3-D Panel from a dict of frames: item 'one' is df itself,
# item 'two' is df with df.mean() subtracted.
# NOTE(review): pd.Panel appears to be deprecated/removed in newer pandas releases —
# confirm the pandas version this notebook targets before re-running.
panel = pd.Panel({'one':df, 'two':df-df.mean()})
panel

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D

### 我们使用最基本的[]操作符

In [5]:
s = df['A']  # select a single column by its name
s  # the result is a Series

2000-01-01   -0.562650
2000-01-02   -1.062558
2000-01-03   -1.194126
2000-01-04    0.936506
2000-01-05   -1.196422
2000-01-06    1.436726
2000-01-07    0.329280
2000-01-08    0.857815
Freq: D, Name: A, dtype: float64

### Series使用index索引

In [6]:
s[dates[5]]  # look up a single value by its index label

1.436726247472784

In [7]:
panel['two']

Unnamed: 0,A,B,C,D
2000-01-01,-0.505721,-0.76948,0.857238,2.124492
2000-01-02,-1.00563,1.230158,-0.849252,-0.512871
2000-01-03,-1.137198,0.960215,1.531157,0.940112
2000-01-04,0.993435,-0.30083,1.046579,1.142567
2000-01-05,-1.139493,-0.819572,0.168454,-0.310352
2000-01-06,1.493655,0.409088,-1.4244,-1.870575
2000-01-07,0.386208,0.825722,-0.84522,-0.275965
2000-01-08,0.914743,-1.535301,-0.484555,-1.237407


### **也可以给[]传递一个由column name组成的list，形如df[[col1,col2]]；如果给出的某个列名不存在，会报错**

In [8]:
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.56265,-1.226827,0.14955,2.333782
2000-01-02,-1.062558,0.772811,-1.556939,-0.303581
2000-01-03,-1.194126,0.502868,0.82347,1.149401
2000-01-04,0.936506,-0.758176,0.338892,1.351857
2000-01-05,-1.196422,-1.276918,-0.539233,-0.101062
2000-01-06,1.436726,-0.048258,-2.132088,-1.661286
2000-01-07,0.32928,0.368375,-1.552907,-0.066676
2000-01-08,0.857815,-1.992648,-1.192243,-1.028118


In [9]:
# Assign the values of columns A and B to columns B and A respectively,
# i.e. swap the contents of the two columns in place.
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,-1.226827,-0.56265,0.14955,2.333782
2000-01-02,0.772811,-1.062558,-1.556939,-0.303581
2000-01-03,0.502868,-1.194126,0.82347,1.149401
2000-01-04,-0.758176,0.936506,0.338892,1.351857
2000-01-05,-1.276918,-1.196422,-0.539233,-0.101062
2000-01-06,-0.048258,1.436726,-2.132088,-1.661286
2000-01-07,0.368375,0.32928,-1.552907,-0.066676
2000-01-08,-1.992648,0.857815,-1.192243,-1.028118


# 通过属性访问 把column作为DataFrame对象的属性

可以直接把Series的index、DataFrame中的column、Panel中的item作为这些对象的属性使用，然后直接访问相应的index、column、item

In [10]:
sa = pd.Series([1,2,3],index=list('abc'))  # small Series labelled 'a', 'b', 'c'
dfa = df.copy()  # independent copy, so the attribute assignments below do not modify df

In [11]:
sa

a    1
b    2
c    3
dtype: int64

In [12]:
sa.b  # an index label can be read directly as an attribute

2

In [13]:
dfa

Unnamed: 0,A,B,C,D
2000-01-01,-1.226827,-0.56265,0.14955,2.333782
2000-01-02,0.772811,-1.062558,-1.556939,-0.303581
2000-01-03,0.502868,-1.194126,0.82347,1.149401
2000-01-04,-0.758176,0.936506,0.338892,1.351857
2000-01-05,-1.276918,-1.196422,-0.539233,-0.101062
2000-01-06,-0.048258,1.436726,-2.132088,-1.661286
2000-01-07,0.368375,0.32928,-1.552907,-0.066676
2000-01-08,-1.992648,0.857815,-1.192243,-1.028118


In [14]:
dfa.A

2000-01-01   -1.226827
2000-01-02    0.772811
2000-01-03    0.502868
2000-01-04   -0.758176
2000-01-05   -1.276918
2000-01-06   -0.048258
2000-01-07    0.368375
2000-01-08   -1.992648
Freq: D, Name: A, dtype: float64

In [15]:
panel.one

Unnamed: 0,A,B,C,D
2000-01-01,-0.56265,-1.226827,0.14955,2.333782
2000-01-02,-1.062558,0.772811,-1.556939,-0.303581
2000-01-03,-1.194126,0.502868,0.82347,1.149401
2000-01-04,0.936506,-0.758176,0.338892,1.351857
2000-01-05,-1.196422,-1.276918,-0.539233,-0.101062
2000-01-06,1.436726,-0.048258,-2.132088,-1.661286
2000-01-07,0.32928,0.368375,-1.552907,-0.066676
2000-01-08,0.857815,-1.992648,-1.192243,-1.028118


In [16]:
sa

a    1
b    2
c    3
dtype: int64

In [17]:
sa.a = 5  # attribute access can also assign to an existing label
sa

a    5
b    2
c    3
dtype: int64

In [18]:
sa

a    5
b    2
c    3
dtype: int64

In [19]:
dfa.A = list(range(dfa.shape[0]))  # attribute assignment is fine here because column A already exists

In [20]:
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.56265,0.14955,2.333782
2000-01-02,1,-1.062558,-1.556939,-0.303581
2000-01-03,2,-1.194126,0.82347,1.149401
2000-01-04,3,0.936506,0.338892,1.351857
2000-01-05,4,-1.196422,-0.539233,-0.101062
2000-01-06,5,1.436726,-2.132088,-1.661286
2000-01-07,6,0.32928,-1.552907,-0.066676
2000-01-08,7,0.857815,-1.192243,-1.028118


In [21]:
# Bracket assignment is the form to use when the column may not exist yet:
# unlike attribute assignment, it creates a new column.
dfa['A'] = list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.56265,0.14955,2.333782
2000-01-02,1,-1.062558,-1.556939,-0.303581
2000-01-03,2,-1.194126,0.82347,1.149401
2000-01-04,3,0.936506,0.338892,1.351857
2000-01-05,4,-1.196422,-0.539233,-0.101062
2000-01-06,5,1.436726,-2.132088,-1.661286
2000-01-07,6,0.32928,-1.552907,-0.066676
2000-01-08,7,0.857815,-1.192243,-1.028118


## 使用属性和[] 有一点区别：

如果要新建一个column，只能使用[]！
属性访问只对已经存在的列有效：不存在的列名不是对象的属性，对它赋值只会创建一个普通的Python属性，而不会新增一列。






You can use attribute access to modify an existing element of a Series or column of a DataFrame, but be careful; if you try to use attribute access to create a new column, it fails silently, creating a new attribute rather than a new column.

## 使用属性要注意的：
* 如果列名与对象已有的方法或属性同名（例如 min、index），则无法通过属性方式访问该列，只能使用[]
* 总而言之，属性的适用范围要比[]小

# 切片范围 Slicing ranges

可以使用 [] 还有.iloc切片，这里先介绍使用[]

**对于Series来说，使用[]进行切片就像ndarray一样，**

In [22]:
s

2000-01-01   -1.226827
2000-01-02    0.772811
2000-01-03    0.502868
2000-01-04   -0.758176
2000-01-05   -1.276918
2000-01-06   -0.048258
2000-01-07    0.368375
2000-01-08   -1.992648
Freq: D, Name: A, dtype: float64

In [23]:
s[:5]

2000-01-01   -1.226827
2000-01-02    0.772811
2000-01-03    0.502868
2000-01-04   -0.758176
2000-01-05   -1.276918
Freq: D, Name: A, dtype: float64

In [24]:
s[::2]

2000-01-01   -1.226827
2000-01-03    0.502868
2000-01-05   -1.276918
2000-01-07    0.368375
Freq: 2D, Name: A, dtype: float64

In [25]:
s[::-1]

2000-01-08   -1.992648
2000-01-07    0.368375
2000-01-06   -0.048258
2000-01-05   -1.276918
2000-01-04   -0.758176
2000-01-03    0.502868
2000-01-02    0.772811
2000-01-01   -1.226827
Freq: -1D, Name: A, dtype: float64

### []不光可以检索，也可以赋值

In [30]:
s2 = s.copy()

In [31]:
s2[:5]=0  # slices can be assigned to as well: set the first five values to 0

In [32]:
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06   -0.048258
2000-01-07    0.368375
2000-01-08   -1.992648
Freq: D, Name: A, dtype: float64

对于DataFrame对象来说，[]操作符按照行进行切片，非常有用。

In [33]:
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,-1.226827,-0.56265,0.14955,2.333782
2000-01-02,0.772811,-1.062558,-1.556939,-0.303581
2000-01-03,0.502868,-1.194126,0.82347,1.149401


In [34]:
df[::-1]

Unnamed: 0,A,B,C,D
2000-01-08,-1.992648,0.857815,-1.192243,-1.028118
2000-01-07,0.368375,0.32928,-1.552907,-0.066676
2000-01-06,-0.048258,1.436726,-2.132088,-1.661286
2000-01-05,-1.276918,-1.196422,-0.539233,-0.101062
2000-01-04,-0.758176,0.936506,0.338892,1.351857
2000-01-03,0.502868,-1.194126,0.82347,1.149401
2000-01-02,0.772811,-1.062558,-1.556939,-0.303581
2000-01-01,-1.226827,-0.56265,0.14955,2.333782


# 使用Label进行检索

## 警告：


.loc要求检索时输入必须严格遵守index的类型，一旦输入类型不对，将会引起TypeError。

In [37]:
# 5x4 frame of uniform random values, indexed by five consecutive days from 2016-01-01.
df1 = pd.DataFrame(
    data=np.random.rand(5, 4),
    index=pd.date_range('20160101', periods=5),
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
2016-01-01,0.407321,0.934011,0.033028,0.554854
2016-01-02,0.467473,0.355926,0.955237,0.616306
2016-01-03,0.724564,0.591618,0.343721,0.067257
2016-01-04,0.785883,0.817585,0.313651,0.904628
2016-01-05,0.420384,0.371266,0.237884,0.199242


In [38]:
# .loc is label-based: integer positions are not valid labels for a DatetimeIndex,
# so this slice raises TypeError. The error itself is the demonstration, so it is
# caught and printed; the notebook can then still be run top to bottom.
try:
    df1.loc[2:3]
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: cannot do slice indexing on <class 'pandas.tseries.index.DatetimeIndex'> with these indexers [2] of <type 'int'>

输入string进行检索没问题

In [40]:
df1.loc['20160102':'20160104']

Unnamed: 0,A,B,C,D
2016-01-02,0.467473,0.355926,0.955237,0.616306
2016-01-03,0.724564,0.591618,0.343721,0.067257
2016-01-04,0.785883,0.817585,0.313651,0.904628


细心的你一定发现了，index='20160104'那一行也被检索出来了。没错，**loc检索时范围是闭区间[start,end]**，两端都包含在内。

整型可以作为label检索，这是没问题的，不过要记住此时整型表示的是label而不是index中的下标！

.loc操作是检索时的基本操作，以下输入格式都是合法的：
* 一个label，比如：5、'a'. 记住这里的5表示的是index中的一个label而不是index中的一个下标。
* label组成的列表或者数组比如['a','b','c']
* 切片，比如'a':'f'。注意loc中切片范围是闭区间，起点和终点都包含在内！
* 布尔数组

In [42]:
# Six standard-normal values labelled 'a' through 'f'.
s1 = pd.Series(data=np.random.randn(6), index=list('abcdef'))
s1

a    1.997099
b    1.300519
c   -1.106530
d   -0.440740
e    0.121859
f    0.443966
dtype: float64

In [43]:
s1.loc['c':]

c   -1.106530
d   -0.440740
e    0.121859
f    0.443966
dtype: float64

In [44]:
s1.loc['b']

1.3005189713025145

loc同样支持赋值操作

In [45]:
s1.loc['c':]=0  # label slices can be assigned to: every entry from 'c' to the end becomes 0
s1

a    1.997099
b    1.300519
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

再来看看DataFrame的例子

In [46]:
# 6x4 frame of standard-normal noise with string row labels 'a'-'f' and columns A-D.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
a,1.627536,-0.029482,0.36821,0.487035
b,0.173705,1.206988,-0.222946,-1.021869
c,0.582957,-0.378973,0.124913,-0.244752
d,0.725669,0.585394,2.156526,-0.744287
e,1.591898,-0.24935,-1.541102,1.290049
f,-1.862669,-0.091273,0.986054,-0.550419


In [47]:
df1.loc[['a','b','c','d'],:]

Unnamed: 0,A,B,C,D
a,1.627536,-0.029482,0.36821,0.487035
b,0.173705,1.206988,-0.222946,-1.021869
c,0.582957,-0.378973,0.124913,-0.244752
d,0.725669,0.585394,2.156526,-0.744287


In [49]:
df1.loc[['a','b','c','d']]  # the trailing ':' column indexer may be omitted

Unnamed: 0,A,B,C,D
a,1.627536,-0.029482,0.36821,0.487035
b,0.173705,1.206988,-0.222946,-1.021869
c,0.582957,-0.378973,0.124913,-0.244752
d,0.725669,0.585394,2.156526,-0.744287


使用切片检索

In [51]:
df1.loc['d':,'A':'C']  # note: label slices include both endpoints

Unnamed: 0,A,B,C
d,0.725669,0.585394,2.156526
e,1.591898,-0.24935,-1.541102
f,-1.862669,-0.091273,0.986054


In [59]:
df1.loc['a']

A    1.627536
B   -0.029482
C    0.368210
D    0.487035
Name: a, dtype: float64

使用布尔数组检索

In [54]:
df1.loc['a']>0

A     True
B    False
C     True
D     True
Name: a, dtype: bool

In [62]:
# Boolean mask on the column axis: keep all rows, but only the columns
# whose value in row 'a' is positive.
df1.loc[:,df1.loc['a']>0]

Unnamed: 0,A,C,D
a,1.627536,0.36821,0.487035
b,0.173705,-0.222946,-1.021869
c,0.582957,0.124913,-0.244752
d,0.725669,2.156526,-0.744287
e,1.591898,-1.541102,1.290049
f,-1.862669,0.986054,-0.550419


得到DataFrame中的某一个值，等同于df1.at['a','A']（旧版本pandas中也可写作df1.get_value('a','A')，该方法在新版本中已被移除）

In [64]:
df1.loc['a','A']

1.6275362166018037

In [65]:
# .at is the label-based scalar accessor. It replaces the older DataFrame.get_value,
# which was deprecated and is no longer available in current pandas releases.
df1.at['a','A']

1.6275362166018037

# 根据下标进行检索 Selection By Position