# 一.索引对象Index

In [1]:
import numpy as np
import pandas as pd

## 1. Series和DataFrame中的索引都是Index对象

In [2]:
ps1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
print(ps1)
print(type(ps1.index))

a    0
b    1
c    2
d    3
e    4
dtype: int64
<class 'pandas.core.indexes.base.Index'>


In [3]:
pd1 = pd.DataFrame(np.arange(9).reshape(3,3), index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
print(type(pd1.index))
pd1

<class 'pandas.core.indexes.base.Index'>


Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8


## 2. 索引对象不可变，保证了数据的安全

In [4]:
#ps.index[0] = 2 # 会报错

In [5]:
#pd1.index[1] = '2' # 会报错

## 3. 常见的Index种类
- Index，索引
- Int64Index，整数索引
- MultiIndex，层级索引
- DatetimeIndex，时间戳类型

# 二.索引的 一些基本操作
- 重新索引
- 增
- 删
- 改
- 查

## 1. 重新索引

In [6]:
# reindex 创建一个符合新索引的新对象
ps2 = ps1.reindex(['a', 'b', 'c', 'd', 'e', 'f'])
ps2

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    NaN
dtype: float64

In [7]:
# 行索引重建
pd2 = pd1.reindex(['a', 'b', 'c', 'd'])
pd2

Unnamed: 0,A,B,C
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [8]:
# 列索引重建
pd3 = pd1.reindex(columns=['C', 'B', 'A'])
pd3

Unnamed: 0,C,B,A
a,2,1,0
b,5,4,3
c,8,7,6


## 增

In [9]:
ps1

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [10]:
ps1['g'] = 9
ps1

a    0
b    1
c    2
d    3
e    4
g    9
dtype: int64

In [11]:
# append 可以不改变原有的表格数据
s1 = pd.Series({'f':99})
ps1.append(s1)


a     0
b     1
c     2
d     3
e     4
g     9
f    99
dtype: int64

In [12]:
pd1

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8


In [13]:
# 末尾增加列
pd1[4] = [10, 11, 12]
pd1

Unnamed: 0,A,B,C,4
a,0,1,2,10
b,3,4,5,11
c,6,7,8,12


In [14]:
# 插入增加列
pd1.insert(0, 'E', [7,7,7])
pd1

Unnamed: 0,E,A,B,C,4
a,7,0,1,2,10
b,7,3,4,5,11
c,7,6,7,8,12


In [15]:
# 增加行
# 使用高级索引中的标签索引 loc
pd1.loc['d'] = [0,0,0,0,0]
pd1

Unnamed: 0,E,A,B,C,4
a,7,0,1,2,10
b,7,3,4,5,11
c,7,6,7,8,12
d,0,0,0,0,0


In [16]:
# 使用append
row = {'E':6, 'A':6, 'B':6, 'C':6, 4:6}
pd3 = pd1.append(row, ignore_index=True)
pd3

Unnamed: 0,E,A,B,C,4
0,7,0,1,2,10
1,7,3,4,5,11
2,7,6,7,8,12
3,0,0,0,0,0
4,6,6,6,6,6


## 删

In [17]:
ps1

a    0
b    1
c    2
d    3
e    4
g    9
dtype: int64

In [18]:
# del
del ps1['b']
ps1

a    0
c    2
d    3
e    4
g    9
dtype: int64

In [19]:
pd1

Unnamed: 0,E,A,B,C,4
a,7,0,1,2,10
b,7,3,4,5,11
c,7,6,7,8,12
d,0,0,0,0,0


In [20]:
del pd1['E']
pd1

Unnamed: 0,A,B,C,4
a,0,1,2,10
b,3,4,5,11
c,6,7,8,12
d,0,0,0,0


In [21]:
# drop 删除轴上的数据，返回新的对象， 它不影响原表格数据
# 删除一条数据
ps1.drop('g')

a    0
c    2
d    3
e    4
dtype: int64

In [22]:
# 删除多条
ps1.drop(['c', 'd'])

a    0
e    4
g    9
dtype: int64

In [23]:
# DataFrame
# 删除行
pd1.drop(['a', 'd'])

Unnamed: 0,A,B,C,4
b,3,4,5,11
c,6,7,8,12


In [24]:
# 指定删除列
pd1.drop(['B'], axis=1)# axis=1是列的删除

Unnamed: 0,A,C,4
a,0,2,10
b,3,5,11
c,6,8,12
d,0,0,0


In [25]:
pd1.drop([4], axis='columns')# 也可以这么写

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,0,0,0


In [26]:
# 注意drop中的 inplace 属性， 当inplace为True时， 可以在原对象上直接进行删除操作，不会返回新的对象
ps1.drop(['g'], inplace=True)
ps1

a    0
c    2
d    3
e    4
dtype: int64

In [27]:
pd1.drop([4], axis=1, inplace=True)
pd1
# 这种inplace方法要慎重使用

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,0,0,0


## 改

In [28]:
ps1 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
print(ps1)

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [29]:
pd1 = pd.DataFrame(np.arange(9).reshape(3,3), index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
pd1

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8


In [30]:
ps1['a'] = 999 # 根据索引修改
ps1

a    999
b      1
c      2
d      3
e      4
dtype: int64

In [31]:
ps1[0] = 777 # 下标位置修改
ps1

a    777
b      1
c      2
d      3
e      4
dtype: int64

In [32]:
pd1['A'] = [9, 10, 11] # 根据索引修改
pd1

Unnamed: 0,A,B,C
a,9,1,2
b,10,4,5
c,11,7,8


In [33]:
pd1.A = 77 # 对象.列
pd1

Unnamed: 0,A,B,C
a,77,1,2
b,77,4,5
c,77,7,8


In [34]:
# loc 标签索引
pd1.loc['a'] = 777
pd1

Unnamed: 0,A,B,C
a,777,777,777
b,77,4,5
c,77,7,8


In [35]:
# 直接改某一行某一列的数据
pd1.loc['a', "A"] = 999
pd1

Unnamed: 0,A,B,C
a,999,777,777
b,77,4,5
c,77,7,8


## 查

In [36]:
# Series
ps1

a    777
b      1
c      2
d      3
e      4
dtype: int64

In [37]:
ps1['a']# 标签索引

777

In [38]:
ps1[0]# 位置索引

777

In [39]:
# 2. 切片索引
print(ps1[1:4])# 位置切片索引 左包括右不包括
print(ps1['b':'e'])# 标签切片索引 左右都包括！！！

b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
e    4
dtype: int64


In [40]:
# 3. 不连续索引
print(ps1[['b', 'e']])
print(ps1[[0, 2, 3]])

b    1
e    4
dtype: int64
a    777
c      2
d      3
dtype: int64


In [41]:
# 4. 布尔索引
ps1[ps1>2]

a    777
d      3
e      4
dtype: int64

In [61]:
# DataFrame
pd1

Unnamed: 0,A,B,C
a,999,777,777
b,77,4,5
c,77,7,8


In [62]:
# 1. 列索引
pd1['A']

a    999
b     77
c     77
Name: A, dtype: int64

In [63]:
# 取多列
pd1[['A', "C"]]

Unnamed: 0,A,C
a,999,777
b,77,5
c,77,8


In [72]:
# 选取一个值
pd1['A']['a']

999

In [75]:
pd1.loc['a', 'A']

999

In [76]:
# 2. 切片索引
pd1[:2] # 获取行

Unnamed: 0,A,B,C
a,999,777,777
b,77,4,5


In [92]:
pd1.iloc[1:3, 1:3]

Unnamed: 0,B,C
b,4,5
c,7,8


# 三. 高级索引
- loc 标签索引
- iloc 位置索引
- ix 标签与位置混合索引

In [47]:
# 1. loc 标签索引
# loc 是基于标签名的索引，自定义的索引名
ps1['a':'c']

a    777
b      1
c      2
dtype: int64

In [48]:
ps1.loc['a':'c']# 一般推荐使用高级索引

a    777
b      1
c      2
dtype: int64

In [49]:
pd1

Unnamed: 0,A,B,C
a,999,777,777
b,77,4,5
c,77,7,8


In [50]:
pd1.loc['a':'b', 'A':'C']# 第一个参数是行索引，第二个是列索引

Unnamed: 0,A,B,C
a,999,777,777
b,77,4,5


In [51]:
# 2. iloc位置索引
pd1.iloc[1:3, 2]

b    5
c    8
Name: C, dtype: int64

In [52]:
# 3. ix 混合索引， 一般不推荐使用
print(pd1.ix['a':'b', 1])
print(pd1.ix[0:2, 'B'])

a    777
b      4
Name: B, dtype: int64
a    777
b      4
Name: B, dtype: int64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


# 索引练习

In [53]:
import numpy as np
import pandas as pd

In [54]:
# 准备：读入csv文件
lianjia_df = pd.read_csv(r'lianjia.csv')
print(type(lianjia_df))
lianjia_df.head()# 提取前5行

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Direction,Elevator,Floor,Garden,Id,Layout,Price,Region,Renovation,Size,Year
0,南北,无电梯,低楼层(共6层),翠竹苑,0,3室1厅,365.0,浦东,简装,77.84,1995.0
1,南,有电梯,低楼层(共36层),百汇园,1,3室2厅,1449.0,徐汇,精装,145.2,1995.0
2,南北,有电梯,中楼层(共26层),仁恒河滨城(二期),2,3室2厅,1630.0,浦东,精装,161.94,1995.0
3,南,有电梯,高楼层(共30层),财富海景花园,3,3室2厅,2000.0,浦东,精装,185.0,1995.0
4,东,有电梯,中楼层(共26层),仁恒滨江园,4,3室2厅,1360.0,浦东,精装,130.41,1995.0


In [55]:
# 1. 观察结构，调整列索引顺序
lianjia_df[['Region', 'Garden', 'Layout', 'Floor', 'Id', "Year", 'Size', 'Elevator', 
            'Direction', 'Renovation', 'Price']].head()# 提取前5行

Unnamed: 0,Region,Garden,Layout,Floor,Id,Year,Size,Elevator,Direction,Renovation,Price
0,浦东,翠竹苑,3室1厅,低楼层(共6层),0,1995.0,77.84,无电梯,南北,简装,365.0
1,徐汇,百汇园,3室2厅,低楼层(共36层),1,1995.0,145.2,有电梯,南,精装,1449.0
2,浦东,仁恒河滨城(二期),3室2厅,中楼层(共26层),2,1995.0,161.94,有电梯,南北,精装,1630.0
3,浦东,财富海景花园,3室2厅,高楼层(共30层),3,1995.0,185.0,有电梯,南,精装,2000.0
4,浦东,仁恒滨江园,3室2厅,中楼层(共26层),4,1995.0,130.41,有电梯,东,精装,1360.0


In [56]:
# 2. 增加一列关于目前状况（State，可以使用1代表售出，0代表未售出，注意是随机的！！！）
import random
s = [0, 1]
column_count = lianjia_df.shape[0]# 这里可以计算出Values列的数量（长度）
lianjia_df['State'] = [random.choice(s) for i in range(column_count)]# 这里理解一下，其实就是个for循环对于列的数量
lianjia_df.head()

Unnamed: 0,Direction,Elevator,Floor,Garden,Id,Layout,Price,Region,Renovation,Size,Year,State
0,南北,无电梯,低楼层(共6层),翠竹苑,0,3室1厅,365.0,浦东,简装,77.84,1995.0,0
1,南,有电梯,低楼层(共36层),百汇园,1,3室2厅,1449.0,徐汇,精装,145.2,1995.0,0
2,南北,有电梯,中楼层(共26层),仁恒河滨城(二期),2,3室2厅,1630.0,浦东,精装,161.94,1995.0,1
3,南,有电梯,高楼层(共30层),财富海景花园,3,3室2厅,2000.0,浦东,精装,185.0,1995.0,1
4,东,有电梯,中楼层(共26层),仁恒滨江园,4,3室2厅,1360.0,浦东,精装,130.41,1995.0,1


In [57]:
# 3. 删除Id这一列无用的数据
lianjia_df.drop(['Id'], axis=1, inplace=True)
lianjia_df.head()

Unnamed: 0,Direction,Elevator,Floor,Garden,Layout,Price,Region,Renovation,Size,Year,State
0,南北,无电梯,低楼层(共6层),翠竹苑,3室1厅,365.0,浦东,简装,77.84,1995.0,0
1,南,有电梯,低楼层(共36层),百汇园,3室2厅,1449.0,徐汇,精装,145.2,1995.0,0
2,南北,有电梯,中楼层(共26层),仁恒河滨城(二期),3室2厅,1630.0,浦东,精装,161.94,1995.0,1
3,南,有电梯,高楼层(共30层),财富海景花园,3室2厅,2000.0,浦东,精装,185.0,1995.0,1
4,东,有电梯,中楼层(共26层),仁恒滨江园,3室2厅,1360.0,浦东,精装,130.41,1995.0,1


In [58]:
# 4. 查找楼层小于7的所有房屋 (这里是提取低楼层，用的函数)
def extract_low_floor(floors):
    low_floors = []# 列表里对应最低楼层的位置标号是True，其余是False，最后通过此列表里的真假来进行切片
    for floor in floors:
        if "低楼层" in floor:
            low_floors.append(True)
        else:
            low_floors.append(False)
    return low_floors
low_floor_lianjia_df = lianjia_df[extract_low_floor(lianjia_df['Floor'])]
low_floor_lianjia_df.head()

Unnamed: 0,Direction,Elevator,Floor,Garden,Layout,Price,Region,Renovation,Size,Year,State
0,南北,无电梯,低楼层(共6层),翠竹苑,3室1厅,365.0,浦东,简装,77.84,1995.0,0
1,南,有电梯,低楼层(共36层),百汇园,3室2厅,1449.0,徐汇,精装,145.2,1995.0,0
9,南北,无电梯,低楼层(共6层),金水苑,3室2厅,240.0,奉贤,简装,133.62,1995.0,1
14,南,有电梯,低楼层(共11层),万科城花新园,3室2厅,860.0,闵行,精装,129.73,1995.0,0
15,南,有电梯,低楼层(共18层),鸿凯湾绿苑,3室2厅,1090.0,长宁,简装,120.76,1995.0,1


In [59]:
# 5. 修改楼层小于7的房子电梯状态都是无电梯
low_floor_lianjia_df.ix[:, 'Elevator'] = '无电梯'
low_floor_lianjia_df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Direction,Elevator,Floor,Garden,Layout,Price,Region,Renovation,Size,Year,State
0,南北,无电梯,低楼层(共6层),翠竹苑,3室1厅,365.0,浦东,简装,77.84,1995.0,0
1,南,无电梯,低楼层(共36层),百汇园,3室2厅,1449.0,徐汇,精装,145.2,1995.0,0
9,南北,无电梯,低楼层(共6层),金水苑,3室2厅,240.0,奉贤,简装,133.62,1995.0,1
14,南,无电梯,低楼层(共11层),万科城花新园,3室2厅,860.0,闵行,精装,129.73,1995.0,0
15,南,无电梯,低楼层(共18层),鸿凯湾绿苑,3室2厅,1090.0,长宁,简装,120.76,1995.0,1
16,南,无电梯,低楼层(共18层),荣域飘鹰锦和花园(公寓),2室2厅,315.0,宝山,毛坯,90.5,1995.0,1
17,南,无电梯,低楼层(共9层),恒大华城天地苑,3室2厅,768.0,浦东,简装,115.67,1995.0,1
19,南,无电梯,低楼层(共35层),浦东世纪花园(公寓),3室2厅,1400.0,浦东,精装,179.52,1995.0,1
20,南,无电梯,低楼层(共18层),星颂家园,2室2厅,265.0,浦东,精装,72.64,1995.0,0
21,南,无电梯,低楼层(共11层),莲浦府邸(公寓),3室2厅,975.0,闵行,精装,141.21,1995.0,0
