In [1]:
import numpy as np
import pandas as pd 
from pandas import Series, DataFrame

### 删除重复行数据

In [2]:
# Build a sample log table.
# Columns: log ID, log level, logger name, log message.
# Rows: one per log date, labelled '2019-04-01' through '2019-04-10'.
logs = DataFrame(
    {
        'ID': np.arange(1, 11),
        'LevelName': np.random.choice(
            ['INFO', 'ERROR', 'WARNNING', 'CRITICAL'], size=10),
        'Name': np.random.choice(['django', 'flask', 'scrapy'], size=10),
        'Message': np.random.choice(
            ['登录', '下订单', '支付', '积分', '秒杀', '验证码'], size=10),
    },
    # Zero-padded day numbers give the date-like string labels.
    index=['2019-04-%02d' % day for day in range(1, 11)],
)
logs

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


In [3]:
logs.loc['2019-04-11'] = logs.loc['2019-04-10']
logs

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


In [4]:
# Find duplicated rows.
# keep = {'first', 'last'} selects which occurrence within a set of identical
# rows is NOT flagged as a duplicate (i.e. the one that would be kept):
# 'last' leaves the last occurrence unflagged, 'first' leaves the first one.
logs.duplicated(keep='last')

2019-04-01    False
2019-04-02    False
2019-04-03    False
2019-04-04    False
2019-04-05    False
2019-04-06    False
2019-04-07    False
2019-04-08    False
2019-04-09    False
2019-04-10     True
2019-04-11    False
dtype: bool

In [5]:
# 获取哪些行值为True的行数据
logs.loc[logs.duplicated(keep='last')]

Unnamed: 0,ID,LevelName,Name,Message
2019-04-10,10,INFO,scrapy,下订单


In [6]:
logs.drop(logs[logs.duplicated(keep='last')].index,
          axis=0)

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-11,10,INFO,scrapy,下订单


In [7]:
# Remove duplicated rows in a single call.
# inplace=True modifies logs itself; inplace=False would return a modified copy.
logs.drop_duplicates(keep='first', inplace=True) # keep='first' (the default) retains the first row of each duplicate set
logs

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


In [8]:
# Different ways to use drop() to remove the LevelName / ID columns and/or
# the 2019-04-05 / 2019-04-08 rows. The alternatives are kept commented out;
# only the last call runs, and it removes the two rows (axis=0).
# logs.drop(index='2019-04-05', columns='LevelName')
# logs.drop(index=['2019-04-05', '2019-04-08'],
#           columns=['ID', 'LevelName'])

# logs.drop(index=['2019-04-05', '2019-04-08'])
# logs.drop(columns=['ID', 'LevelName'])
# logs.drop(['ID', 'LevelName'], axis=1)
logs.drop(['2019-04-05', '2019-04-08'], axis=0)

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


### 映射操作
- replace(olds, news) 将 olds的数据替换成news的数据
- map() 根据指定的列映射出相应的数据，如根据成绩，列出等级
- rename() 重命名

In [9]:
logs

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,ERROR,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,ERROR,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


In [10]:
# Replace every 'ERROR' value in the table with '错误'.
# This call handles 'ERROR' only; 'INFO' is left unchanged here.
logs.replace(to_replace='ERROR', value='错误')

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,INFO,scrapy,验证码
2019-04-03,3,错误,flask,积分
2019-04-04,4,INFO,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,错误,flask,秒杀
2019-04-08,8,INFO,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,INFO,scrapy,下订单


In [11]:
logs.replace({
    'ERROR': '错误',
    'INFO': '正常'
})

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,正常,scrapy,验证码
2019-04-03,3,错误,flask,积分
2019-04-04,4,正常,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,错误,flask,秒杀
2019-04-08,8,正常,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,正常,scrapy,下订单


In [12]:
logs.replace(['ERROR','INFO'], ['错误', '正常'])

Unnamed: 0,ID,LevelName,Name,Message
2019-04-01,1,CRITICAL,django,秒杀
2019-04-02,2,正常,scrapy,验证码
2019-04-03,3,错误,flask,积分
2019-04-04,4,正常,scrapy,支付
2019-04-05,5,CRITICAL,scrapy,验证码
2019-04-06,6,CRITICAL,flask,积分
2019-04-07,7,错误,flask,秒杀
2019-04-08,8,正常,scrapy,支付
2019-04-09,9,WARNNING,scrapy,支付
2019-04-10,10,正常,scrapy,下订单


In [13]:
# 获取Name列值的Series,
# 再通过Series对象的map()函数，将每一行对应的数值映射成对应的值
logs['version'] = logs['Name'].map({
    'scrapy': 1.1,
    'django': 2.0,
    'flask': 1.2
})
logs

Unnamed: 0,ID,LevelName,Name,Message,version
2019-04-01,1,CRITICAL,django,秒杀,2.0
2019-04-02,2,INFO,scrapy,验证码,1.1
2019-04-03,3,ERROR,flask,积分,1.2
2019-04-04,4,INFO,scrapy,支付,1.1
2019-04-05,5,CRITICAL,scrapy,验证码,1.1
2019-04-06,6,CRITICAL,flask,积分,1.2
2019-04-07,7,ERROR,flask,秒杀,1.2
2019-04-08,8,INFO,scrapy,支付,1.1
2019-04-09,9,WARNNING,scrapy,支付,1.1
2019-04-10,10,INFO,scrapy,下订单,1.1


In [14]:
# Turn the row index into a regular column and name that column 'Date'.
# reset_index() is called without drop=True, so the old index is kept as the
# 'index' column (drop=True would discard it and only generate a fresh
# integer index); rename() then relabels 'index' as 'Date'.
logs_2 = logs.reset_index().rename(columns={'index': 'Date'})
logs_2

Unnamed: 0,Date,ID,LevelName,Name,Message,version
0,2019-04-01,1,CRITICAL,django,秒杀,2.0
1,2019-04-02,2,INFO,scrapy,验证码,1.1
2,2019-04-03,3,ERROR,flask,积分,1.2
3,2019-04-04,4,INFO,scrapy,支付,1.1
4,2019-04-05,5,CRITICAL,scrapy,验证码,1.1
5,2019-04-06,6,CRITICAL,flask,积分,1.2
6,2019-04-07,7,ERROR,flask,秒杀,1.2
7,2019-04-08,8,INFO,scrapy,支付,1.1
8,2019-04-09,9,WARNNING,scrapy,支付,1.1
9,2019-04-10,10,INFO,scrapy,下订单,1.1


In [15]:
# 查看Date的类型
logs_2['Date'].dtype

dtype('O')

In [16]:
logs_2.dtypes  # 查看所有列的类型

Date          object
ID             int64
LevelName     object
Name          object
Message       object
version      float64
dtype: object

In [17]:
# 修改version的数据类型 float64 为 string类型
# map()中指定类型转换的函数，将version的每一行的数据经过转换函数转成指定的类型
logs_2['version'] = logs_2['version'].map(str)

In [18]:
logs_2.dtypes

Date         object
ID            int64
LevelName    object
Name         object
Message      object
version      object
dtype: object

In [19]:
# 修改Date列的类型string为datetime类型
from datetime import datetime

def convert_dt(item):
    """Parse a 'YYYY-MM-DD' date string into a datetime object.

    item: one cell value of the column being mapped, a str such as
          '2019-04-01'.
    Returns the datetime produced by datetime.strptime for that string.
    """
    date_format = '%Y-%m-%d'
    parsed = datetime.strptime(item, date_format)
    return parsed

In [20]:
logs_2['Date'] = logs_2['Date'].map(convert_dt)
logs_2.dtypes

Date         datetime64[ns]
ID                    int64
LevelName            object
Name                 object
Message              object
version              object
dtype: object

In [21]:
# 将Date列的数据转成 月和日的字符串
logs_2['Date'] = logs_2['Date'].map(lambda item: item.strftime('%m-%d'))
logs_2

Unnamed: 0,Date,ID,LevelName,Name,Message,version
0,04-01,1,CRITICAL,django,秒杀,2.0
1,04-02,2,INFO,scrapy,验证码,1.1
2,04-03,3,ERROR,flask,积分,1.2
3,04-04,4,INFO,scrapy,支付,1.1
4,04-05,5,CRITICAL,scrapy,验证码,1.1
5,04-06,6,CRITICAL,flask,积分,1.2
6,04-07,7,ERROR,flask,秒杀,1.2
7,04-08,8,INFO,scrapy,支付,1.1
8,04-09,9,WARNNING,scrapy,支付,1.1
9,04-10,10,INFO,scrapy,下订单,1.1


In [22]:
# 创建五位同学的三个课程的成绩表
# 五位同学的姓名分别是A, B, C, D, E
# 三门课程分别是 Python, Java, H5
scores = DataFrame(np.random.randint(40, 100, size=(5, 3)),
                  index=list('ABCDE'),
                  columns=['Python', 'Java', 'H5'])
scores

Unnamed: 0,Python,Java,H5
A,67,79,41
B,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


In [23]:
# Derive a 'level' grade from a Python score:
#   score >= 90       -> 'A'
#   75 <= score < 90  -> 'B'
#   60 <= score < 75  -> 'C'
#   score < 60        -> 'D'
def level_convert(item):
    """Map a numeric score to a letter grade 'A', 'B', 'C' or 'D'."""
    # Thresholds are checked from highest to lowest; the first one reached wins.
    for threshold, grade in ((90, 'A'), (75, 'B'), (60, 'C')):
        if item >= threshold:
            return grade
    return 'D'

scores['level'] = scores['Python'].map(level_convert)
scores

Unnamed: 0,Python,Java,H5,level
A,67,79,41,C
B,49,57,89,D
C,75,82,69,B
D,59,83,49,D
E,96,41,94,A


In [24]:
# 删除level列
scores.drop(columns='level', inplace=True)
scores

Unnamed: 0,Python,Java,H5
A,67,79,41
B,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


#### 扩展使用 transform()
- 功能同 map() 函数：将某一行的值转换成另一个值
- 不同的是，map()函数支持dict映射， 而transform()函数不支持dict映射

In [25]:
scores['level'] = scores['Python'].transform(level_convert)
scores

Unnamed: 0,Python,Java,H5,level
A,67,79,41,C
B,49,57,89,D
C,75,82,69,B
D,59,83,49,D
E,96,41,94,A


思考： 如果拿出三个课程（三列），如何进行映射？

In [26]:
def total_items(item):
    """Debugging helper: show the received object, then return it unchanged.

    Useful for inspecting what a mapping/transform call passes to its
    function. Relies on `display`, presumably the IPython/Jupyter display
    function available in a notebook session -- it is not defined in this file.
    """
    display(item)
    return item

In [27]:
# Compute each student's total over the three courses and store it in a new
# 'total' column.
# Only the three score columns are summed. At this point `scores` also holds
# the string-valued 'level' column; summing it together with the integer
# scores raises a TypeError on pandas >= 2.0 (older versions silently dropped
# it). Selecting the columns explicitly also makes the cell safe to re-run:
# an existing 'total' column is not folded into the new sum.
# Experiment kept for reference -- transform() applied to several columns:
# scores['total'] = scores[['Python', 'Java', 'H5']].transform(total_items)
scores['total'] = scores[['Python', 'Java', 'H5']].sum(axis=1)
scores

Unnamed: 0,Python,Java,H5,level,total
A,67,79,41,C,187
B,49,57,89,D,195
C,75,82,69,B,226
D,59,83,49,D,191
E,96,41,94,A,231


练习：
新增两列，分别为张三、李四的成绩状态，如果分数低于90，则为"failed"，如果分数高于120，则为"excellent"，其他则为"pass"
【提示】使用函数作为map的参数

In [28]:
scores2 = DataFrame({
    '张三': [50, 99, 130],
    '李四': [75, 150, 100],
    '王五': [90, 79, 110]
}, index=['Python', 'Flask', 'Scrapy'])
scores2

Unnamed: 0,张三,李四,王五
Python,50,75,90
Flask,99,150,79
Scrapy,130,100,110


In [29]:
def level_func(item):
    """Classify a score: 'failed' below 90, 'excellent' above 120, else 'pass'."""
    if item < 90:
        return 'failed'
    if item > 120:
        return 'excellent'
    return 'pass'
scores2['level_z'] = scores2['张三'].map(level_func)
scores2['level_l'] = scores2['李四'].map(level_func)
scores2

Unnamed: 0,张三,李四,王五,level_z,level_l
Python,50,75,90,failed,failed
Flask,99,150,79,pass,excellent
Scrapy,130,100,110,excellent,pass


In [30]:
DataFrame(scores2.unstack()).unstack()

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,Python,Flask,Scrapy
张三,50,99,130
李四,75,150,100
王五,90,79,110
level_z,failed,pass,excellent
level_l,failed,excellent,pass


In [31]:
scores2.transpose()

Unnamed: 0,Python,Flask,Scrapy
张三,50,99,130
李四,75,150,100
王五,90,79,110
level_z,failed,pass,excellent
level_l,failed,excellent,pass


### rename() 替换索引标签
- 行索引标签,  index
- 列索引标签 , columns
- 多层索引标签， level属性

In [33]:
scores.drop(['level', 'total'], axis=1, inplace=True)
scores

Unnamed: 0,Python,Java,H5
A,67,79,41
B,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


In [34]:
# 修改 A的行标签为disen, B标签为jack
scores.rename(index={
    'A':'disen',
    'B': 'jack'
})

Unnamed: 0,Python,Java,H5
disen,67,79,41
jack,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


In [35]:
# 修改 Python为Py3, H5为HTML5
scores.rename(columns={
    'Python': 'Py3',
    'H5': 'HTML5'
})

Unnamed: 0,Py3,Java,HTML5
A,67,79,41
B,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


In [37]:
s2 = pd.concat((scores, scores), axis=1, keys=('上学期', '下学期'))
s2

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,Python,Java,H5,Python,Java,H5
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


In [41]:
# 修改s2表中的上学期为"期中"， 下学期为"期末"
s2.rename({
    '上学期': '期中',
    '下学期': '期末'
}, axis=1, level=0)

Unnamed: 0_level_0,期中,期中,期中,期末,期末,期末
Unnamed: 0_level_1,Python,Java,H5,Python,Java,H5
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


思考： 如何修改上学期的Python为Py3

<font color=red>注： rename() 只能按标签整体修改（所有同名的列标签都会被替换），不能只修改某一列</font>

In [42]:
s2.rename(columns={'Python': 'Py3'}) # 整体修改

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,Py3,Java,H5,Py3,Java,H5
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


In [53]:
# 获取Series，并修改name, 部分修改，结果没有修改成功
s2[('上学期', 'Python')].name = ('上学期', 'Py3') 
s2

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,Python,Java,H5,Python,Java,H5
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


In [54]:
s2.rename(columns={('上学期', 'Python'): ('上学期', 'Py3')})

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,Python,Java,H5,Python,Java,H5
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


In [55]:
p3 = s2[('上学期', 'Python')]
p3.name = ('上学期', 'Py3')
p3.name

('上学期', 'Py3')

In [45]:
s2.rename_axis('id') # 修改列标签或行标签的名称， columns或index对象的名称

Unnamed: 0_level_0,上学期,上学期,上学期,下学期,下学期,下学期
Unnamed: 0_level_1,Python,Java,H5,Python,Java,H5
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,67,79,41,67,79,41
B,49,57,89,49,57,89
C,75,82,69,75,82,69
D,59,83,49,59,83,49
E,96,41,94,96,41,94


In [49]:
s2.columns

MultiIndex(levels=[['上学期', '下学期'], ['Python', 'Java', 'H5']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

### 异常值检测和过滤
- describe()
- any() 和 std()

In [60]:
scores.describe() # 统计每一列的数据分布

Unnamed: 0,Python,Java,H5
count,5.0,5.0,5.0
mean,69.2,68.4,68.4
std,17.810109,18.649397,23.490424
min,49.0,41.0,41.0
25%,59.0,57.0,49.0
50%,67.0,79.0,69.0
75%,75.0,82.0,89.0
max,96.0,83.0,94.0


In [63]:
scores.std(axis=1) # 查看每行数据的标准差，查看每人的各科成绩的分布情况

A    19.425070
B    21.166010
C     6.506407
D    17.473790
E    31.192948
dtype: float64

In [64]:
scores

Unnamed: 0,Python,Java,H5
A,67,79,41
B,49,57,89
C,75,82,69
D,59,83,49
E,96,41,94


In [68]:
# 指定百分比显示的段
scores.describe(percentiles=[.65, .85]) # 默认 .5是必须存在的

Unnamed: 0,Python,Java,H5
count,5.0,5.0,5.0
mean,69.2,68.4,68.4
std,17.810109,18.649397,23.490424
min,49.0,41.0,41.0
50%,67.0,79.0,69.0
65%,71.8,80.8,81.0
85%,83.4,82.4,91.0
max,96.0,83.0,94.0


In [69]:
scores.dtypes

Python    int64
Java      int64
H5        int64
dtype: object

In [72]:
scores.loc['A', 'Java'] = None
scores

Unnamed: 0,Python,Java,H5
A,67,,41
B,49,57.0,89
C,75,82.0,69
D,59,83.0,49
E,96,41.0,94


In [74]:
scores.dtypes

Python      int64
Java      float64
H5          int64
dtype: object

In [73]:
scores.describe(include=np.int64) # 指定统计计算的数据类型

Unnamed: 0,Python,H5
count,5.0,5.0
mean,69.2,68.4
std,17.810109,23.490424
min,49.0,41.0
25%,59.0,49.0
50%,67.0,69.0
75%,75.0,89.0
max,96.0,94.0


In [76]:
scores.any(axis=1)

A    True
B    True
C    True
D    True
E    True
dtype: bool

练习：
新建一个形状为10000*3的标准正态分布的DataFrame(np.random.randn)，去除掉所有满足以下情况的行：其中任一元素绝对值大于3倍标准差

In [80]:
d = DataFrame(np.random.randn(100, 3), columns=('A', 'B', 'C'))
d

Unnamed: 0,A,B,C
0,-1.785916,1.106939,-0.066979
1,0.324417,-0.947283,0.806774
2,-1.211161,2.198465,0.948723
3,0.181556,-0.006479,0.221150
4,-0.554065,0.161081,-0.450443
5,0.463362,-1.602237,0.705987
6,1.373713,-0.953390,0.996821
7,-1.025239,-0.543391,-0.609792
8,-0.723522,-0.956526,0.199162
9,1.089611,-0.135419,0.393518


In [95]:
drop_index = (np.abs(d) > 3*d.std(axis=0)).any(axis=1)
drop_index

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
      ...  
70    False
71    False
72    False
73    False
74    False
75    False
76    False
77    False
78    False
79    False
80    False
81    False
82    False
83    False
84    False
85    False
86    False
87    False
88    False
89    False
90    False
91    False
92    False
93    False
94    False
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [98]:
d.loc[drop_index] # 要删除的行数据

Unnamed: 0,A,B,C
57,-1.499478,-3.412525,0.230502


In [97]:
d.drop(index=d.loc[drop_index].index)

Unnamed: 0,A,B,C
0,-1.785916,1.106939,-0.066979
1,0.324417,-0.947283,0.806774
2,-1.211161,2.198465,0.948723
3,0.181556,-0.006479,0.221150
4,-0.554065,0.161081,-0.450443
5,0.463362,-1.602237,0.705987
6,1.373713,-0.953390,0.996821
7,-1.025239,-0.543391,-0.609792
8,-0.723522,-0.956526,0.199162
9,1.089611,-0.135419,0.393518


In [100]:
dd = DataFrame(np.random.randint(1, 10, size=(5, 3)), 
              columns=('A', 'B', 'C'))
dd

Unnamed: 0,A,B,C
0,4,9,6
1,7,3,7
2,7,1,6
3,1,4,6
4,9,7,8


In [102]:
3*dd.std(axis=1)  # 返回一个series, 行标签是dd的行标签

0    7.549834
1    6.928203
2    9.643651
3    7.549834
4    3.000000
dtype: float64

In [109]:
# Use DataFrame's comparison method gt(other, axis=0), which aligns the
# per-row threshold Series with dd's row labels.
# Drop from dd every row in which ANY element is greater than 3 times the
# standard deviation of that row's values.
dd.drop(index=dd[dd.gt(3*dd.std(axis=1), axis=0)
                   .any(axis=1)].index)

Unnamed: 0,A,B,C
2,7,1,6
3,1,4,6


### 排列
- df.take() 按索引位置提取行或列的数据

In [112]:
# axis=0 按行索引
dd.take([0, 3, 4],axis=0) # 获取指定索引的数据

Unnamed: 0,A,B,C
0,4,9,6
3,1,4,6
4,9,7,8


In [114]:
# axis=1 表示按列索引位置获取数据
dd.take([2, 0], axis=1)

Unnamed: 0,C,A
0,6,4
1,7,7
2,6,7
3,6,1
4,8,9


In [116]:
np.random.permutation(dd.index)  # 行索引随机排列

array([1, 4, 0, 2, 3])

In [118]:
# Shuffle the rows. take() selects by integer POSITION, so permute the
# positions 0..len(dd)-1 instead of the index labels; labels only coincide
# with positions when the frame has a default RangeIndex.
dd.take(np.random.permutation(len(dd)))

Unnamed: 0,A,B,C
2,7,1,6
3,1,4,6
4,9,7,8
0,4,9,6
1,7,3,7


In [126]:
dd[np.random.permutation(dd.columns)]

Unnamed: 0,B,A,C
0,9,4,6
1,3,7,7
2,1,7,6
3,4,1,6
4,7,9,8


### 数据分类处理
- groupby()

In [128]:
df = DataFrame({
    'item': np.random.choice(['萝卜','白菜','辣椒','冬瓜'], size=20),
    'color': np.random.choice(['白','青','红'], size=20),
    'weight': np.random.randint(1, 5, size=20),
    'price':  np.round(np.random.uniform(1, 5, size=20), 1)
})
df

Unnamed: 0,item,color,weight,price
0,白菜,青,4,3.6
1,辣椒,红,2,2.5
2,白菜,红,2,3.2
3,冬瓜,红,1,2.7
4,辣椒,青,1,4.1
5,白菜,红,3,4.7
6,冬瓜,红,1,3.4
7,萝卜,红,1,1.6
8,白菜,白,3,3.7
9,辣椒,白,2,4.9


In [133]:
df.groupby('color') # 返回 DataFrameGroupBy类对象

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x10ef906d8>

In [131]:
# 对df进行聚合操作，求出颜色为白色的价格总和
df.groupby('color')['price'].sum() # 返回Series

color
白    24.1
红    26.6
青    13.8
Name: price, dtype: float64

In [132]:
df.groupby('color')['price'].sum()['白']

24.1

In [134]:
# 对df进行聚合操作，求出萝卜的所有重量(包括白萝卜，胡萝卜，青萝卜）以及平均价格
df.groupby('item')['weight'].sum()['萝卜']

3

In [135]:
df.groupby('item')['price'].mean()['萝卜']

2.9

In [145]:
def weight_price(item):
    """Aggregate one column of a group: total for 'weight', mean otherwise.

    item: a Series holding one column of a group; its `name` attribute decides
          the aggregation. Any name other than 'weight' (e.g. 'price') is
          averaged.
    Returns the scalar sum or mean of the Series.
    """
    aggregate = item.sum if item.name == 'weight' else item.mean
    return aggregate()

In [188]:
# weight_price receives each selected column of every 'item' group as a Series.
# transform() broadcasts the per-group result back onto the original rows, so
# the returned frame keeps the same row index as df.
# The columns are selected with a list: the bare-tuple form
# ['weight', 'price'] was deprecated in pandas 1.0 and is an error in pandas 2.x.
df2_w_p = df.groupby('item')[['weight', 'price']].transform(weight_price)
display(df2_w_p)
# Rename the columns to say what they now hold (group total / group mean).
df2_w_p.rename(columns={'weight': 'weight_total',
                        'price': 'price_mean'}, inplace=True)
display(df2_w_p)

Unnamed: 0,weight,price
0,14.0,3.94
1,21.0,2.975
2,14.0,3.94
3,6.0,3.075
4,21.0,2.975
5,14.0,3.94
6,6.0,3.075
7,3.0,2.9
8,14.0,3.94
9,21.0,2.975


Unnamed: 0,weight_total,price_mean
0,14.0,3.94
1,21.0,2.975
2,14.0,3.94
3,6.0,3.075
4,21.0,2.975
5,14.0,3.94
6,6.0,3.075
7,3.0,2.9
8,14.0,3.94
9,21.0,2.975


In [154]:
pd.concat((df, df2_w_p), axis=1)

Unnamed: 0,item,color,weight,price,weight_total,price_mean
0,白菜,青,4,3.6,14.0,3.94
1,辣椒,红,2,2.5,21.0,2.975
2,白菜,红,2,3.2,14.0,3.94
3,冬瓜,红,1,2.7,6.0,3.075
4,辣椒,青,1,4.1,21.0,2.975
5,白菜,红,3,4.7,14.0,3.94
6,冬瓜,红,1,3.4,6.0,3.075
7,萝卜,红,1,1.6,3.0,2.9
8,白菜,白,3,3.7,14.0,3.94
9,辣椒,白,2,4.9,21.0,2.975


In [180]:
def weight_price_2(item):
    """Summarise one group as a single-row DataFrame.

    item: a DataFrame with 'weight' and 'price' columns (one group's rows).
    Returns a DataFrame with columns 'weight_total' (sum of weight) and
    'price_mean' (mean of price rounded to 2 decimals), and one row labelled ''.
    """
    total_weight = item['weight'].sum()
    mean_price = round(item['price'].mean(), 2)
    summary = {'weight_total': total_weight, 'price_mean': mean_price}
    # Scalar values need an explicit index to form a one-row frame.
    return DataFrame(summary, index=[''])

In [186]:
# weight_price_2 returns a one-row DataFrame per group; apply() stacks them
# into a DataFrame whose outer index level is the group key ('item').
# Columns are selected with a list -- tuple-style ['weight', 'price'] was
# deprecated in pandas 1.0 and is an error in pandas 2.x.
df.groupby('item')[['weight', 'price']].apply(weight_price_2)

Unnamed: 0_level_0,Unnamed: 1_level_0,weight_total,price_mean
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
冬瓜,,6,3.07
白菜,,14,3.94
萝卜,,3,2.9
辣椒,,21,2.98


In [187]:
# Same per-group summary, then pick the rows for the '萝卜' group by its
# outer index label. List-based column selection is required on pandas 2.x.
df.groupby('item')[['weight', 'price']].apply(weight_price_2).loc['萝卜']

Unnamed: 0,weight_total,price_mean
,3,2.9
