In [1]:
# pivot_table reshapes the data so it can be examined along different dimensions;
# aggfunc is the function applied to each group of values during that reshaping.
import pandas as pd
import numpy as np

In [2]:
# Sample grade book: one row per (student, subject) result.
# class is kept as a string so the leading zeros in '001' / '002' survive.
df = pd.DataFrame(
    data=[
        ('小红', '001', 'C', 80),
        ('小红', '001', 'Java', 90),
        ('李华', '001', 'Python', 78),
        ('李华', '001', 'C', 90),
        ('小天', '002', 'C', 80),
        ('小天', '002', 'Python', 78),
    ],
    columns=['student', 'class', 'subject', 'grades'],
)

In [3]:
# Display the sample frame that every later cell pivots.
df

Unnamed: 0,student,class,subject,grades
0,小红,1,C,80
1,小红,1,Java,90
2,李华,1,Python,78
3,李华,1,C,90
4,小天,2,C,80
5,小天,2,Python,78


In [4]:
# Show the signature and documentation of pivot_table.
# help() is plain Python, so this cell also runs outside IPython (for example
# after exporting the notebook to a .py script), unlike the `?obj` shortcut.
help(pd.DataFrame.pivot_table)

In [5]:
# Average grade for each subject.
# index="subject" groups the rows by subject and aggfunc="mean" averages each group.
# values="grades" restricts the aggregation to the numeric column: student and
# class hold strings, and pandas >= 2.0 raises a TypeError for columns that
# cannot be averaged instead of silently dropping them.
df.pivot_table(index="subject", values="grades", aggfunc="mean")

Unnamed: 0_level_0,grades
subject,Unnamed: 1_level_1
C,83.333333
Java,90.0
Python,78.0


In [6]:
# Same per-subject average, laid out horizontally: with only columns= given,
# the subjects become the column labels and "grades" is the single row label.
# values="grades" keeps the string columns (student, class) out of the default
# mean aggregation, which would otherwise fail on pandas >= 2.0.
df.pivot_table(columns="subject", values="grades")

subject,C,Java,Python
grades,83.333333,90.0,78.0


In [7]:
# Conclusions so far:
# - index supplies the row (vertical) labels
# - columns supplies the column (horizontal) labels
# - what is finally observed in the cells is the aggregated values
# Open question: why does the result show grades, and not student or class?

In [8]:
# Inspect the dtype of every column of the sample frame.
df.dtypes

student    object
class      object
subject    object
grades      int64
dtype: object

In [9]:
# Probe what pivot_table hands to aggfunc: return the type of the received
# object instead of aggregating it.
df.pivot_table(
    index="subject",
    aggfunc=lambda chunk: type(chunk),
)

Unnamed: 0_level_0,class,grades,student
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>
Java,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>
Python,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>,<class 'pandas.core.frame.DataFrame'>


In [10]:
# Collect the raw values of every group with tolist(), so that each cell of
# the result shows exactly which values would be aggregated.
df.pivot_table(
    index="subject",
    aggfunc=lambda chunk: chunk.tolist(),
)

Unnamed: 0_level_0,class,grades,student
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,"[001, 001, 002]","[80, 90, 80]","[小红, 李华, 小天]"
Java,[001],[90],[小红]
Python,"[001, 002]","[78, 78]","[李华, 小天]"


In [11]:
# Average grade for each class (aggfunc defaults to "mean").
# values="grades" limits the aggregation to the numeric column; the string
# columns student and subject cannot be averaged and make pandas >= 2.0 raise.
df.pivot_table(index="class", values="grades")

Unnamed: 0_level_0,grades
class,Unnamed: 1_level_1
1,84.5
2,79.0


In [12]:
# reset_index() turns the class index back into an ordinary column and
# restores the default 0..n-1 integer index.
# values="grades" limits the mean to the numeric column (required on pandas >= 2.0,
# where string columns are no longer silently dropped).
df.pivot_table(index="class", values="grades").reset_index()

Unnamed: 0,class,grades
0,1,84.5
1,2,79.0


In [13]:
# drop=True discards the class index entirely instead of keeping it as a
# column, leaving only the averaged grades under a fresh integer index.
# values="grades" limits the mean to the numeric column (required on pandas >= 2.0,
# where string columns are no longer silently dropped).
df.pivot_table(index="class", values="grades").reset_index(drop=True)

Unnamed: 0,grades
0,84.5
1,79.0


In [14]:
# Average grade and number of distinct students for each class.
# A dict aggfunc applies a different aggregation to each listed column.
# String names are used because pandas >= 2.1 emits a FutureWarning when a
# NumPy callable such as np.mean is passed as an aggregation function.
# "nunique" is the same as lambda x: x.nunique(), which in turn equals
# len(x.unique()) as long as the column contains no missing values.
df.pivot_table(index="class", aggfunc={"grades": "mean", "student": "nunique"})

Unnamed: 0_level_0,grades,student
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,84.5,2
2,79.0,1


In [15]:
# Same statistics, counting distinct students by hand with len(x.unique())
# to show that it matches nunique() on this data.
# "mean" replaces np.mean, which triggers a FutureWarning on pandas >= 2.1.
df.pivot_table(
    index="class",
    aggfunc={"grades": "mean", "student": lambda x: len(x.unique())},
)

Unnamed: 0_level_0,grades,student
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,84.5,2
2,79.0,1


In [18]:
# (3) Average grade for each class and each subject.
# Rows are grouped by class, columns by subject.
# values=["grades"] (a list) keeps the two-level column header (grades, subject)
# while excluding the string column student, which cannot be averaged and
# makes pandas >= 2.0 raise.
df.pivot_table(index="class", columns="subject", values=["grades"])

Unnamed: 0_level_0,grades,grades,grades
subject,C,Java,Python
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,85.0,90.0,78.0
2,80.0,,78.0


In [20]:
# (4) Highest grade for each class and subject, with missing combinations shown as 0.
# fill_value has no default replacement (missing cells stay NaN unless it is
# set); fill_value=0 is what turns them into 0 here.
# aggfunc="max" replaces the builtin max, which triggers a FutureWarning when
# passed as an aggregation function on pandas >= 2.1.
df.pivot_table(index="class", columns="subject", values="grades", aggfunc="max", fill_value=0)

subject,C,Java,Python
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,90,90,78
2,80,0,78


In [21]:
# (5) Number of grade records for each class and subject; class/subject pairs
# without any record are filled with 0.
df.pivot_table(
    values="grades",
    index="class",
    columns="subject",
    aggfunc="count",
    fill_value=0,
)



subject,C,Java,Python
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,1,1
2,1,0,1


In [23]:
# (6) Highest, lowest and average grade for each student.
# A list aggfunc produces one column group per aggregation.
# String names give the same column labels (max, min, mean) and avoid the
# FutureWarning pandas >= 2.1 emits for builtin / NumPy callables.
df.pivot_table(index="student", values="grades", aggfunc=["max", "min", "mean"])

Unnamed: 0_level_0,max,min,mean
Unnamed: 0_level_1,grades,grades,grades
student,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
小天,80,78,79
小红,90,80,85
李华,90,78,84


In [27]:
# (7) Highest, lowest and average grade of each student in every subject;
# subjects a student has no grade for are filled with 0.
# String names give the same column labels (max, min, mean) and avoid the
# FutureWarning pandas >= 2.1 emits for builtin / NumPy callables.
df.pivot_table(
    index="student",
    columns="subject",
    values="grades",
    aggfunc=["max", "min", "mean"],
    fill_value=0,
)


Unnamed: 0_level_0,max,max,max,min,min,min,mean,mean,mean
subject,C,Java,Python,C,Java,Python,C,Java,Python
student,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
小天,80,0,78,80,0,78,80,0,78
小红,80,90,0,80,90,0,80,90,0
李华,90,0,78,90,0,78,90,0,78


In [28]:
# stack() moves the innermost column level into the row index (columns -> rows):
# the "grades" label leaves the column header and becomes a second index level
# next to student, so max / min / mean remain as plain columns.
# String aggregation names avoid the FutureWarning pandas >= 2.1 emits for
# builtin / NumPy callables.
df.pivot_table(index="student", values="grades", aggfunc=["max", "min", "mean"]).stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,mean
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
小天,grades,80,78,79
小红,grades,90,80,85
李华,grades,90,78,84


In [31]:
# reset_index(level=1, drop=True) removes index level 1 (the "grades" label
# introduced by stack()) and discards it, leaving student as the only index.
# String aggregation names avoid the FutureWarning pandas >= 2.1 emits for
# builtin / NumPy callables.
(
    df.pivot_table(index="student", values="grades", aggfunc=["max", "min", "mean"])
    .stack()
    .reset_index(level=1, drop=True)
)

Unnamed: 0_level_0,max,min,mean
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
小天,80,78,79
小红,90,80,85
李华,90,78,84
