#  pandas之DataFrame数据帧

In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.seed(123)
data = np.random.randint(1, 100, (6, 4))
row_index = np.arange(2001, 2007).astype(str)
col_index = ['语文', '数学', '英语', '物理']
data

array([[67, 93, 99, 18],
       [84, 58, 87, 98],
       [97, 48, 74, 33],
       [47, 97, 26, 84],
       [79, 37, 97, 81],
       [69, 50, 56, 68]])

In [3]:
# 创建 DateFrame
df1 = pd.DataFrame(
    data,  # 二维数据
    index=row_index,  # 行索引标签
    columns=col_index  # 列标签
)
df1

Unnamed: 0,语文,数学,英语,物理
2001,67,93,99,18
2002,84,58,87,98
2003,97,48,74,33
2004,47,97,26,84
2005,79,37,97,81
2006,69,50,56,68


In [4]:
print('df1的形状',df1.shape)
print('df1的维度',df1.ndim)
print('df1的行索引',df1.index)
print('df1的列名称',df1.columns)
print('df1的值\n',df1.values)
print('df1的数据类型:\n',df1.dtypes)

df1的形状 (6, 4)
df1的维度 2
df1的行索引 Index(['2001', '2002', '2003', '2004', '2005', '2006'], dtype='object')
df1的列名称 Index(['语文', '数学', '英语', '物理'], dtype='object')
df1的值
 [[67 93 99 18]
 [84 58 87 98]
 [97 48 74 33]
 [47 97 26 84]
 [79 37 97 81]
 [69 50 56 68]]
df1的数据类型:
 语文    int32
数学    int32
英语    int32
物理    int32
dtype: object


In [5]:
type(df1.values)

numpy.ndarray

In [6]:
# 增删查改

In [7]:
# 查找数据-单列
df1['语文']
df1.语文
# 查找多列数据
df1[['语文','数学']]

Unnamed: 0,语文,数学
2001,67,93
2002,84,58
2003,97,48
2004,47,97
2005,79,37
2006,69,50


In [8]:
type(df1.语文)

pandas.core.series.Series

In [9]:
df1
# 新增一列 
np.random.seed(123)
df1['化学']=np.random.randint(1,100,6)
df1

Unnamed: 0,语文,数学,英语,物理,化学
2001,67,93,99,18,67
2002,84,58,87,98,93
2003,97,48,74,33,99
2004,47,97,26,84,18
2005,79,37,97,81,84
2006,69,50,56,68,58


In [10]:
# 删除一列
del df1['化学']

In [11]:
df1

Unnamed: 0,语文,数学,英语,物理
2001,67,93,99,18
2002,84,58,87,98
2003,97,48,74,33
2004,47,97,26,84
2005,79,37,97,81
2006,69,50,56,68


In [12]:
# 对行的操作
# df1[开始行索引:结束行索引:步长]
# 前两行
df1[0:2:1]

Unnamed: 0,语文,数学,英语,物理
2001,67,93,99,18
2002,84,58,87,98


In [13]:
df1
# 前两行 前两列 根据索引位置取 前闭后开
df1.iloc[0:2,0:2]

# 前两行 前两列 根据标签名称取 都能取到
df1.loc['2001':'2002','语文':'数学']

Unnamed: 0,语文,数学
2001,67,93
2002,84,58


In [14]:
df1
# 2001和2003    语文和英语
df1.iloc[0:3:2,0:3:2]
df1.loc['2001':'2003':2,'语文':'英语':2]
df1.loc[['2001','2003'],['语文','英语']]

Unnamed: 0,语文,英语
2001,67,99
2003,97,74


In [15]:
df1.iloc[[0,2],[0,2]]

Unnamed: 0,语文,英语
2001,67,99
2003,97,74


In [16]:
# 新增一行
np.random.seed(123)
df1.loc['2007']=np.random.randint(1,100,4)
df1
# 新增一列  计算语文+数学
df1['语数成绩']=df1['语文']+df1['数学']
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,93,99,18,160
2002,84,58,87,98,142
2003,97,48,74,33,145
2004,47,97,26,84,144
2005,79,37,97,81,116
2006,69,50,56,68,119
2007,67,93,99,18,160


In [17]:
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,93,99,18,160
2002,84,58,87,98,142
2003,97,48,74,33,145
2004,47,97,26,84,144
2005,79,37,97,81,116
2006,69,50,56,68,119
2007,67,93,99,18,160


1. 得到2002,2004 英语 物理的成绩数据
2. 所有数学成绩 都加十分
3. 修改2005的英语成绩 为89分

In [18]:
df1
# 1. 得到2002,2004 英语 物理的成绩数据
df1.iloc[[1,3],[2,3]]
df1.loc[['2002','2004'],['英语','物理']]

Unnamed: 0,英语,物理
2002,87,98
2004,26,84


In [19]:
# df1.loc[:,'数学']

In [20]:
# 2. 所有数学成绩 都加十分
df1['数学']=df1['数学']+10           
# df1

In [21]:
# 3. 修改2005的英语成绩 为89分
df1.loc['2005','英语']=89
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142
2003,97,58,74,33,145
2004,47,107,26,84,144
2005,79,47,89,81,116
2006,69,60,56,68,119
2007,67,103,99,18,160


In [22]:
df1.drop(labels='2007',axis=0,inplace=True)   

In [23]:
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142
2003,97,58,74,33,145
2004,47,107,26,84,144
2005,79,47,89,81,116
2006,69,60,56,68,119


In [24]:
# df1['数学']>60

In [25]:
df1
# 得到数学及格的信息
df1[df1['数学']>60]

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142
2004,47,107,26,84,144


得到语文成绩>80 数学>85的数据

In [26]:
# 得到语文成绩>80 数学>85的数据
df1[(df1['语文']>80)&(df1['数学']>85)]

Unnamed: 0,语文,数学,英语,物理,语数成绩


In [27]:
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142
2003,97,58,74,33,145
2004,47,107,26,84,144
2005,79,47,89,81,116
2006,69,60,56,68,119


In [28]:
# 访问前几行 默认是5,括号里面写默认行数 从前往后
df1.head(2)

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142


In [29]:
# 访问后几行 默认是5 括号里面写默认行数 从后往前
df1.tail()

Unnamed: 0,语文,数学,英语,物理,语数成绩
2002,84,68,87,98,142
2003,97,58,74,33,145
2004,47,107,26,84,144
2005,79,47,89,81,116
2006,69,60,56,68,119


In [40]:
df1
# 删除单列 多列 
# inplace:默认false 默认不在源数据上操作
df1.drop(labels='语文',axis=1,inplace=True)

In [41]:
df1

Unnamed: 0,数学,英语,物理,语数成绩
2001,103,99,18,160
2002,68,87,98,142
2003,58,74,33,145
2004,107,26,84,144
2005,47,89,81,116
2006,60,56,68,119


In [32]:
# 删除多列 删除英语 物理 
df1.drop(labels=['英语','物理'],axis=1)

Unnamed: 0,语文,数学,语数成绩
2001,67,103,160
2002,84,68,142
2003,97,58,145
2004,47,107,144
2005,79,47,116
2006,69,60,119


In [33]:
df1

Unnamed: 0,语文,数学,英语,物理,语数成绩
2001,67,103,99,18,160
2002,84,68,87,98,142
2003,97,58,74,33,145
2004,47,107,26,84,144
2005,79,47,89,81,116
2006,69,60,56,68,119


In [83]:
# 排序 sort_index False默认降序 axis=0 行 1代表列
df1.sort_index(axis=1)

Unnamed: 0,数学,物理,英语,语文
2001,40,88,99,62
2002,88,44,87,68
2003,75,87,38,60
2004,106,31,28,99
2005,69,85,89,54
2006,100,88,33,51


In [78]:
df1.sort_values(by='数学',ascending=False)

Unnamed: 0,语文,数学,英语,物理
2004,99,106,28,31
2006,51,100,33,88
2002,68,88,87,44
2003,60,75,38,87
2005,54,69,89,85
2001,62,40,99,88
