### 数据列表(按行填充)：用list嵌套list
- 是一个横向填充的表格，和表格格式一致
- 列名由columns指定，**其长度必须与表格的列数(最长一行的元素个数)一致**
- 索引默认从0开始依次填充(0、1、2……)

In [15]:
import pandas as pd
pd.options.display.max_rows = 10

# Column labels m0..m9, one per column of the table.
colname = [f"m{i}" for i in range(10)]
# Row-wise data: row i holds the ten values i, i+1, ..., i+9.
lst = [[i + j for j in range(10)] for i in range(10)]

# Each inner list becomes one row; `columns` supplies the column labels and
# the index is filled in automatically starting from 0.
df1 = pd.DataFrame(lst, columns=colname)
df1

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,0,1,2,3,4,5,6,7,8,9
1,1,2,3,4,5,6,7,8,9,10
2,2,3,4,5,6,7,8,9,10,11
3,3,4,5,6,7,8,9,10,11,12
4,4,5,6,7,8,9,10,11,12,13
5,5,6,7,8,9,10,11,12,13,14
6,6,7,8,9,10,11,12,13,14,15
7,7,8,9,10,11,12,13,14,15,16
8,8,9,10,11,12,13,14,15,16,17
9,9,10,11,12,13,14,15,16,17,18


## 字典格式(按列填充)：用字典嵌套list
- 是一个纵向填充的表格，一个key对应一列
- 列名为字典的key
- 索引默认填充0、1、2、3

In [7]:
# Column-wise construction: every dict key becomes a column name.
col_values = {
    'name': ['a', 'b', 'c', 'd'],
    'id': [1, 2, 3, 4],   # list values: all lists must have the same length
    'custom': "hello",    # scalar value: repeated automatically on every row
}
df2 = pd.DataFrame(col_values)
df2

Unnamed: 0,name,id,custom
0,a,1,hello
1,b,2,hello
2,c,3,hello
3,d,4,hello


## 基础

In [19]:
def _show(value):
    """Print *value* and its type, followed by a '---' separator line."""
    print(value, type(value), end='\n---\n')


tmpdf = df1.copy()

# shape is a (row_count, column_count) tuple
shp = tmpdf.shape
_show(shp)

# column labels
clms = tmpdf.columns
_show(clms)

# dtype of every column
tps = tmpdf.dtypes
_show(tps)

# index, dtypes and memory usage; info() prints its report itself and
# returns None, so the line printed here reads "None <class 'NoneType'>"
info = tmpdf.info()
_show(info)

# basic descriptive statistics: count, mean, std, min, 25%, median (50%),
# 75% and max; the result can also be rounded
des = tmpdf.describe().round(1)
_show(des)

(10, 10) <class 'tuple'>
---
Index(['m0', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9'], dtype='object') <class 'pandas.core.indexes.base.Index'>
---
m0    int64
m1    int64
m2    int64
m3    int64
m4    int64
m5    int64
m6    int64
m7    int64
m8    int64
m9    int64
dtype: object <class 'pandas.core.series.Series'>
---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   m0      10 non-null     int64
 1   m1      10 non-null     int64
 2   m2      10 non-null     int64
 3   m3      10 non-null     int64
 4   m4      10 non-null     int64
 5   m5      10 non-null     int64
 6   m6      10 non-null     int64
 7   m7      10 non-null     int64
 8   m8      10 non-null     int64
 9   m9      10 non-null     int64
dtypes: int64(10)
memory usage: 928.0 bytes
None <class 'NoneType'>
---
         m0    m1    m2    m3    m4    m5    m6    m7    m8    m9
count  1

## 修改列名
- 默认返回一个新的df，不在原df上修改(inplace=True则在原df修改)

In [20]:
# Rename columns m0 -> x0 and m1 -> x1; rename() returns a new DataFrame.
column_renames = {'m0': 'x0', 'm1': 'x1'}
tmpdf = df1.rename(columns=column_renames)
tmpdf

# Rename index labels 0 -> 'x0' and 1 -> 'x1'; again a new DataFrame.
index_renames = {0: 'x0', 1: 'x1'}
tmpdf = df1.rename(index=index_renames)
tmpdf

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
x0,0,1,2,3,4,5,6,7,8,9
x1,1,2,3,4,5,6,7,8,9,10
2,2,3,4,5,6,7,8,9,10,11
3,3,4,5,6,7,8,9,10,11,12
4,4,5,6,7,8,9,10,11,12,13
5,5,6,7,8,9,10,11,12,13,14
6,6,7,8,9,10,11,12,13,14,15
7,7,8,9,10,11,12,13,14,15,16
8,8,9,10,11,12,13,14,15,16,17
9,9,10,11,12,13,14,15,16,17,18


### pd.Series ：带有索引的list
- 索引为0、1、2、3
- DataFrame的一列就是Series

In [21]:
# A Series built directly from a plain list.
letters = ['a', 'b', 'c', 'c']
s1 = pd.Series(letters)

# A single DataFrame column is a Series; item access and attribute access
# both work here.
s2 = df1['m1']
s2 = df1.m1

print(s1)
print(s2)
print(type(s1), type(s2))

0    a
1    b
2    c
3    c
dtype: object
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: m1, dtype: int64
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


## 列操作

In [None]:
# Select column m1.
m1_column = df1['m1']
print(m1_column, end='\n---\n')

# Overwrite column m1 on a copy, so df1 itself is left untouched.
tmpdf = df1.copy()
tmpdf['m1'] = pd.Series([1] * 10)
tmpdf

: 

## 行操作

In [23]:
separator = '\n---\n'

# Select the row whose index label is 4.
row_4 = df1.loc[4]
print(row_4, end=separator)

# Overwrite the row labelled 4 on a copy: a scalar fills every column.
tmpdf = df1.copy()
tmpdf.loc[4] = 111
print(tmpdf, end=separator)

# Same assignment, but from a dict keyed by column name.
# NOTE(review): the recorded output shows every column as float afterwards,
# presumably because the columns missing from the dict become NaN - confirm.
row_values = {'m0': 100, 'm1': 101, 'm2': 102}
tmpdf = df1.copy()
tmpdf.loc[4] = row_values
print(tmpdf, end=separator)

# Select the rows where column m1 is greater than 8; plain boolean indexing
# and .loc with the same mask are printed one after the other.
m1_above_8 = df1['m1'] > 8
print(df1[m1_above_8], end=separator)
print(df1.loc[m1_above_8], end=separator)

# For the rows where m1 is greater than 8, set column m2 to 99999.
tmpdf = df1.copy()
tmpdf.loc[m1_above_8, 'm2'] = 99999
tmpdf

m0     4
m1     5
m2     6
m3     7
m4     8
m5     9
m6    10
m7    11
m8    12
m9    13
Name: 4, dtype: int64
---
    m0   m1   m2   m3   m4   m5   m6   m7   m8   m9
0    0    1    2    3    4    5    6    7    8    9
1    1    2    3    4    5    6    7    8    9   10
2    2    3    4    5    6    7    8    9   10   11
3    3    4    5    6    7    8    9   10   11   12
4  111  111  111  111  111  111  111  111  111  111
5    5    6    7    8    9   10   11   12   13   14
6    6    7    8    9   10   11   12   13   14   15
7    7    8    9   10   11   12   13   14   15   16
8    8    9   10   11   12   13   14   15   16   17
9    9   10   11   12   13   14   15   16   17   18
---
      m0     m1     m2    m3    m4    m5    m6    m7    m8    m9
0    0.0    1.0    2.0   3.0   4.0   5.0   6.0   7.0   8.0   9.0
1    1.0    2.0    3.0   4.0   5.0   6.0   7.0   8.0   9.0  10.0
2    2.0    3.0    4.0   5.0   6.0   7.0   8.0   9.0  10.0  11.0
3    3.0    4.0    5.0   6.0   7.0   8.0   9.0  

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,0,1,2,3,4,5,6,7,8,9
1,1,2,3,4,5,6,7,8,9,10
2,2,3,4,5,6,7,8,9,10,11
3,3,4,5,6,7,8,9,10,11,12
4,4,5,6,7,8,9,10,11,12,13
5,5,6,7,8,9,10,11,12,13,14
6,6,7,8,9,10,11,12,13,14,15
7,7,8,9,10,11,12,13,14,15,16
8,8,9,99999,11,12,13,14,15,16,17
9,9,10,99999,12,13,14,15,16,17,18


## 删除

In [10]:
# Dropping missing values (reference snippets, intentionally left disabled):
# df.dropna()                # drop every row that contains a missing value
# df.dropna(axis='columns')  # keep only the columns that have no missing value
# df.dropna(how='all')       # drop a row/column only when all of its values are missing
# df.dropna(thresh=2)        # keep only the rows with at least 2 non-missing values
# df.dropna(inplace=True)    # modify the DataFrame in place instead of returning a new one

# Drop the row whose index label is 3; returns a new DataFrame.
tmpdf = df1.drop(index=3)

# Drop the column named m0; returns a new DataFrame.
tmpdf = tmpdf.drop(columns='m0')
tmpdf

Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,1,2,3,4,5,6,7,8,9
1,2,3,4,5,6,7,8,9,10
2,3,4,5,6,7,8,9,10,11
4,5,6,7,8,9,10,11,12,13
5,6,7,8,9,10,11,12,13,14
6,7,8,9,10,11,12,13,14,15
7,8,9,10,11,12,13,14,15,16
8,9,10,11,12,13,14,15,16,17
9,10,11,12,13,14,15,16,17,18


## 抽样
- n 数量
- replace 是否有放回抽样
- weights 样本权重，参数为字符串(列名)或数组
- random_state 随机数种子，设置为固定值(如1)后每次运行都会得到相同的抽样结果(可复现)
- axis 表示在哪个方向上抽取数据(axis=1 表示列/axis=0 表示行)

In [11]:
# Draw 3 random rows from df1; sample() returns a new DataFrame.
sample_size = 3
tmpdf3 = df1.sample(n=sample_size)
tmpdf3

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
1,1,2,3,4,5,6,7,8,9,10
2,2,3,4,5,6,7,8,9,10,11
5,5,6,7,8,9,10,11,12,13,14


## 合并

In [12]:
# Two independent random samples of 3 rows each.
tmpdf3, tmpdf4 = df1.sample(n=3), df1.sample(n=3)

# Stack the two samples on top of each other; concat() returns a new
# DataFrame and the rows keep their original index labels.
sampled_frames = [tmpdf3, tmpdf4]
tmpdf = pd.concat(sampled_frames)
tmpdf

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9
6,6,7,8,9,10,11,12,13,14,15
8,8,9,10,11,12,13,14,15,16,17
7,7,8,9,10,11,12,13,14,15,16
0,0,1,2,3,4,5,6,7,8,9
4,4,5,6,7,8,9,10,11,12,13
3,3,4,5,6,7,8,9,10,11,12


## 排序

In [13]:
import random as rd
# Column labels m0..m9.
colname = [f"m{i}" for i in range(10)]
# 10 x 10 table of random integers in 1..7, generated row by row.
lst = [[rd.randint(1, 7) for _ in range(10)] for _ in range(10)]

df3 = pd.DataFrame(lst, columns=colname)

# Sort by column m0, descending.
tmpdf = df3.sort_values(by='m0', ascending=False)
print(tmpdf, end='\n---\n')

# Sort by m0 ascending; rows with the same m0 are ordered by m1 descending.
tmpdf = df3.sort_values(['m0', 'm1'], ascending=[True, False])

# Renumber the index from 0 (returns a new DataFrame). By default the old
# index would be kept as an extra column named "index"; drop=True discards it.
tmpdf = tmpdf.reset_index(drop=True)
print(tmpdf, end='\n---\n')

   m0  m1  m2  m3  m4  m5  m6  m7  m8  m9
0   7   7   4   5   4   2   3   6   6   2
6   6   6   1   2   5   7   1   7   7   5
1   5   4   1   6   7   1   3   4   3   6
3   5   2   6   3   6   2   5   2   1   2
7   5   4   3   5   2   2   5   6   3   1
8   5   3   5   2   1   5   7   2   2   6
4   2   2   7   5   5   2   1   5   4   7
5   2   5   5   5   6   2   3   3   3   5
9   2   6   7   3   1   2   1   3   1   3
2   1   7   7   2   6   3   1   3   2   1
---
   m0  m1  m2  m3  m4  m5  m6  m7  m8  m9
0   1   7   7   2   6   3   1   3   2   1
1   2   6   7   3   1   2   1   3   1   3
2   2   5   5   5   6   2   3   3   3   5
3   2   2   7   5   5   2   1   5   4   7
4   5   4   1   6   7   1   3   4   3   6
5   5   4   3   5   2   2   5   6   3   1
6   5   3   5   2   1   5   7   2   2   6
7   5   2   6   3   6   2   5   2   1   2
8   6   6   1   2   5   7   1   7   7   5
9   7   7   4   5   4   2   3   6   6   2
---


## 聚合

In [14]:
tmpdf = df3.copy()
separator = '\n---\n'

# Group the rows by m0 and look only at column m1 inside each group.
m1_by_m0 = tmpdf.groupby('m0')['m1']

# Mean of m1 per m0 group; the result is a Series indexed by m0.
res = m1_by_m0.mean()
print(res.index, res, type(res), end=separator)
# reset_index() adds a fresh 0-based index and turns m0 into a regular column.
res = res.reset_index()
print(res, type(res), end=separator)

# Several aggregates of m1 per m0 group at once.
res = m1_by_m0.agg(['min', 'max', 'mean']).round(1).reset_index()
print(res, type(res), end=separator)

# Full descriptive statistics of m1 per m0 group in one call.
res = m1_by_m0.describe().round(2).reset_index()
print(res, type(res), end=separator)

Index([1, 2, 5, 6, 7], dtype='int64', name='m0') m0
1    7.000000
2    4.333333
5    3.250000
6    6.000000
7    7.000000
Name: m1, dtype: float64 <class 'pandas.core.series.Series'>
---
   m0        m1
0   1  7.000000
1   2  4.333333
2   5  3.250000
3   6  6.000000
4   7  7.000000 <class 'pandas.core.frame.DataFrame'>
---
   m0  min  max  mean
0   1    7    7   7.0
1   2    2    6   4.3
2   5    2    4   3.2
3   6    6    6   6.0
4   7    7    7   7.0 <class 'pandas.core.frame.DataFrame'>
---
   m0  count  mean   std  min   25%  50%  75%  max
0   1    1.0  7.00   NaN  7.0  7.00  7.0  7.0  7.0
1   2    3.0  4.33  2.08  2.0  3.50  5.0  5.5  6.0
2   5    4.0  3.25  0.96  2.0  2.75  3.5  4.0  4.0
3   6    1.0  6.00   NaN  6.0  6.00  6.0  6.0  6.0
4   7    1.0  7.00   NaN  7.0  7.00  7.0  7.0  7.0 <class 'pandas.core.frame.DataFrame'>
---
