## Pandas 数据结构 - Series

In [1]:
# 安装
pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple

In [2]:
import pandas as pd

### 使用列表数据创建：pd.Series([])

In [6]:
l = [1, 2, 3]

In [7]:
# 类似表格一列或一维数组，由索引和列组成
df = pd.Series(l)

In [8]:
df

0    1
1    2
2    3
dtype: int64

In [9]:
# 根据索引获取元素
df[:2]

0    1
1    2
dtype: int64

### 自定义索引：pd.Series(index=[])

In [10]:
l = ['Google', 'Baidu', 'Wiki']

In [12]:
df = pd.Series(l, index=['g', 'b', 'w'])

In [13]:
df

g    Google
b     Baidu
w      Wiki
dtype: object

In [14]:
df['b']

'Baidu'

### 使用字典数据创建：pd.Series({})

In [15]:
d = {1: 'Google', 2: 'Baidu', 3: 'Wiki'}

In [16]:
df = pd.Series(d)

In [17]:
df

1    Google
2     Baidu
3      Wiki
dtype: object

In [18]:
# 此时字典的键是索引，可以据此查询值
df[2]

'Baidu'

In [19]:
# 字典的键除了用来查询，还可以有选择性的创建 pd 数据
d

{1: 'Google', 2: 'Baidu', 3: 'Wiki'}

In [20]:
df = pd.Series(d, index=[2, 3])

In [21]:
df

2    Baidu
3     Wiki
dtype: object

### 设置 Seriese 名称参数：pd.Series(name='')

In [23]:
d

{1: 'Google', 2: 'Baidu', 3: 'Wiki'}

In [24]:
df = pd.Series(d, name='Pandas Test')

In [25]:
df

1    Google
2     Baidu
3      Wiki
Name: Pandas Test, dtype: object

## Pandas 数据结构 - DataFrame

In [26]:
# DataFrame 是一个表格型的数据结构
# 它含有一组有序的列，每列可以是不同的值类型（数值、字符串、布尔型值）
# DataFrame 既有行索引也有列索引
# 可以看做由 Series 组成的字典（共同用一个索引）

### 使用列表列表构造：pd.DataFrame([[], []], columns=[])

In [27]:
# 列表中包含的也是多个小列表的方式
l = [['Google', 10], ['Baidu', 14], ['Wiki', 21]]

In [29]:
df = pd.DataFrame(l, columns=['Site', 'Age'])

In [30]:
df

Unnamed: 0,Site,Age
0,Google,10
1,Baidu,14
2,Wiki,21


### 使用列表字典构造：pd.DataFrame({})

In [34]:
# 字典的值是列表形式
d = {'Site': ['Google', 'Baidu', 'Wiki'], 'Age': [13, 42, 56]}

In [32]:
df = pd.DataFrame(d)

In [33]:
df

Unnamed: 0,Site,Age
0,Google,13
1,Baidu,42
2,Wiki,56


### 使用字典列表构造：pd.DataFrame([{}, {}])

In [35]:
# 列表里是多个字典
l = [{'a': 1, 'b': 2}, {'a': 5, 'b': 3, 'c': 21}]

In [36]:
df = pd.DataFrame(l)

In [37]:
# 没有对应的部分数据则自动填充为 NaN
df

Unnamed: 0,a,b,c
0,1,2,
1,5,3,21.0


### 自定义索引：pd.DataFrame(index=[])

In [56]:
d = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

In [57]:
df1 = pd.DataFrame(d, index=['day1', 'day2', 'day3'])

In [58]:
df1

Unnamed: 0,calories,duration
day1,420,50
day2,380,40
day3,390,45


### 查询 pd 指定行数据：.loc

In [38]:
d = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

In [39]:
df = pd.DataFrame(d)

In [40]:
df

Unnamed: 0,calories,duration
0,420,50
1,380,40
2,390,45


In [41]:
series_obj = df.loc[0]

In [42]:
# 返回结果其实就是一个 Pandas Series 数据
series_obj

calories    420
duration     50
Name: 0, dtype: int64

In [43]:
series_obj.calories

420

In [44]:
series_obj.duration

50

In [47]:
s = df.loc[[0, 2]]

In [48]:
s

Unnamed: 0,calories,duration
0,420,50
2,390,45


In [59]:
df1

Unnamed: 0,calories,duration
day1,420,50
day2,380,40
day3,390,45


In [60]:
s = df1.loc['day2']

In [61]:
s

calories    380
duration     40
Name: day2, dtype: int64

## Pandas 操作 CSV 文件

### 读取文件内容：pd.read_csv()

In [62]:
df = pd.read_csv('nba.csv')

In [63]:
# 输出结果为前面 5 行和末尾 5 行，中间部分以 ... 代替
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


### 将读取内容转成 DataFrame 数据：df.to_string()

In [64]:
# 会展示所有的内容
df.to_string()

"                         Name                    Team  Number Position   Age Height  Weight                College      Salary\n0               Avery Bradley          Boston Celtics     0.0       PG  25.0    6-2   180.0                  Texas   7730337.0\n1                 Jae Crowder          Boston Celtics    99.0       SF  25.0    6-6   235.0              Marquette   6796117.0\n2                John Holland          Boston Celtics    30.0       SG  27.0    6-5   205.0      Boston University         NaN\n3                 R.J. Hunter          Boston Celtics    28.0       SG  22.0    6-5   185.0          Georgia State   1148640.0\n4               Jonas Jerebko          Boston Celtics     8.0       PF  29.0   6-10   231.0                    NaN   5000000.0\n5                Amir Johnson          Boston Celtics    90.0       PF  29.0    6-9   240.0                    NaN  12000000.0\n6               Jordan Mickey          Boston Celtics    55.0       PF  21.0    6-8   235.0            

### 保存文件内容：df.to_csv

In [65]:
name = ["Google", "Runoob", "Taobao", "Wiki"]
site = ["www.google.com", "www.runoob.com", "www.taobao.com", "www.wikipedia.org"]
age = [90, 40, 80, 98]
d = {'name': name, 'site': site, 'age': age}

In [66]:
df = pd.DataFrame(d)

In [67]:
df.to_csv('site.csv')

### 读取文件前 n 行数据：df.head(n)

In [68]:
df = pd.read_csv('nba.csv')

In [69]:
# 默认读取前 5 行数据
df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [70]:
# 指定读取前 n 行数据
df.head(6)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0


### 读取文件后 n 行数据：df.tail(n)

In [71]:
# 默认读取最后 5 行数据
df.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [72]:
# 指定读取最后 n 行数据

In [73]:
df.tail(2)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


### 获取表格基本信息：df.info()

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


## Pandas 操作 JSON 文件

### 读取 JSON 文件：pd.read_json()

In [75]:
df = pd.read_json('sites.json')

In [76]:
df

Unnamed: 0,id,name,url,likes
0,A001,菜鸟教程,www.runoob.com,61
1,A002,Google,www.google.com,124
2,A003,淘宝,www.taobao.com,45


In [77]:
# JSON 对象与 Python 字典具有相同的格式
# 所以可以直接将 Python 字典转化为 DataFrame 数据
d = {
    "col1":{"row1":1,"row2":2,"row3":3},
    "col2":{"row1":"x","row2":"y","row3":"z"}
}

In [78]:
df = pd.DataFrame(d)

In [79]:
df

Unnamed: 0,col1,col2
row1,1,x
row2,2,y
row3,3,z


### 读取 URL 数据：pd.read_json(url)

In [80]:
url = 'https://static.runoob.com/download/sites.json'

In [81]:
df = pd.read_json(url)

In [82]:
df

Unnamed: 0,id,name,url,likes
0,A001,菜鸟教程,www.runoob.com,61
1,A002,Google,www.google.com,124
2,A003,淘宝,www.taobao.com,45


### 读取复杂嵌套的 JSON 文件：pd.json_normalize()

In [83]:
df = pd.read_json('nested_list.json')

In [84]:
df

Unnamed: 0,school_name,class,students
0,ABC primary school,Year 1,"{'id': 'A001', 'name': 'Tom', 'math': 60, 'phy..."
1,ABC primary school,Year 1,"{'id': 'A002', 'name': 'James', 'math': 89, 'p..."
2,ABC primary school,Year 1,"{'id': 'A003', 'name': 'Jenny', 'math': 79, 'p..."


In [85]:
# 这里 students 这一列的数据比较复杂，可以提取出来
import json

In [86]:
with open('nested_list.json', 'r') as f:
    d = json.loads(f.read())

In [87]:
# 指定内嵌数据的字段名
df = pd.json_normalize(d, record_path=['students'])

In [88]:
df

Unnamed: 0,id,name,math,physics,chemistry
0,A001,Tom,60,66,61
1,A002,James,89,76,51
2,A003,Jenny,79,90,78


In [90]:
# 显示结果还没有包含 school_name 和 class 元素
# 如果需要展示出来可以使用 meta 参数来显示这些元数据
df = pd.json_normalize(
    d,
    record_path=['students'],
    meta=['school_name', 'class']
)

In [91]:
df

Unnamed: 0,id,name,math,physics,chemistry,school_name,class
0,A001,Tom,60,66,61,ABC primary school,Year 1
1,A002,James,89,76,51,ABC primary school,Year 1
2,A003,Jenny,79,90,78,ABC primary school,Year 1


In [92]:
# 读取更多内嵌数据示例
with open('nested_mix.json', 'r') as f:
    d = json.loads(f.read())

In [93]:
df = pd.json_normalize(
    d,
    record_path=['students'],
    meta=[
        'class',
        ['info', 'president'],
        ['info', 'contacts', 'tel']
    ]
)

In [94]:
df

Unnamed: 0,id,name,math,physics,chemistry,class,info.president,info.contacts.tel
0,A001,Tom,60,66,61,Year 1,John Kasich,123456789
1,A002,James,89,76,51,Year 1,John Kasich,123456789
2,A003,Jenny,79,90,78,Year 1,John Kasich,123456789


### 读取内嵌数据中的一组数据：glom

In [None]:
# 安装
pip install glom -i https://pypi.tuna.tsinghua.edu.cn/simple

In [95]:
from glom import glom

In [96]:
df = pd.read_json('nested_deep.json')

In [99]:
students_info = df['students']

In [100]:
students_info

0    {'id': 'A001', 'name': 'Tom', 'grade': {'math'...
1    {'id': 'A002', 'name': 'James', 'grade': {'mat...
2    {'id': 'A003', 'name': 'Jenny', 'grade': {'mat...
Name: students, dtype: object

In [97]:
d = df['students'].apply(lambda row: glom(row, 'grade.math'))

In [98]:
d

0    60
1    89
2    79
Name: students, dtype: int64

## Pandas 数据清洗

### 判断单元格元素是否为空值：.isnull())

In [101]:
df = pd.read_csv('property-data.csv')

In [102]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [103]:
test_column = df['NUM_BEDROOMS']

In [104]:
test_column

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object

In [105]:
test_column.isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool

### 指定空值类型：na_values

In [106]:
# 上面的 na 不是空值，假设要把它当作空值，可以指定空值类型
missing_values = ["n/a", "na", "--"]

In [107]:
df = pd.read_csv('property-data.csv', na_values=missing_values)

In [108]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,,LEXINGTON,N,,1,850.0
3,100004000.0,201.0,BERKELEY,12,1.0,,700.0
4,,203.0,BERKELEY,Y,3.0,2,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1,800.0
6,100007000.0,,WASHINGTON,,2.0,HURLEY,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1,
8,100009000.0,215.0,TREMONT,Y,,2,1800.0


In [109]:
test_column = df['NUM_BEDROOMS']

In [110]:
test_column

0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64

In [111]:
test_column.isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
Name: NUM_BEDROOMS, dtype: bool

### 剔除包含空值的行：df.dropna()

In [112]:
df = pd.read_csv('property-data.csv')

In [116]:
new_df = df.dropna()

In [117]:
new_df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
8,100009000.0,215.0,TREMONT,Y,na,2.0,1800


In [118]:
# 默认情况下，dropna() 方法返回一个新的 DataFrame，不会修改源数据
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 修改获取的源数据：df.dropna(inplace=True)

In [119]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [120]:
new_df = df.dropna(inplace=True)

In [121]:
# 加了这个参数后，不会返回一个新的 DataFrame
new_df

In [122]:
# 但是会修改 df 的内容
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
8,100009000.0,215.0,TREMONT,Y,na,2.0,1800


### 剔除指定列包括空值的行：df.dropna(subset=['列名']）

In [123]:
df = pd.read_csv('property-data.csv')

In [124]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [125]:
# PID 这一列中只有一行是 NaN，把它剔除
df.dropna(subset=['PID'], inplace=True)

In [126]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 替换空值：df.fillna('要替换成的内容')

In [133]:
df = pd.read_csv('property-data.csv')

In [134]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [135]:
# 第一种方式：将所有空值替换成指定的内容
df.fillna('test', inplace=True)

In [136]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,test,LEXINGTON,N,test,1,850
3,100004000.0,201.0,BERKELEY,12,1,test,700
4,test,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,test,1,800
6,100007000.0,test,WASHINGTON,test,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,test
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [137]:
df = pd.read_csv('property-data.csv')

In [138]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [139]:
# 第二种方式：替换某一列的空值
df['PID'].fillna('test', inplace=True)

In [140]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,test,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [196]:
df = pd.read_csv('property-data.csv')

In [197]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [198]:
# 第三种方式：替换指定某些列的空值
df.fillna({'PID': 'test', 'ST_NUM': 'test'}, inplace=True)

In [199]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,test,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,test,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,test,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 计算某一列的平均值：df['列名'].mean()

In [141]:
df = pd.read_csv('property-data.csv')

In [142]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [143]:
x = df['ST_NUM'].mean()

In [144]:
x

191.42857142857142

In [145]:
# 把该列的空值全部用上面计算出的平均值替换
df['ST_NUM'].fillna(x, inplace=True)

In [146]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,191.428571,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,191.428571,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 计算某一列的中位数：df['列名'].median()

In [153]:
df = pd.read_csv('property-data.csv')

In [154]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [155]:
# 中位数是按顺序排列的一组数据中居于中间位置的数
# 计算有限个数的数据的中位数的方法是：把所有的同类数据按照大小的顺序排列
# 如果数据的个数是奇数，则中间那个数据就是这群数据的中位数
# 如果数据的个数是偶数，则中间那2个数据的算术平均值就是这群数据的中位数
x = df['ST_NUM'].median()

In [156]:
x

203.0

In [157]:
# 把该列的空值全部用中位数替换
df['ST_NUM'].fillna(x, inplace=True)

In [158]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,203.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,203.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 计算某一列的众数：df['列名'].mode()

In [163]:
df = pd.read_csv('property-data.csv')

In [164]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [165]:
# 众数是一组数据中出现次数最多的数值，有时众数在一组数中有好几个
x = df['ST_NUM'].mode()

In [166]:
x

0    104.0
1    197.0
2    201.0
3    203.0
4    207.0
5    213.0
6    215.0
dtype: float64

In [167]:
df['ST_NUM'].fillna(x, inplace=True)

In [168]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,201.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,215.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### 格式化日期：pd.to_datetime()

In [184]:
d = {
  "Date": ['2020/12/01', '2020/12/02' , '20201226'],
  "duration": [50, 40, 45]
}

In [185]:
df = pd.DataFrame(d, index=['day1', 'day2', 'day3'])

In [186]:
df

Unnamed: 0,Date,duration
day1,2020/12/01,50
day2,2020/12/02,40
day3,20201226,45


In [187]:
df['Date'] = pd.to_datetime(df['Date'])

In [188]:
df

Unnamed: 0,Date,duration
day1,2020-12-01,50
day2,2020-12-02,40
day3,2020-12-26,45


### 替换符合条件的数据：df.loc[n, '列名'] = m

In [169]:
person = {
  "name": ['Google', 'Runoob' , 'Taobao'],
  "age": [50, 40, 12345]    # 12345 年龄数据是错误的
}

In [170]:
df = pd.DataFrame(person)

In [171]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
2,Taobao,12345


In [172]:
df.loc[2, 'age'] = 30

In [173]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
2,Taobao,30


In [174]:
# 设置条件语句，符合就替换
person = {
  "name": ['Google', 'Runoob' , 'Taobao'],
  "age": [50, 200, 12345]
}

In [175]:
df = pd.DataFrame(person)

In [176]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,200
2,Taobao,12345


In [177]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [178]:
for x in df.index:
  if df.loc[x, "age"] > 120:
    df.loc[x, "age"] = 120

In [179]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,120
2,Taobao,120


### 剔除符合条件的数据：df.drop()

In [180]:
person = {
  "name": ['Google', 'Runoob' , 'Taobao'],
  "age": [50, 40, 12345]
}

In [181]:
df = pd.DataFrame(person)

In [182]:
for x in df.index:
  if df.loc[x, "age"] > 120:
    df.drop(x, inplace = True)

In [183]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40


### 判断重复数据：df.duplicated()

In [189]:
person = {
  "name": ['Google', 'Runoob', 'Runoob', 'Taobao'],
  "age": [50, 40, 40, 23]
}

In [190]:
df = pd.DataFrame(person)

In [191]:
df.duplicated()

0    False
1    False
2     True
3    False
dtype: bool

### 剔除重复的数据：df.drop_duplicates()

In [192]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
2,Runoob,40
3,Taobao,23


In [193]:
df.drop_duplicates(inplace = True)

In [194]:
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
3,Taobao,23


### 重命名列名：df.rename(columns={'旧列名':'新列名'})

In [200]:
df = pd.read_json('data.json')

In [201]:
df

Unnamed: 0,name,age,gender,score
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [202]:
df = df.rename(columns={'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '成绩'})

In [203]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


### 按某一列排序：df.sort_values(by='列名', ascending=False)

In [204]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [207]:
new_df = df.sort_values(by='成绩', ascending=False)

In [208]:
new_df

Unnamed: 0,姓名,年龄,性别,成绩
1,Bob,,male,90.0
0,Alice,25.0,female,80.0
3,David,35.0,male,70.0
2,Charlie,30.0,male,


### 以某一列分组计算其他列的平均值：df.groupby('列名').agg({'列名': 'mean'})

In [209]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [210]:
grouped = df.groupby('性别').agg({'年龄': 'mean', '成绩': 'mean'})

In [211]:
grouped

Unnamed: 0_level_0,年龄,成绩
性别,Unnamed: 1_level_1,Unnamed: 2_level_1
female,25.0,80.0
male,32.5,80.0


### 获取符合条件的并只保留某n列：df.loc['条件', ['列名', ...]]

In [212]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [213]:
new_df = df.loc[df['成绩'] >= 90, ['姓名', '成绩']]

In [214]:
new_df

Unnamed: 0,姓名,成绩
1,Bob,90.0


### 获取每列的基本统计信息：df.describe()

In [215]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [216]:
stats = df.describe()

In [217]:
stats

Unnamed: 0,年龄,成绩
count,3.0,3.0
mean,30.0,80.0
std,5.0,10.0
min,25.0,70.0
25%,27.5,75.0
50%,30.0,80.0
75%,32.5,85.0
max,35.0,90.0


### 计算某列的平均值：df.mean()

In [218]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [220]:
mean = df['年龄'].mean()

In [221]:
mean

30.0

### 计算某列的中位数：df.median()

In [222]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [223]:
mode = df['年龄'].mode()

In [224]:
mode

0    25.0
1    30.0
2    35.0
dtype: float64

### 计算每列非空值的数量：df.count()

In [225]:
df

Unnamed: 0,姓名,年龄,性别,成绩
0,Alice,25.0,female,80.0
1,Bob,,male,90.0
2,Charlie,30.0,male,
3,David,35.0,male,70.0


In [226]:
count = df.count()

In [227]:
count

姓名    4
年龄    3
性别    4
成绩    3
dtype: int64